In [68]:
import pandas as pd
import pickle
import warnings
warnings.filterwarnings(action='ignore') 


def save_pkl(data, path):
    """Pickle `data` into the file at `path`.

    The file is opened in binary write mode, so any existing file at `path`
    is overwritten. Returns None.
    """
    with open(path, 'wb') as out_file:
        pickle.dump(data, out_file)
        
def load_pkl(path):
    """Return the object unpickled from the file at `path`.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at pickle files from a trusted source.
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

## Load Data

In [4]:
# Rental records unpickled from a local file (presumably one row per bike trip;
# verify against the data source). Only unpickle files you trust.
london_raw = load_pkl('../data/london.pkl')
# Station table read from CSV; its `station_name` column is used below.
station_info = pd.read_csv('../data/london_stations.csv')

## 공통 전처리

In [6]:
# Keep only the rows of the raw frame that have no missing value in any column.
london = london_raw.dropna(axis='index', how='any')

In [7]:
# Row counts before and after dropping rows with missing values.
print("raw   data :", london_raw.shape[0])
print("na rm data :", london.shape[0])

raw   data : 38215560
na rm data : 38147278


In [8]:
# Station names that appear as a start station, as an end station, and in the
# station table.
station_list = set(london['start_station_name'].unique()).intersection(
    station_info['station_name'].unique(),
    london['end_station_name'].unique(),
)

In [9]:
# Number of rows in the station table vs. number of stations kept in station_list.
print(len(station_info['station_name']), len(station_list))

802 758


In [10]:
# Station-table names that are absent from station_list ("tbr" presumably means
# "to be removed" - confirm). Membership is tested against the set directly,
# instead of rebuilding a list copy of it and scanning that list for every name.
station_tbr = [name for name in station_info['station_name'] if name not in station_list]

## Case에 맞게 전처리

### Case 2: 대여 요일 별 분석 - 평일/주말

In [28]:
import datetime

In [82]:
# Day-of-week index of each rental's start time (Monday=0 ... Sunday=6).
# DatetimeIndex.weekday computes this in one vectorized pass, replacing a
# Python-level loop that called .weekday() once per row.
week_values = pd.to_datetime(london.start_rental_date_time.values, format = "%Y-%m-%d %H:%M:%S")
# .tolist() keeps week_values a plain list of Python ints, as before.
week_values = week_values.weekday.tolist()
london.loc[:, 'weekday'] = week_values

In [84]:
# weekday codes 0-4 are Monday-Friday; 5 and 6 are Saturday and Sunday.
london_weekday = london[london['weekday'] < 5]
london_weekend = london[london['weekday'] >= 5]

In [88]:
# The helper column was only needed for the split above; remove it from both subsets.
london_weekday = london_weekday.drop(columns='weekday')
london_weekend = london_weekend.drop(columns='weekday')

In [94]:
# COVID-19 전후 data 분할

In [101]:
def covid_divide(data):
    """Return the full data plus a 2019 window and a 2020 window of it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `start_rental_date_time` and `end_station_id` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        `(data_entire, data_before, data_after)`:
        all rows; rows with 2019-01-01 <= start < 2019-09-01; and rows with
        2020-01-01 <= start < 2020-09-01. In each frame `end_station_id` is
        cast to float.

    Notes
    -----
    The input frame is not modified. Every returned frame is a new object
    built with `DataFrame.assign`, so the cast neither leaks back into the
    caller's data nor writes into a `query` result (chained assignment).
    """
    def _with_float_end_id(frame):
        # assign() returns a new frame and keeps the column in its original position.
        return frame.assign(end_station_id=frame['end_station_id'].astype('float'))

    # entire / before / after
    data_entire = _with_float_end_id(data)
    data_before = _with_float_end_id(
        data.query("start_rental_date_time >= '2019-01-01' and start_rental_date_time < '2019-09-01'")
    )
    data_after = _with_float_end_id(
        data.query("start_rental_date_time >= '2020-01-01' and start_rental_date_time < '2020-09-01'")
    )

    return data_entire, data_before, data_after

In [102]:
# Split the weekday rentals into: all rows, the 2019 window, and the 2020 window.
weekday_entire, weekday_before, weekday_after = covid_divide(london_weekday)

## Co-occurrence matrix 만드는 코드

In [105]:
def make_co_matrix(data):
    """Count rentals for each (start station, end station) pair.

    Returns the `pd.crosstab` frequency table with `start_station_id` values
    as rows and `end_station_id` values as columns. Only ids that occur in
    `data` appear, so the table is not necessarily square.
    """
    return pd.crosstab(index=data['start_station_id'], columns=data['end_station_id'])

In [108]:
# Co-occurrence tables for the entire / before / after weekday subsets, in that order.
co_matrix_en, co_matrix_bf, co_matrix_af = map(
    make_co_matrix,
    (weekday_entire, weekday_before, weekday_after),
)

In [109]:
import os

# Output directory for this case's preprocessed files.
case = 2
case_path = f'../data/preprocessed/case-{case}'
if os.path.isdir(case_path):
    print('already exits')
else:
    # makedirs also creates missing parent directories. Real failures
    # (permissions, a regular file in the way) now raise instead of being
    # reported as "already exits" by a bare except.
    os.makedirs(case_path)

In [110]:
# Write the three co-occurrence tables to the case directory.
# The frames are named co_matrix_bf / co_matrix_af where they are created;
# the previous co_matrix_bc / co_matrix_ac names were undefined (NameError).
co_matrix_en.to_csv(case_path+"/co_matrix_en.csv")
co_matrix_bf.to_csv(case_path+"/co_matrix_bf.csv")
co_matrix_af.to_csv(case_path+"/co_matrix_af.csv")