In [1]:
import os
project_path = "D:/workspace/Bus Project"
os.chdir(project_path)

In [54]:
from multiprocessing import Pool
import multiprocessing
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

import bus.analyzer as anz

In [3]:
def parallelize_dataframe(func, df, core = multiprocessing.cpu_count()-4):
    df_split = np.array_split(df, core)
    pool = Pool(core)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [29]:
# 데이터 기간 설정
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

# 로딩할 파일 명 리스트 생성
input_path_list = anz.make_input_path(start_date, end_date)

station_usage_df = parallelize_dataframe(anz.load_total_usage_data, input_path_list, core = 10)

user_df = anz.load_user_df()
station_df = anz.load_station_df()
cluster_df = anz.load_cluster_df()
cluster_station_df = anz.load_cluster_station_df()

In [31]:
def create_cluster_usage_df(station_usage_df):
    usage_selector = ['user_id', 'geton_datetime', 'geton_station_id', 'getoff_datetime', 'getoff_station_id']
    station_usage_df = station_usage_df[usage_selector]

    cluster_selector = cluster_station_df.columns

    geton_columns = {}
    getoff_columns = {}
    for column in cluster_selector:
        geton_columns[column] = "geton_"+column
        getoff_columns[column] = "getoff_"+column
        
    geton_cluster_df = cluster_station_df.rename(columns=geton_columns)
    getoff_cluster_df = cluster_station_df.rename(columns=getoff_columns)
    
    cluster_usage_df = pd.merge(station_usage_df, geton_cluster_df, on=['geton_station_id'], how="left")
    cluster_usage_df = pd.merge(cluster_usage_df, getoff_cluster_df, on=['getoff_station_id'], how="left")
    return cluster_usage_df

cluster_usage_df = create_cluster_usage_df(station_usage_df)

In [59]:
def extract_usage(cluster_usage_df, user_df, tourist=True):
    tourist_cluster_usage_selector = cluster_usage_df.columns
    merged_df = pd.merge(cluster_usage_df, user_df, on="user_id", how="left")
    tourist_cluster_usage_df = merged_df[merged_df["tourist"] == tourist]
    tourist_cluster_usage_df = tourist_cluster_usage_df[tourist_cluster_usage_selector]
    return tourist_cluster_usage_df
    
cluster_usage_df = extract_usage(cluster_usage_df, user_df, tourist=True)

In [61]:
def extract_usage_grouped_user(cluster_usage_df):
    user_list = cluster_usage_df["user_id"].drop_duplicates()
    usage_list = []
    
    for user_id in tqdm(user_list):
        usage_list.append(cluster_usage_df[cluster_usage_df["user_id"] == user_id])
        
    return usage_list

usage_list = extract_usage_grouped_user(cluster_usage_df)

100%|████████████████████████████████████████████████████████████████████████████| 29711/29711 [09:29<00:00, 52.13it/s]


In [63]:
usage_list[0]

Unnamed: 0,user_id,geton_datetime,geton_station_id,getoff_datetime,getoff_station_id,geton_cluster_id,getoff_cluster_id
381,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 15:56:12,1202,2019-06-01 16:03:03,748.0,52,179.0
382,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 12:24:58,3351,2019-06-01 13:29:46,863.0,0,28.0
383,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 15:31:13,1214,2019-06-01 15:40:00,1200.0,123,49.0
384,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 17:33:56,748,2019-06-01 18:10:51,842.0,179,473.0
366410,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 14:10:08,1059,2019-06-03 14:56:24,3271.0,248,0.0
366411,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 13:14:41,2799,2019-06-03 13:34:10,1051.0,888,248.0
366412,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 15:40:25,164,2019-06-03 15:58:04,1355.0,749,0.0
366413,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 12:37:30,2704,2019-06-03 12:44:50,2799.0,914,888.0
366414,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 11:54:11,2349,2019-06-03 11:56:47,2704.0,879,914.0
