In [1]:
import collections

In [2]:
import os
# Do! 프로젝트 root 경로로 설정
# project_path = "D:/workspace/Bus Project"
project_path = "/Users/jade/git/capstone/jeju-bus-tag_data-analysis"
os.chdir(project_path)

In [3]:
import os
import datetime
import requests
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN
import xml.etree.ElementTree as elemTree
from multiprocessing import Pool
# import folium
from keplergl import KeplerGl

import bus.analyzer as anz

In [4]:
station_df = anz.load_station_df()
user_df = anz.load_user_df()
cluster_df = anz.load_cluster_df()
station_cluster_df = anz.load_cluster_station_df()

In [5]:
# station_df
# user_df
# cluster_df
# station_cluster_df

In [6]:
def parallelize_dataframe(func, df, core = multiprocessing.cpu_count()-4):
    df_split = np.array_split(df, core)
    pool = Pool(core)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [7]:
def load_total_usage_data(input_path_list):
    usage_df = pd.read_csv(input_path_list[0], low_memory=False, encoding = "cp949") #, dtype=dtype)
    for file_path in tqdm(input_path_list[1:]):
        temp_df = pd.read_csv(file_path, low_memory=False, encoding = "cp949") #, dtype=dtype)
        usage_df = pd.concat([usage_df, temp_df], sort=False, ignore_index=True)
        
    usage_df = usage_df[usage_df["geton_station_longitude"].notnull()]
    usage_df = usage_df[usage_df["geton_station_latitude"].notnull()]
    
    # datetime64로 형 변환 # M[base_date] = pd.to_datetime(M[base_date], format='%Y%m%d')
    usage_df['geton_datetime'] = pd.to_datetime(usage_df['geton_datetime'], format='%Y%m%d%H%M%S')
    usage_df['getoff_datetime'] = pd.to_datetime(usage_df['getoff_datetime'], format='%Y%m%d%H%M%S')
    
    return usage_df

In [8]:
# 데이터 기간 설정
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

# 로딩할 파일 명 리스트 생성
input_path_list = anz.make_input_path(start_date, end_date)
station_usage_df = parallelize_dataframe(anz.load_total_usage_df, input_path_list, core = 10)

user_df = anz.load_user_df()
station_df = anz.load_station_df()
cluster_df = anz.load_cluster_df()
cluster_station_df = anz.load_cluster_station_df()

In [9]:
def create_cluster_usage_df(station_usage_df):
    usage_selector = ['user_id', 'geton_datetime', 'geton_station_id', 'getoff_datetime', 'getoff_station_id']
    station_usage_df = station_usage_df[usage_selector]

    cluster_selector = cluster_station_df.columns

    geton_columns = {}
    getoff_columns = {}
    for column in cluster_selector:
        geton_columns[column] = "geton_"+column
        getoff_columns[column] = "getoff_"+column
        
    geton_cluster_df = cluster_station_df.rename(columns=geton_columns)
    getoff_cluster_df = cluster_station_df.rename(columns=getoff_columns)
    
    cluster_usage_df = pd.merge(station_usage_df, geton_cluster_df, on=['geton_station_id'], how="left")
    cluster_usage_df = pd.merge(cluster_usage_df, getoff_cluster_df, on=['getoff_station_id'], how="left")
    return cluster_usage_df

cluster_usage_df = create_cluster_usage_df(station_usage_df)

In [10]:
def extract_usage(cluster_usage_df, user_df, tourist=True):
    tourist_cluster_usage_selector = cluster_usage_df.columns
    merged_df = pd.merge(cluster_usage_df, user_df, on="user_id", how="left")
    tourist_cluster_usage_df = merged_df[merged_df["tourist"] == tourist]
    tourist_cluster_usage_df = tourist_cluster_usage_df[tourist_cluster_usage_selector]
    return tourist_cluster_usage_df
    
cluster_usage_df = extract_usage(cluster_usage_df, user_df, tourist=True)

In [11]:
def extract_usage_grouped_user(cluster_usage_df):
    user_list = cluster_usage_df["user_id"].drop_duplicates()
    usage_list = []
    
    for user_id in tqdm(user_list):
        usage_list.append(cluster_usage_df[cluster_usage_df["user_id"] == user_id])
        
    return usage_list

usage_list = extract_usage_grouped_user(cluster_usage_df)

100%|██████████| 29711/29711 [11:01<00:00, 44.89it/s]


In [12]:
usage_list[0]

Unnamed: 0,user_id,geton_datetime,geton_station_id,getoff_datetime,getoff_station_id,geton_cluster_id,getoff_cluster_id
381,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 15:56:12,1202,2019-06-01 16:03:03,748.0,52,179.0
382,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 12:24:58,3351,2019-06-01 13:29:46,863.0,0,28.0
383,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 15:31:13,1214,2019-06-01 15:40:00,1200.0,123,49.0
384,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-01 17:33:56,748,2019-06-01 18:10:51,842.0,179,473.0
366410,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 14:10:08,1059,2019-06-03 14:56:24,3271.0,248,0.0
366411,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 13:14:41,2799,2019-06-03 13:34:10,1051.0,888,248.0
366412,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 15:40:25,164,2019-06-03 15:58:04,1355.0,749,0.0
366413,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 12:37:30,2704,2019-06-03 12:44:50,2799.0,914,888.0
366414,60126bd03aab52274cd4e09f6432ab960d49545a699ef3...,2019-06-03 11:54:11,2349,2019-06-03 11:56:47,2704.0,879,914.0


In [13]:
class Node:
    def __init__(self, colony):
        self.colony = colony
        self.end_of_pattern = False
        self.count = 0
        # dictionary of next visited colonies
        self.next = {}

In [14]:
class Trie(object):
    def __init__(self):
        self.root = Node(-1)
    
    def insert(self, pattern):
        node = self.root
        
        for colony in pattern:
            #if colony exists in node's child, move to it
            if colony in node.next:
                node.count += 1
                node = node.next[colony]
            else:
                # if colony is not found in child, create new node as child. 
                newNode = Node(colony)
                node.next[colony] = newNode
                node = newNode
                node.count += 1
                
        node.end_of_pattern = True
    
    def dfs(self, node, pattern):
        if node.end_of_pattern:
            self.output.append(pattern)
            return 
        
        for next_node in node.next.values():
            pattern.append(next_node.colony)
            self.dfs(next_node, pattern)
            pattern = []
    
    def query(self, cluster_x):
        self.output = []
        node = self.root
        
        if cluster_x in node.next:
            pattern = []
            self.dfs(node.next[cluster_x], pattern)
            
        return self.output
    
    def dfs_n(self, node, pattern, n, i):
        if not i:
            self.output.append(pattern)
            i = n
            return
        
        for next_node in node.next.values():
            pattern.append(next_node.colony)
            i -= 1
            self.dfs(next_node, pattern, n, i)
            pattern = []
    
    def query_upto_n_clusters(self, cluster_x, n):
        self.output = []
        node = self.root
            
        if cluster_x in node.next:
            pattern = []
            self.dfs_n(node.next[cluster_x], pattern, n, n)
            
        return self.output    

    def dfs_p(self, node, pattern):
        if node.end_of_pattern:
            self.output.append(pattern)
            return
        
        max_col1 = max(node.next.items(), key=lambda x:x[1].count)[0]
        max_cnt_list = [k for k, v in node.next.items() if node.next[max_col1].count == v.count]
        
        for max_node in max_cnt_list:
            pattern.append(node.next[max_node].colony)
            self.dfs_p(node.next[max_node], pattern)
            pattern = []
    
    def query_by_popularity(self, cluster_x):
        self.output = []
        node = self.root
        
        if cluster_x in node.next:
            pattern = []
            self.dfs_p(node.next[cluster_x], pattern)
        
        return self.output

In [53]:
def check_pattern_validity(prev_getoff_time, cur_geton_time, prev_getoff_station_id, cur_geton_station_id):
#     delta = cur_geton_time - prev_getoff_time
#     if(delta.days == 0 and delta.seconds <= 8*60*60):
#         return True
#     return False
    return True
    
def get_patterns(user, pattern_candidates):
    patterns = [[]]
    p_idx = 0
    
    for pattern in pattern_candidates:
        for idx in pattern:
            patterns[p_idx].append(user['geton_cluster_id'][idx])
            patterns[p_idx].append(user['getoff_cluster_id'][idx])
        patterns.append([])
        p_idx += 1
    
    return patterns 
        
            
def get_all_pattern_candidates(user):
    pattern_candidates = [[]]
    p_idx = 0 # pattern candidates idx in patterns
    prev_getoff_time = user.iloc[0]['getoff_datetime']
    prev_getoff_station_id = user.iloc[0]['getoff_station_id']
    
    for data in user.index:
        cur_geton_time = user['geton_datetime'][data]
        cur_geton_station_id = user['geton_station_id'][data]
        
        if check_pattern_validity(prev_getoff_time, cur_geton_time, prev_getoff_station_id, cur_geton_station_id):
            pattern_candidates[p_idx].append(data)
        else:
            p_idx += 1
            pattern_candidates.append([data])
            
        prev_getoff_time = user['getoff_datetime'][data] 
    
    return pattern_candidates

In [54]:
pattern_trie = Trie()

for user in usage_list:
    # patterns by index
    pattern_candidates = get_all_pattern_candidates(user)
    
    # patterns by cluster_id
    patterns = get_patterns(user, pattern_candidates)
    
    for pattern in patterns:
        pattern_trie.insert(pattern)

In [55]:



# test

In [56]:
cluster_df.drop_duplicates("cluster_id", keep="first")
idx = 0

pop = []
pop.append(pattern_trie.query_by_popularity(1))
pop.append(pattern_trie.query_by_popularity(100))
pop.append(pattern_trie.query_by_popularity(888))

map_group = []
for i in range(0, 3):
    map_group.append(KeplerGl())

m_idx = 0
for cl_pattern in pop:
    idx = 0
    for pattern in cl_pattern:
        df = pd.DataFrame(pd.Series(pattern))
        df.rename(columns={0:"cluster_id"}, inplace = True)
        df = pd.merge(df, cluster_df, on='cluster_id')
        link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
        link_cluster.reset_index(drop=True, inplace=True)
        df["next_cluster_latitude"] = link_cluster['cluster_latitude']
        df["next_cluster_longitude"] = link_cluster['cluster_longitude']
        map_group[m_idx].add_data(data=df, name='data'+str(idx))
        idx+=1
    m_idx+=1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [57]:
map_group[0]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 'columns': ['clu…

In [58]:
map_group[1]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], …

In [59]:
map_group[2]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2…

In [60]:


# test - 경로 사이즈 입력

In [61]:
size = 3

for i in range(0, 3):
    map_group.append(KeplerGl())

m_idx = 3
for cl_pattern in pop:
    idx = 0
    for pattern in cl_pattern:
        df = pd.DataFrame(pd.Series(pattern))
        df.rename(columns={0:"cluster_id"}, inplace = True)
        df = pd.merge(df, cluster_df, on='cluster_id').drop_duplicates('cluster_id').head(size)
        link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
        link_cluster.reset_index(drop=True, inplace=True)
        df["next_cluster_latitude"] = link_cluster['cluster_latitude']
        df["next_cluster_longitude"] = link_cluster['cluster_longitude']
        map_group[m_idx].add_data(data=df, name='data'+str(idx))
        idx+=1
    m_idx+=1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [62]:
map_group[3]

KeplerGl(data={'data0': {'index': [0, 3, 8], 'columns': ['cluster_id', 'cluster_group', 'cluster_target', 'clu…

In [63]:
map_group[4]

KeplerGl(data={'data0': {'index': [0, 1, 4], 'columns': ['cluster_id', 'cluster_group', 'cluster_target', 'clu…

In [64]:
map_group[5]

KeplerGl(data={'data0': {'index': [0, 1, 2], 'columns': ['cluster_id', 'cluster_group', 'cluster_target', 'clu…

In [65]:
for user in usage_list:
    # patterns by index
    pattern_candidates = get_all_pattern_candidates(user)
    
    # patterns by cluster_id
    patterns = get_patterns(user, pattern_candidates)
    
    for pattern in patterns:
        pattern_trie.insert(pattern)
        
size = 3

for i in range(0, 3):
    map_group.append(KeplerGl())

m_idx = 3
for cl_pattern in pop:
    idx = 0
    for pattern in cl_pattern:
        df = pd.DataFrame(pd.Series(pattern))
        df.rename(columns={0:"cluster_id"}, inplace = True)
        df = pd.merge(df, cluster_df, on='cluster_id').drop_duplicates('cluster_id').head(size)
        link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
        link_cluster.reset_index(drop=True, inplace=True)
        df["next_cluster_latitude"] = link_cluster['cluster_latitude']
        df["next_cluster_longitude"] = link_cluster['cluster_longitude']
        map_group[m_idx].add_data(data=df, name='data'+str(idx))
        idx+=1
    m_idx+=1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [66]:
test_user1 = usage_list[24]
test_user2 = usage_list[0]
test_user3 = usage_list[207]

patterns_1 = get_patterns(test_user1, get_all_pattern_candidates(test_user1))
patterns_2 = get_patterns(test_user2, get_all_pattern_candidates(test_user2))
patterns_3 = get_patterns(test_user3, get_all_pattern_candidates(test_user3))

user_pattern = []
user_pattern.append(patterns_1)
user_pattern.append(patterns_2)
user_pattern.append(patterns_3)

In [68]:
for i in range(0, 3):
    map_group.append(KeplerGl())

m_idx = 6

for patterns in user_pattern:
    idx = 0
    for pattern_g in patterns:
        df = pd.DataFrame(pd.Series(pattern_g))
        df.rename(columns={0:"cluster_id"}, inplace = True)
        df = pd.merge(df, cluster_df, on='cluster_id')
        if(len(df)>0):
            link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
            link_cluster.reset_index(drop=True, inplace=True)
            df["next_cluster_latitude"] = link_cluster['cluster_latitude']
            df["next_cluster_longitude"] = link_cluster['cluster_longitude']
            map_group[m_idx].add_data(data=df, name='data'+str(idx))
            idx+=1
    m_idx+=1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


  df = pd.DataFrame(pd.Series(pattern_g))


In [69]:
map_group[6]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'columns': ['cluster_id', 'cluster_group', '…

In [70]:
map_group[7]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 'columns': […

In [71]:
map_group[8]

KeplerGl(data={'data0': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2…

In [39]:
# Load an empty map
map_1 = KeplerGl()

for i in range(0, idx):
    with open("data/visualize/data"+str(i)+".csv", 'r', encoding='cp949') as f:
        csvData = f.read()
    map_1.add_data(data=csvData, name='data'+str(i))
map_1

# cluster_id가 1인 정류장 군집에서 시작하는 이동패턴
# 색깔별로 해당 군집에서 시작된 모든 이동패턴을 나타낸다. 

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data0': 'cluster_id,cluster_group,cluster_target,cluster_longitude,cluster_latitude,tour_geton…

In [25]:
idx2 = idx
pop = pattern_trie.query_by_popularity(100)
for pattern in pop:
    df = pd.DataFrame(pd.Series(pattern))
    df.rename(columns={0:"cluster_id"}, inplace = True)
    df = pd.merge(df, cluster_df, on='cluster_id')
    link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
    link_cluster.reset_index(drop=True, inplace=True)
    df["next_cluster_latitude"] = link_cluster['cluster_latitude']
    df["next_cluster_longitude"] = link_cluster['cluster_longitude']
    df.to_csv("data/visualize/data"+str(idx)+".csv", encoding="CP949", index=False)
    idx+=1

In [27]:
map_2 = KeplerGl()

for i in range(idx2, idx):
    with open("data/visualize/data"+str(i)+".csv", 'r', encoding='cp949') as f:
        csvData = f.read()
    map_2.add_data(data=csvData, name='data'+str(i))
map_2

# cluster_id가 100인 정류장 군집에서 시작하는 이동패턴
# 색깔별로 해당 군집에서 시작된 모든 이동패턴을 나타낸다. 

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data6': 'cluster_id,cluster_group,cluster_target,cluster_longitude,cluster_latitude,tour_geton…

In [40]:
idx2 = idx
pop = pattern_trie.query_by_popularity(888)
for pattern in pop:
    df = pd.DataFrame(pd.Series(pattern))
    df.rename(columns={0:"cluster_id"}, inplace = True)
    df = pd.merge(df, cluster_df, on='cluster_id')
    link_cluster = df[['cluster_longitude', 'cluster_latitude']].drop(0)
    link_cluster.reset_index(drop=True, inplace=True)
    df["next_cluster_latitude"] = link_cluster['cluster_latitude']
    df["next_cluster_longitude"] = link_cluster['cluster_longitude']
    df.to_csv("data/visualize/data"+str(idx)+".csv", encoding="CP949", index=False)
    idx+=1

map_3 = KeplerGl()

for i in range(idx2, idx):
    with open("data/visualize/data"+str(i)+".csv", 'r', encoding='cp949') as f:
        csvData = f.read()
    map_3.add_data(data=csvData, name='data'+str(i))
map_3

# cluster_id가 888인 정류장 군집에서 시작하는 이동패턴
# 색깔별로 해당 군집에서 시작된 모든 이동패턴을 나타낸다. 

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data2': 'cluster_id,cluster_group,cluster_target,cluster_longitude,cluster_latitude,tour_geton…

In [48]:
# 경로 크기 설정
size = 3

map_4 = KeplerGl()
map_5 = KeplerGl()
map_6 = KeplerGl()

# just for test

# cluster_id = 1
for i in range(0, 2):
    csvData = pd.read_csv("data/visualize/data"+str(i)+".csv", encoding = "cp949").drop_duplicates('cluster_id').head(size)
    map_4.add_data(data=csvData, name='data'+str(i))
# cluster_id = 100
for i in range(6, 10):
    csvData = pd.read_csv("data/visualize/data"+str(i)+".csv", encoding = "cp949").drop_duplicates('cluster_id').head(size)
    map_5.add_data(data=csvData, name='data'+str(i))
#cluster_id = 888
for i in range(2, 7)

SyntaxError: invalid syntax (<ipython-input-48-3505f4327561>, line 11)