In [1]:
import collections

In [2]:
import os
# TODO: set this to the project root path before running the notebook.
# NOTE(review): this is an absolute, machine-specific path; os.chdir raises if
# the directory does not exist on the current machine.
# project_path = "D:/workspace/Bus Project"
project_path = "/Users/jade/git/capstone/jeju-bus-tag_data-analysis"
os.chdir(project_path)

In [9]:
import os
import datetime
import requests
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN
import xml.etree.ElementTree as elemTree
from multiprocessing import Pool

import bus.analyzer as anz

In [None]:
# Load the station, user, cluster and cluster-station tables through the
# project's analyzer helpers (presumably each returns a DataFrame — confirm
# against bus.analyzer).
# NOTE(review): this cell binds `station_cluster_df`, while
# create_cluster_usage_df reads a global named `cluster_station_df`.
station_df = anz.load_station_df()
user_df = anz.load_user_df()
cluster_df = anz.load_cluster_df()
station_cluster_df = anz.load_cluster_station_df()

In [None]:
# station_df
# user_df
# cluster_df
# station_cluster_df

In [None]:
def parallelize_dataframe(func, df, core = multiprocessing.cpu_count()-4):
    """Apply ``func`` to chunks of ``df`` in worker processes and concatenate.

    Args:
        func: Picklable callable that receives one chunk and returns a pandas
            object (the results are combined with ``pd.concat``).
        df: Data to split; anything ``np.array_split`` accepts (the notebook
            passes a list of file paths).
        core: Number of chunks and worker processes. Values below 1 are
            raised to 1.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks, in chunk order.
    """
    # The default (cpu_count - 4) is zero or negative on machines with four or
    # fewer CPUs, which would make both np.array_split and Pool raise.
    core = max(1, int(core))

    df_split = np.array_split(df, core)
    pool = Pool(core)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [None]:
def load_total_usage_data(input_path_list):
    """Read the given usage CSV files into one cleaned DataFrame.

    Each file is read as cp949-encoded CSV. Rows without a boarding-station
    longitude or latitude are dropped, and the ``geton_datetime`` /
    ``getoff_datetime`` columns are parsed with the ``%Y%m%d%H%M%S`` format.

    Args:
        input_path_list: Sequence of CSV file paths (a list or a NumPy array
            chunk).

    Returns:
        The combined DataFrame, or an empty DataFrame when no paths are given.
    """
    # len() instead of truthiness: the argument may be a NumPy array, whose
    # truth value is ambiguous. An empty chunk yields an empty frame so that
    # the result can still be passed to pd.concat.
    if len(input_path_list) == 0:
        return pd.DataFrame()

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame for every file.
    frames = [
        pd.read_csv(file_path, low_memory=False, encoding="cp949")
        for file_path in tqdm(input_path_list)
    ]
    usage_df = pd.concat(frames, sort=False, ignore_index=True)

    has_coordinates = (
        usage_df["geton_station_longitude"].notnull()
        & usage_df["geton_station_latitude"].notnull()
    )
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one.
    usage_df = usage_df[has_coordinates].copy()

    # Convert the timestamp columns to datetime64.
    usage_df['geton_datetime'] = pd.to_datetime(usage_df['geton_datetime'], format='%Y%m%d%H%M%S')
    usage_df['getoff_datetime'] = pd.to_datetime(usage_df['getoff_datetime'], format='%Y%m%d%H%M%S')

    return usage_df

In [None]:
# Set the date range of the data to load
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

# Build the list of input file paths to load for that range
input_path_list = anz.make_input_path(start_date, end_date)

# Split the path list into 10 chunks and load them in 10 worker processes.
# NOTE(review): this uses anz.load_total_usage_data, not the function of the
# same name defined in this notebook — confirm which one is intended.
station_usage_df = parallelize_dataframe(anz.load_total_usage_data, input_path_list, core = 10)

# Load the lookup tables; `cluster_station_df` is read as a global by
# create_cluster_usage_df.
user_df = anz.load_user_df()
station_df = anz.load_station_df()
cluster_df = anz.load_cluster_df()
cluster_station_df = anz.load_cluster_station_df()

In [None]:
def create_cluster_usage_df(station_usage_df):
    """Attach cluster information to both ends of every trip.

    Keeps only the user id, boarding/alighting time and boarding/alighting
    station id of each trip, then left-joins the module-level
    ``cluster_station_df`` twice: once with every column prefixed ``geton_``
    (joined on ``geton_station_id``) and once prefixed ``getoff_`` (joined on
    ``getoff_station_id``). ``cluster_station_df`` therefore needs a
    ``station_id`` column.

    Args:
        station_usage_df: Trip records containing the five selected columns.

    Returns:
        The trips with the prefixed cluster columns appended; trips whose
        station has no match keep missing values in those columns.
    """
    trip_columns = ['user_id', 'geton_datetime', 'geton_station_id', 'getoff_datetime', 'getoff_station_id']
    trips = station_usage_df[trip_columns]

    def prefixed(prefix):
        # Copy of the cluster/station table with `prefix` on every column.
        renames = {name: prefix + name for name in cluster_station_df.columns}
        return cluster_station_df.rename(columns=renames)

    with_geton = pd.merge(trips, prefixed("geton_"), on=['geton_station_id'], how="left")
    return pd.merge(with_geton, prefixed("getoff_"), on=['getoff_station_id'], how="left")

# Build the trip table annotated with boarding/alighting cluster columns.
cluster_usage_df = create_cluster_usage_df(station_usage_df)

In [None]:
def extract_usage(cluster_usage_df, user_df, tourist=True):
    """Keep only the trips made by users whose ``tourist`` flag equals ``tourist``.

    The trips are left-joined with ``user_df`` on ``user_id`` to look up the
    flag; the result is then cut back to the columns of ``cluster_usage_df``.
    Trips of users missing from ``user_df`` have no flag and are never
    selected.

    Args:
        cluster_usage_df: Trip records with a ``user_id`` column.
        user_df: User table with ``user_id`` and ``tourist`` columns.
        tourist: Flag value to select.

    Returns:
        The matching trips, with the same columns as ``cluster_usage_df``.
    """
    original_columns = cluster_usage_df.columns
    with_user_info = cluster_usage_df.merge(user_df, on="user_id", how="left")
    in_requested_group = with_user_info["tourist"] == tourist
    return with_user_info.loc[in_requested_group, original_columns]
    
# Keep only the trips of users flagged as tourists.
# NOTE(review): this overwrites cluster_usage_df, so the unfiltered table is
# no longer available after this cell.
cluster_usage_df = extract_usage(cluster_usage_df, user_df, tourist=True)

In [None]:
def extract_usage_grouped_user(cluster_usage_df):
    """Split the trip table into one DataFrame per user.

    Uses a single ``groupby`` pass instead of filtering the whole table once
    per user. Users appear in order of their first row, and each per-user
    frame keeps the original row order and index labels. Rows whose
    ``user_id`` is missing are left out (groupby drops missing keys), so no
    empty frame is produced for them.

    Args:
        cluster_usage_df: Trip records with a ``user_id`` column.

    Returns:
        A list of DataFrames, one per distinct non-missing ``user_id``.
    """
    grouped = cluster_usage_df.groupby("user_id", sort=False)
    # total= lets the progress bar show a percentage for the group iterator.
    return [user_rows for _, user_rows in tqdm(grouped, total=grouped.ngroups)]

# One DataFrame of trips per user; iterated when building the pattern trie.
usage_list = extract_usage_grouped_user(cluster_usage_df)

In [11]:
class Node:
    """One node of the pattern trie.

    Attributes are set in ``__init__``:
        colony: Identifier stored on this node (the notebook inserts cluster
            ids; the trie root is created with -1).
        end_of_pattern: Whether an inserted pattern ends at this node.
        count: Visit counter maintained by the trie; starts at 0.
        next: Child nodes, keyed by the colony visited next.
    """

    def __init__(self, colony):
        # Children keyed by the next visited colony.
        self.next = dict()
        self.count = 0
        self.end_of_pattern = False
        self.colony = colony

In [59]:
class Trie(object):
    """Prefix tree of visit patterns (sequences of colony ids).

    Every node below the root stands for one colony reached through a
    specific prefix. ``count`` on a node is the number of inserted patterns
    that pass through it, and ``end_of_pattern`` marks nodes at which some
    inserted pattern stopped.

    The ``query*`` methods return continuations: lists of the colonies that
    follow the queried first colony. The first colony itself is not repeated
    in the returned lists.
    """

    def __init__(self):
        # Sentinel root; real colonies hang below it.
        self.root = Node(-1)
        # Result buffer filled by the dfs helpers; every query resets it.
        self.output = []

    def insert(self, pattern):
        """Add one pattern (an iterable of colony ids) to the trie.

        Each node along the path, whether new or existing, has its ``count``
        increased by one, and the final node is flagged as the end of a
        pattern. An empty pattern only flags the root.
        """
        node = self.root

        for colony in pattern:
            child = node.next.get(colony)
            if child is None:
                child = Node(colony)
                node.next[colony] = child
            # Count the visit on the node being entered, for new and existing
            # children alike.
            child.count += 1
            node = child

        node.end_of_pattern = True

    def dfs(self, node, pattern):
        """Collect every stored pattern found at or below ``node``.

        ``pattern`` is the path that led to ``node``; it is copied, never
        mutated. For each node flagged ``end_of_pattern`` the path to it is
        appended to ``self.output``, and the walk continues below it so that
        longer patterns sharing the prefix are reported too. The walk uses an
        explicit stack, so long patterns cannot hit the recursion limit.
        """
        stack = [(node, list(pattern))]
        while stack:
            current, path = stack.pop()
            if current.end_of_pattern:
                self.output.append(path)
            # Push in reverse so children are visited in insertion order.
            for child in reversed(list(current.next.values())):
                stack.append((child, path + [child.colony]))

    def query(self, cluster_x):
        """Return all stored continuations of patterns starting at ``cluster_x``.

        Returns an empty list when no pattern starts with ``cluster_x``. A
        pattern consisting of ``cluster_x`` alone contributes an empty list.
        """
        self.output = []
        start = self.root.next.get(cluster_x)

        if start is not None:
            self.dfs(start, [])

        return self.output

    def dfs_n(self, node, pattern, n, i):
        """Collect paths below ``node`` that are at most ``i`` colonies longer.

        ``pattern`` is the path that led to ``node`` (copied, not mutated) and
        ``i`` is the number of further colonies that may still be added. A
        path is appended to ``self.output`` when the budget is used up or the
        node has no children. ``n`` (the original limit) is accepted for
        interface compatibility and is not otherwise used.
        """
        stack = [(node, list(pattern), i)]
        while stack:
            current, path, remaining = stack.pop()
            if remaining <= 0 or not current.next:
                self.output.append(path)
                continue
            # Each branch gets its own budget, so siblings do not consume
            # each other's remaining steps.
            for child in reversed(list(current.next.values())):
                stack.append((child, path + [child.colony], remaining - 1))

    def query_upto_n_clusters(self, cluster_x, n):
        """Return continuations of ``cluster_x`` truncated to at most ``n`` colonies.

        Paths that run out of children before ``n`` colonies are returned at
        their full (shorter) length. Returns an empty list when no pattern
        starts with ``cluster_x``.
        """
        self.output = []
        start = self.root.next.get(cluster_x)

        if start is not None:
            self.dfs_n(start, [], n, n)

        return self.output

    def dfs_p(self, node, pattern):
        """Follow the most-visited children from ``node`` until a pattern ends.

        ``pattern`` is the path that led to ``node`` (copied, not mutated).
        At every step only the children with the highest ``count`` are
        followed; ties are all followed. A path is appended to
        ``self.output`` when it reaches a node flagged ``end_of_pattern`` or
        a node without children.
        """
        stack = [(node, list(pattern))]
        while stack:
            current, path = stack.pop()
            if current.end_of_pattern or not current.next:
                self.output.append(path)
                continue
            top_count = max(child.count for child in current.next.values())
            favourites = [child for child in current.next.values() if child.count == top_count]
            for child in reversed(favourites):
                stack.append((child, path + [child.colony]))

    def query_by_popularity(self, cluster_x):
        """Return the most popular continuation(s) of ``cluster_x``.

        Returns an empty list when no pattern starts with ``cluster_x``.
        """
        self.output = []
        start = self.root.next.get(cluster_x)

        if start is not None:
            self.dfs_p(start, [])

        return self.output

In [58]:
def check_pattern_validity(prev_getoff_time, cur_geton_time):
    """Decide whether a trip continues the pattern of the previous trip.

    Placeholder: both arguments are ignored and every pair is accepted.
    NOTE(review): presumably meant to compare the gap between the previous
    alighting time and the current boarding time against a limit — confirm
    the intended rule.
    """
    return True
    
def get_patterns(user, pattern_candidates):
    """Translate candidate patterns from row labels into cluster ids.

    Args:
        user: One user's trips, indexable as ``user[column][label]`` with the
            columns ``geton_cluster_id`` and ``getoff_cluster_id``.
        pattern_candidates: List of candidates, each a list of row labels of
            ``user``.

    Returns:
        One list per candidate, holding the boarding and then the alighting
        cluster id of every trip in that candidate, in order. An empty
        ``pattern_candidates`` gives an empty list.
    """
    patterns = []

    for candidate in pattern_candidates:
        # Start a fresh list for every candidate; a fixed one-element result
        # list cannot hold a second candidate.
        pattern = []
        for idx in candidate:
            pattern.append(user['geton_cluster_id'][idx])
            pattern.append(user['getoff_cluster_id'][idx])
        patterns.append(pattern)

    return patterns
        
            
def get_all_pattern_candidates(user):
    """Split one user's trips into consecutive runs of row labels.

    Trips are taken in the row order of ``user``. The first trip always
    opens the first run. Every later trip is compared with the trip before
    it through ``check_pattern_validity(previous getoff time, current geton
    time)``: it joins the current run when that returns true and opens a new
    run otherwise.

    Args:
        user: DataFrame of one user's trips with ``geton_datetime`` and
            ``getoff_datetime`` columns.

    Returns:
        A list of runs, each a list of index labels of ``user``. An empty
        ``user`` gives an empty list.
    """
    pattern_candidates = []
    geton_times = user['geton_datetime']
    getoff_times = user['getoff_datetime']
    prev_getoff_time = None

    for position, label in enumerate(user.index):
        cur_geton_time = geton_times.iloc[position]

        # The first trip has no predecessor, so it is not checked against
        # itself; it simply starts the first run.
        if position == 0 or not check_pattern_validity(prev_getoff_time, cur_geton_time):
            pattern_candidates.append([])
        pattern_candidates[-1].append(label)

        prev_getoff_time = getoff_times.iloc[position]

    return pattern_candidates

In [None]:
def draw_patterns(patterns):
    """Placeholder for drawing the given patterns; not implemented yet.

    The definition previously had no body, which is a syntax error. It now
    fails explicitly when called.

    Args:
        patterns: Patterns to draw (unused until this is implemented).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("draw_patterns is not implemented yet")

In [10]:
# Build one trie holding the cluster-id patterns of every user.
pattern_trie = Trie()

for user in usage_list:
    # patterns by index: runs of row labels taken from this user's trips
    pattern_candidates = get_all_pattern_candidates(user)
    # patterns by cluster_id: the same runs expressed as geton/getoff cluster ids
    patterns = get_patterns(user, pattern_candidates)
    
    for pattern in patterns:
        pattern_trie.insert(pattern)
    