In [1]:
import os
project_path = "C:/workspace/Bus Project"
os.chdir(project_path)

In [2]:
from multiprocessing import Pool
import multiprocessing
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import folium
from functools import partial

import bus.analyzer as anz
import bus.stay as stay

In [3]:
# 데이터 기간 설정
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 6, 28)

# 로딩할 파일 명 리스트 생성
input_path_list = anz.make_input_path(start_date, end_date)

station_usage_df = anz.parallel_load_total_usage_df(input_path_list, core=8)

# 데이터 로드
user_df = anz.load_user_df()
station_df = anz.load_station_df()
cluster_df = anz.load_cluster_df()
cluster_station_df = anz.load_cluster_station_df()

# 정류장 간 이동 데이터 -> 군집간 이동 데이터
cluster_usage_df = stay.create_cluster_usage_df(station_usage_df, cluster_station_df)

In [4]:
# 체류시간 추출
stay_df = stay.get_walk_df(cluster_usage_df)
# 결측값 제거
stay_df = stay_df.dropna()
# 관광객 데이터 추출
tourist_stay_df = stay.fillter_usage_df(stay_df, user_df, tourist=True)

In [5]:
tourist_stay_df['stay_time'] = tourist_stay_df["geton_datetime"] - tourist_stay_df["getoff_datetime"]
tourist_stay_df = tourist_stay_df[tourist_stay_df['stay_time'].apply(lambda x : x.seconds/60 <= 1000)]
tourist_stay_df = tourist_stay_df[tourist_stay_df['stay_time'].apply(lambda x : x.seconds/60 >= 30)]

In [6]:
tourist_stay_df.sort_values(by="stay_time")

Unnamed: 0,user_id,getoff_cluster_id,getoff_datetime,geton_cluster_id,geton_datetime,stay_time
2038631,e6a05d7aac325bb0b946bcbd455edec553c602751b843f...,2.0,2019-06-26 17:37:48,3.0,2019-06-26 18:07:48,0 days 00:30:00
55497,0615b21b4d43f02aafdf647314dbbaa19f89320b491f38...,1.0,2019-06-15 14:37:00,1.0,2019-06-15 15:07:00,0 days 00:30:00
1697792,bfea0a55731cb64cfa0a53793d089ae857f3458ee6fea3...,47.0,2019-06-18 16:19:16,1024.0,2019-06-18 16:49:16,0 days 00:30:00
1667097,bc73d27efab86d55f3c480ad343cd58e779f1d7dcb13d9...,97.0,2019-06-21 17:01:31,97.0,2019-06-21 17:31:31,0 days 00:30:00
608588,44dd542aaea297e40d87843c06b5fa1268a6dd76f3667c...,1014.0,2019-06-18 11:48:43,1014.0,2019-06-18 12:18:43,0 days 00:30:00
...,...,...,...,...,...,...
2022988,e4f658201b9c7e523fadcd19b00df79be226d7e8ebb67f...,95.0,2019-06-12 18:54:42,109.0,2019-06-16 07:16:14,3 days 12:21:32
849874,6053a7c27ccc1c27fa4fe5bbaf50cb1f15e7bf9e39c892...,0.0,2019-06-06 07:26:26,0.0,2019-06-09 20:42:50,3 days 13:16:24
1039481,75633257a86b917c3795a5f4bd9ba6e039ba44d93a4923...,283.0,2019-06-04 22:28:56,6.0,2019-06-08 14:40:37,3 days 16:11:41
506844,39537be716b98e17d17dce57bf9380f7ed17e6a285bdc9...,1.0,2019-06-13 19:46:53,109.0,2019-06-17 12:03:45,3 days 16:16:52


In [35]:
temp_stay_df = tourist_stay_df[["getoff_cluster_id", "geton_cluster_id", "stay_time"]].groupby(by=["getoff_cluster_id", "geton_cluster_id"]).sum()
temp_count_df =tourist_stay_df[["getoff_cluster_id", "geton_cluster_id", "stay_time"]].groupby(by=["getoff_cluster_id", "geton_cluster_id"]).count().rename(columns={"stay_time":"count"})
df = pd.merge(temp_stay_df, temp_count_df, on=['getoff_cluster_id', 'geton_cluster_id'])
df['stay_time'] = df['stay_time'].apply(lambda x : x.seconds/60)
df['stay_time'] = df['stay_time']/df['count']

In [36]:
df = df.reset_index()
df

Unnamed: 0,getoff_cluster_id,geton_cluster_id,stay_time,count
0,0.0,0.0,12.334644,89
1,0.0,1.0,73.745455,11
2,0.0,3.0,208.791667,2
3,0.0,8.0,274.861111,3
4,0.0,12.0,321.533333,1
...,...,...,...,...
3256,1503.0,867.0,73.358333,2
3257,1505.0,992.0,199.412500,4
3258,1505.0,1179.0,378.983333,1
3259,1513.0,1475.0,122.650000,1


In [None]:
class ClusterManager:
    
    def __init__(self, cluster_df):
        self.cluster_df = cluster_df
    
    def set_cluster_df(self, cluster_df):
        self.cluster_df = cluster_df
    def extract_cluster_by_id(self, cluster_id):
        return cluster_df[cluster_df["cluster_id"] == cluster_id]

    def get_location_from_cluster(self, cluster):
        return cluster[["cluster_longitude", "cluster_latitude"]].values[0]

    def set_dist_clolums_from_two_clustes(self, id1, id2, longitude = "cluster_longitude", latitude = "cluster_latitude"):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        cluster_df = self.cluster_df
        selector = list(cluster_df.columns)
        cluster_df['dist1_x'] = (cluster_df["cluster_longitude"] - x1)**2
        cluster_df['dist1_y'] = (cluster_df[latitude] - y1)**2
        cluster_df['dist2_x'] = (cluster_df["cluster_longitude"] - x2)**2
        cluster_df['dist2_y'] = (cluster_df[latitude] - y2)**2

        cluster_df['dist1'] = (cluster_df['dist1_x'] + cluster_df['dist1_y'])**(1/2)
        cluster_df['dist2'] = (cluster_df['dist2_x'] + cluster_df['dist2_y'])**(1/2)
        cluster_df['dist'] = cluster_df['dist1'] + cluster_df['dist2']
        cluster_df['dist'] = cluster_df['dist']*6500000/360
        cluster_df['dist'] = cluster_df['dist'].apply(lambda x : int(x))
        if "dist" not in selector:
            selector.append("dist")
        cluster_df = cluster_df[selector]
        cluster_df = cluster_df.sort_values(by="dist")

    def get_dist(self, id1, id2):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        return ((x1-x2)**2+(y1-y2)**2)**(1/2)*6500000/360

    def get_column_filter(self):
        return self.cluster_df.columns
    
    def filter_column(self, column_filter):
        self.cluster_df = self.cluster_df[column_filter]
    
    def get_stay_area_flag_list(self, id1, id2):
        column_filter = self.get_column_filter()
        dist = self.get_dist(id1, id2)
        self.set_dist_clolums_from_two_clustes(id1, id2)
        stay_area_flag_list = self.cluster_df['dist'] <= dist*1.01
        
        self.filter_column(column_filter)
        return stay_area_flag_list
    
    def get_stay_area_df(self, id1, id2):
        return self.cluster_df[self.get_stay_area_flag_list(id1, id2)]
    
    def get_cluster_map(self):
        return get_cluster_map(self.cluster_df)
    
    def get_cluster_map(self, df):
        center = [df["cluster_latitude"].mean(), df["cluster_longitude"].mean()]
        map = folium.Map(location=center, zoom_start=10)

        for i in df.index:
            folium.CircleMarker([df.loc[i, "cluster_latitude"], df.loc[i, "cluster_longitude"]], color = 'blue', weight = 5, radius=1).add_to(map)

        return map
    
    def get_stay_area_map(self, id1, id2):
        stay_area_df = self.get_stay_area_df(id1, id2)
        map = self.get_cluster_map(stay_area_df)
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)
        folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)
        return map

    def get_stay_cluster_infor(self, stay_df):
        stay_time_table = self.cluster_df[['cluster_id']]
        stay_time_table['count'] = 0
        stay_time_table['stay_time'] = 0
        
        for i in tqdm(range(len(stay_df))):
            start_id = stay_df["getoff_cluster_id"][i]
            end_id = stay_df["geton_cluster_id"][i]
            stay_time = stay_df['stay_time'][i]
            self.acc_stay_time(stay_time_table, start_id, end_id, stay_time)

        stay_time_table["stay_time"] = stay_time_table[["stay_time", "count"]].apply(lambda x : (x[0]/x[1]) if (x[1] != 0) else x[0], axis = 1)
        
        return stay_time_table[['cluster_id', 'stay_time', 'count']]
    
    def acc_stay_time(self, stay_time_table, start_id, end_id, stay_time):
        stay_area_flag_list = self.get_stay_area_flag_list(start_id, end_id)
        
        stay_time_table['flag'] = stay_area_flag_list
        cluster_num = stay_area_flag_list.value_counts()[True]
        stay_time = stay_time/cluster_num
        stay_time_table['stay_time'] = stay_time_table[['flag', 'stay_time']].apply(lambda x : x[1]+stay_time if x[0] else x[1], axis = 1)
        stay_time_table['count'] = stay_time_table[['flag', 'count']].apply(lambda x : x[1]+1 if x[0] else x[1], axis = 1)
        
cm = ClusterManager(cluster_df)
# cm.get_stay_area_df(1, 10)
cm.get_stay_area_map(757, 41)
# c = cm.get_stay_area_flag_list(1, 1)
# cm.get_stay_area_flag_table()
d = cm.get_stay_cluster_infor(df)



In [10]:
# d.to_csv("data/analysis/citizen_cluster_stay_df.csv")
# d = pd.read_csv("data/analysis/cluster_stay_df.csv", encoding = "cp949")
# d = pd.read_csv("data/analysis/citizen_cluster_stay_df.csv")
d = pd.read_csv("data/analysis/tourist_cluster_stay_df.csv")

In [11]:
d1 = d

In [12]:
d1.sort_values(by='count')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,cluster_id,stay_time,count
679,679,679,679,679,0.000000,0
845,845,845,845,845,0.000000,0
846,846,846,846,846,0.000000,0
839,839,839,839,839,0.000000,0
795,795,795,795,795,0.000000,0
...,...,...,...,...,...,...
306,306,306,306,306,0.002273,946
32,32,32,32,32,0.002853,947
100,100,100,100,100,0.002513,956
3,3,3,3,3,0.004964,988
