In [1]:
import os
project_path = "C:/workspace/Bus Project"
os.chdir(project_path)

In [2]:
from multiprocessing import Pool
import multiprocessing
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import folium
from functools import partial

import bus.analyzer as anz
import bus.stay as stay

In [3]:
# 데이터 기간 설정
start_date = datetime.datetime(2019, 7, 1)
end_date = datetime.datetime(2019, 7, 28)

# 로딩할 파일 명 리스트 생성
input_path_list = anz.make_input_path(start_date, end_date)

station_usage_df = anz.parallel_load_total_usage_df(input_path_list, core=8)

# 데이터 로드
user_df = anz.load_user_df()
station_df = anz.load_station_df()
cluster_df = anz.load_cluster_df()
cluster_station_df = anz.load_cluster_station_df()

# 정류장 간 이동 데이터 -> 군집간 이동 데이터
cluster_usage_df = stay.create_cluster_usage_df(station_usage_df, cluster_station_df)

In [6]:
# 체류시간 추출
stay_df = stay.get_walk_df(cluster_usage_df)
# 결측값 제거
stay_df = stay_df.dropna()
# 관광객 데이터 추출
tourist_stay_df = stay.fillter_usage_df(stay_df, user_df, tourist=True)

In [7]:
tourist_stay_df['stay_time'] = tourist_stay_df["geton_datetime"] - tourist_stay_df["getoff_datetime"]
tourist_stay_df = tourist_stay_df[tourist_stay_df['stay_time'].apply(lambda x : x.seconds/60 <= 300)]
tourist_stay_df = tourist_stay_df[tourist_stay_df['stay_time'].apply(lambda x : x.seconds/60 >= 30)]

In [8]:
tourist_stay_df.sort_values(by="stay_time")

Unnamed: 0,user_id,getoff_cluster_id,getoff_datetime,geton_cluster_id,geton_datetime,stay_time
867948,5c7c25a6b19ee235aeb01468b0358dd4ffa111a55298d0...,46.0,2019-07-26 15:36:55,46,2019-07-26 16:06:55,0 days 00:30:00
719699,4cac58cddb5923390fdddc91ef97d8c82c706997dc9d8e...,569.0,2019-07-17 08:10:54,569,2019-07-17 08:40:54,0 days 00:30:00
1370867,918f1b8cf0d29c8b1078072564722b7abeb4355d23c5ef...,0.0,2019-07-02 12:29:02,0,2019-07-02 12:59:02,0 days 00:30:00
1067415,716fc98f916399a79c47a812df98e7262970d7f4069480...,6.0,2019-07-06 15:52:08,6,2019-07-06 16:22:08,0 days 00:30:00
1814595,c0c1c7d43e9944c9a1dc7ace1df81ce731f489be2c50f7...,11.0,2019-07-02 11:42:57,10,2019-07-02 12:12:57,0 days 00:30:00
...,...,...,...,...,...,...
1154436,7aced2520cb40eecd07fe0328bd7cc0a6a2c0ded44e883...,73.0,2019-07-18 14:31:19,107,2019-07-21 17:30:11,3 days 02:58:52
1544246,a3e21d422ad778fdae9634acbf3d1d9e5ce4f34e6fd988...,5.0,2019-07-03 11:40:15,0,2019-07-06 15:38:28,3 days 03:58:13
1613815,ab49cdbe14508422df0ad7e6678d49ff02f74dc6d960b4...,5.0,2019-07-03 11:40:16,0,2019-07-06 15:38:39,3 days 03:58:23
1587949,a88a201ae4b0eb86c933d5b7657dbc82f693427a6732d4...,557.0,2019-07-18 12:11:11,557,2019-07-22 14:09:32,4 days 01:58:21


In [9]:
temp_stay_df = tourist_stay_df[["getoff_cluster_id", "geton_cluster_id", "stay_time"]].groupby(by=["getoff_cluster_id", "geton_cluster_id"]).sum()
temp_count_df =tourist_stay_df[["getoff_cluster_id", "geton_cluster_id", "stay_time"]].groupby(by=["getoff_cluster_id", "geton_cluster_id"]).count().rename(columns={"stay_time":"count"})
df = pd.merge(temp_stay_df, temp_count_df, on=['getoff_cluster_id', 'geton_cluster_id'])
df['stay_time'] = df['stay_time'].apply(lambda x : x.seconds/60)
df['stay_time'] = df['stay_time']/df['count']

In [10]:
df = df.reset_index()
df

Unnamed: 0,getoff_cluster_id,geton_cluster_id,stay_time,count
0,0.0,0,0.525076,662
1,0.0,1,16.470952,35
2,0.0,2,43.137500,28
3,0.0,3,0.925000,52
4,0.0,4,8.850000,30
...,...,...,...,...
4486,959.0,646,82.616667,1
4487,961.0,83,221.816667,2
4488,961.0,89,93.208333,2
4489,961.0,543,167.900000,1


In [11]:
class ClusterManager:
    
    def __init__(self, cluster_df):
        self.cluster_df = cluster_df
    
    def set_cluster_df(self, cluster_df):
        self.cluster_df = cluster_df
    def extract_cluster_by_id(self, cluster_id):
        return cluster_df[cluster_df["cluster_id"] == cluster_id]

    def get_location_from_cluster(self, cluster):
        return cluster[["cluster_longitude", "cluster_latitude"]].values[0]

    def set_dist_clolums_from_two_clustes(self, id1, id2, longitude = "cluster_longitude", latitude = "cluster_latitude"):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        cluster_df = self.cluster_df
        selector = list(cluster_df.columns)
        cluster_df['dist1_x'] = (cluster_df["cluster_longitude"] - x1)**2
        cluster_df['dist1_y'] = (cluster_df[latitude] - y1)**2
        cluster_df['dist2_x'] = (cluster_df["cluster_longitude"] - x2)**2
        cluster_df['dist2_y'] = (cluster_df[latitude] - y2)**2

        cluster_df['dist1'] = (cluster_df['dist1_x'] + cluster_df['dist1_y'])**(1/2)
        cluster_df['dist2'] = (cluster_df['dist2_x'] + cluster_df['dist2_y'])**(1/2)
        cluster_df['dist'] = cluster_df['dist1'] + cluster_df['dist2']
        cluster_df['dist'] = cluster_df['dist']*6500000/360
        cluster_df['dist'] = cluster_df['dist'].apply(lambda x : int(x))
        if "dist" not in selector:
            selector.append("dist")
        cluster_df = cluster_df[selector]
        cluster_df = cluster_df.sort_values(by="dist")

    def get_dist(self, id1, id2):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        return ((x1-x2)**2+(y1-y2)**2)**(1/2)*6500000/360

    def get_column_filter(self):
        return self.cluster_df.columns
    
    def filter_column(self, column_filter):
        self.cluster_df = self.cluster_df[column_filter]
    
    def get_stay_area_flag_list(self, id1, id2):
        column_filter = self.get_column_filter()
        dist = self.get_dist(id1, id2)
        self.set_dist_clolums_from_two_clustes(id1, id2)
        stay_area_flag_list = self.cluster_df['dist'] <= dist*1.01
        self.filter_column(column_filter)
        return stay_area_flag_list
    
    def get_stay_area_df(self, id1, id2):
        return self.cluster_df[self.get_stay_area_flag_list(id1, id2)]
    
    def get_cluster_map(self):
        return get_cluster_map(self.cluster_df)
    
    def get_cluster_map(self, df):
        center = [df["cluster_latitude"].mean(), df["cluster_longitude"].mean()]
        map = folium.Map(location=center, zoom_start=10)

        for i in df.index:
            folium.CircleMarker([df.loc[i, "cluster_latitude"], df.loc[i, "cluster_longitude"]], color = 'blue', weight = 3, radius=1).add_to(map)

        return map
    
    def get_stay_area_map(self, id1, id2):
        stay_area_df = self.get_stay_area_df(id1, id2)
        map = self.get_cluster_map(stay_area_df)
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)
        folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)
        return map

    def get_stay_cluster_infor(self, stay_df):
        stay_time_table = self.cluster_df[['cluster_id']]
        stay_time_table['count'] = 0
        stay_time_table['stay_time'] = 0
        
        for i in tqdm(range(len(stay_df))):
            start_id = stay_df["getoff_cluster_id"][i]
            end_id = stay_df["geton_cluster_id"][i]
            stay_time = stay_df['stay_time'][i]
            self.acc_stay_time(stay_time_table, start_id, end_id, stay_time)

        stay_time_table["stay_time"] = stay_time_table[["stay_time", "count"]].apply(lambda x : (x[0]/x[1]) if (x[1] != 0) else x[0], axis = 1)
        
        return stay_time_table[['cluster_id', 'stay_time', 'count']]
    
    def acc_stay_time(self, stay_time_table, start_id, end_id, stay_time):
        stay_area_flag_list = self.get_stay_area_flag_list(start_id, end_id)
        
        stay_time_table['flag'] = stay_area_flag_list
        cluster_num = stay_area_flag_list.value_counts()[True]
        stay_time = stay_time/cluster_num
        stay_time_table['stay_time'] = stay_time_table[['flag', 'stay_time']].apply(lambda x : x[1]+stay_time if x[0] else x[1], axis = 1)
        stay_time_table['count'] = stay_time_table[['flag', 'count']].apply(lambda x : x[1]+1 if x[0] else x[1], axis = 1)
        
cm = ClusterManager(cluster_df)
# cm.get_stay_area_df(1, 10)
cm.get_stay_area_map(757, 41)
# c = cm.get_stay_area_flag_list(1, 1)
# cm.get_stay_area_flag_table()
d = cm.get_stay_cluster_infor(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stay_time_table['count'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stay_time_table['stay_time'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['dist1_x'] = (cluster_df["cluster_longitude"] - x1)**2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

KeyboardInterrupt: 

In [23]:
# d.to_csv("data/analysis/citizen_cluster_stay_df.csv")
# d = pd.read_csv("data/analysis/cluster_stay_df.csv", encoding = "cp949")
# d = pd.read_csv("data/analysis/citizen_cluster_stay_df.csv")
d = pd.read_csv("data/analysis/tourist_cluster_stay_df.csv")

In [24]:
d1 = d

In [25]:
d1.sort_values(by='count')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,cluster_id,stay_time,count
679,679,679,679,679,0.000000,0
845,845,845,845,845,0.000000,0
846,846,846,846,846,0.000000,0
839,839,839,839,839,0.000000,0
795,795,795,795,795,0.000000,0
...,...,...,...,...,...,...
306,306,306,306,306,0.002273,946
32,32,32,32,32,0.002853,947
100,100,100,100,100,0.002513,956
3,3,3,3,3,0.004964,988


In [26]:
d1["stay_time"] = d[["stay_time", "count"]].apply(lambda x : (x[0]/x[1]) if (x[1] != 0) else x[0], axis = 1)


In [27]:
d1 = d1.sort_values(by='stay_time')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,cluster_id,stay_time,count
795,795,795,795,795,0.000000,0
725,725,725,725,725,0.000000,0
846,846,846,846,846,0.000000,0
845,845,845,845,845,0.000000,0
844,844,844,844,844,0.000000,0
...,...,...,...,...,...,...
583,583,583,583,583,9.117778,1
636,636,636,636,636,15.238542,2
812,812,812,812,812,37.072917,1
835,835,835,835,835,37.072917,1


In [31]:
f = d1.sort_values(by="stay_time").tail(10)
cm.get_cluster_map(pd.merge(cluster_df, f[["cluster_id"]], on="cluster_id", how='inner'))

In [18]:
map = self.get_cluster_map(stay_area_df)
location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)
folium.CircleMarker(location1, color = 'red', weight = 10, radius=3).add_to(map)

NameError: name 'self' is not defined