In [15]:
import os
# Do! 프로젝트 root 경로로 설정
project_path = "C:/workspace/Bus Project"
os.chdir(project_path)

In [16]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

import bus.analyzer as anz

### 1) grouping function

In [17]:
def make_group_column(df):
    df['cluster_group'] = df['station_address'].apply(lambda x : x.split(" ")[1])
    return df

### 2) clustering function

In [18]:
target_s = 'cluster_target'
level_s = 'cluster_level'
earth_radius = 6371.0088 # 단위: km

In [19]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['station_id', 'station_name'], f_cols = ['station_longitude', 'station_latitude']):

    # clustering
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:,nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    # level 부여
    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [20]:
def get_noise_handled_result \
    (df, n, by1='station_id', by2='station_name', f_cols=['station_longitude', 'station_latitude']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df['cluster_target'] = -1
    temp_df['cluster_level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'cluster_target'] = i
            temp_df.loc[idx, 'cluster_level'] = n

    return temp_df

In [21]:
def get_noise_result(df, n):
    df.loc[:, 'cluster_level'] = n
    return df

In [63]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r2["cluster_target"] = r2["cluster_target"] + max(r1["cluster_target"])
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r3["cluster_target"] = r2["cluster_target"] + max(r2["cluster_target"])
    r4 = df_noise_group
    r4["cluster_target"] = -1
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

In [64]:
cluster_df = get_spatial_dbscan_result(eps, group_df_list[0])
cluster_df[cluster_df["cluster_target"] == 0]
# cluster_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r2["cluster_target"] = r2["cluster_target"] + max(r1["cluster_target"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r3["cluster_target"] = r2["cluster_target"] + max(r2["cluster_target"])
A value is trying to be set on a copy of a

Unnamed: 0,station_id,station_name,station_longitude,station_latitude,cluster_target,cluster_level
4,638.0,제주국제공항(구제주방면),126.49345,33.50661,0.0,1
6,1355.0,제주국제공항(신제주방면),126.49275,33.5061,0.0,1
20,2972.0,제주국제공항(종점),126.49356,33.50572,0.0,1
99,3290.0,"제주국제공항(대정,화순,일주서로)",126.49298,33.50644,0.0,1
121,3351.0,"제주국제공항(일주동로,516도로)",126.49335,33.50669,0.0,1
130,1579.0,제주국제공항(600번),126.49252,33.50577,0.0,1
136,3350.0,"제주국제공항(평화로,800번)",126.49272,33.50636,0.0,1


### main

In [68]:
station_df = anz.load_station_df()

station_df = station_df[station_df['station_id'] != 0] ####################

station_df = make_group_column(station_df)

group_list = list(station_df["cluster_group"].drop_duplicates())
group_df_list = []
for label in group_list:
    group_df = station_df[station_df["cluster_group"] == label]
    group_df_list.append(group_df)
    
eps_list = [105, 90]
# eps_list = [200, 200]
cluster_df_list = []

for i, group_df in enumerate(group_df_list):   
    eps = eps_list[i]
    # eps에 따른 dbscan 수행. => label OR noise 생성
    cluster_df = get_spatial_dbscan_result(eps, group_df)
    print(cluster_df)
    cluster_df['cluster_level'] = cluster_df['cluster_level'].astype(str)
    cluster_df['cluster_target'] = cluster_df['cluster_target'].astype(str)
    cluster_df['level_target'] = cluster_df['cluster_level'] + '&' + cluster_df['cluster_target']
    
    # noise 군집에 개별 레이블 부여
    cluster_df["cluster_target"] = cluster_df["cluster_target"].apply(lambda x : float(x))
    next_target = int(max(list(cluster_df["cluster_target"]))) + 1
    for j in cluster_df.index:
        if cluster_df.loc[j, "cluster_target"] == -1:
            cluster_df.loc[j, "cluster_target"] = next_target
            next_target += 1
    
    # 군집 위치 구하기
    cluster_df["cluster_longitude"] = 0
    cluster_df["cluster_latitude"] = 0
    target_list = set(cluster_df["cluster_target"])
    for target in target_list:
        target_df = cluster_df[cluster_df["cluster_target"] == target]
        cluster_longitude = target_df["station_longitude"].mean()
        cluster_latitude = target_df["station_latitude"].mean()
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_longitude"] = cluster_longitude
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_latitude"] = cluster_latitude

    cluster_df["cluster_group"] = group_list[i]    
    cluster_df_list.append(cluster_df)

base_id = 0
for i, cluster_df in enumerate(cluster_df_list):
    cluster_df["cluster_id"] = cluster_df["cluster_target"]+base_id
    length = len(cluster_df["cluster_target"].drop_duplicates())
    base_id += length
    
# 그룹별 데이터를 하나로 묶음
cluster_df = pd.concat(cluster_df_list)
cluster_df = cluster_df[["station_id", "cluster_id", "cluster_group","cluster_target", "cluster_longitude", "cluster_latitude"]]
cluster_station_df = cluster_df[["station_id", "cluster_id"]]
cluster_station_df.to_csv("data/analysis/cluster_station_df.csv", encoding="CP949", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r2["cluster_target"] = r2["cluster_target"] + max(r1["cluster_target"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r3["cluster_target"] = r2["cluster_target"] + max(r2["cluster_target"])
A value is trying to be set on a copy of a

      station_id   station_name  station_longitude  station_latitude  \
4          638.0  제주국제공항(구제주방면)          126.49345          33.50661   
5          149.0        제주버스터미널          126.51486          33.49993   
6         1355.0  제주국제공항(신제주방면)          126.49275          33.50610   
7         1564.0      제주시외버스터미널          126.51479          33.49946   
10         150.0        제주버스터미널          126.51471          33.50020   
...          ...            ...                ...               ...   
3579   6115011.0            묵리항          126.31181          33.94683   
3581   6115039.0      영흥리부녀회사무실          126.30011          33.95966   
3608   6115023.0             예초          126.33134          33.95359   
3613       922.0             월림          126.25608          33.34792   
3636      1429.0            멍중내          126.68860          33.50070   

      cluster_target cluster_level  
4                0.0             1  
5                1.0             1  
6                0.0    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r2["cluster_target"] = r2["cluster_target"] + max(r1["cluster_target"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r3["cluster_target"] = r2["cluster_target"] + max(r2["cluster_target"])
A value is trying to be set on a copy of a

      station_id      station_name  station_longitude  station_latitude  \
9         1957.0             중앙로터리          126.56058          33.25285   
16        1784.0          중앙로터리(동)          126.56135          33.25297   
24     6103005.0      서귀포시구시외버스터미널          126.56052          33.25217   
42        1699.0  제주월드컵경기장서귀포버스터미널          126.50900          33.24912   
51        2746.0            서귀포등기소          126.56015          33.25314   
...          ...               ...                ...               ...   
3590      1888.0               학림동          126.60706          33.29013   
3591      2212.0           수산1리사거리          126.88277          33.44238   
3596      2208.0              삼달1리          126.84528          33.37493   
3599      3472.0             안좌동입구          126.75970          33.35775   
3629      1997.0            예래초등학교          126.39113          33.24667   

      cluster_target cluster_level  
9                0.0             1  
16               0.0     

In [69]:
station_df = anz.load_station_df()
merged_df = pd.merge(station_df, cluster_df, on='station_id')
root = merged_df[['cluster_id', 'cluster_group', 'cluster_target', 'cluster_longitude', 'cluster_latitude']].drop_duplicates().sort_values(['cluster_group', 'cluster_target'])
branch = merged_df.groupby(by=["cluster_group", "cluster_target"]).sum().reset_index()[['cluster_group', 'cluster_target', 'tour_geton_usage', 'regident_geton_usage', 'tour_getoff_usage', 'regident_getoff_usage', 'total_usage']]
cluster_df = pd.merge(root, branch, on=['cluster_group', 'cluster_target'])
cluster_df_selector = {}
cluster_df = cluster_df.sort_values(by='cluster_id')
cluster_df.to_csv("data/analysis/cluster_df.csv", encoding="CP949", index = False)

In [71]:
dd = pd.merge(station_df, cluster_station_df, on="station_id")[["station_id", "cluster_id", "station_longitude", "station_latitude"]]
dd = dd[dd["cluster_id"] == 0]
dd

Unnamed: 0,station_id,cluster_id,station_longitude,station_latitude
4,638.0,0.0,126.49345,33.50661
6,1355.0,0.0,126.49275,33.5061
20,2972.0,0.0,126.49356,33.50572
99,3290.0,0.0,126.49298,33.50644
121,3351.0,0.0,126.49335,33.50669
130,1579.0,0.0,126.49252,33.50577
136,3350.0,0.0,126.49272,33.50636


In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import folium
import datetime
import random

import bus.analyzer as anz
class ClusterManager:
    
    def __init__(self, cluster_df):
        self.cluster_df = cluster_df
    
    def set_cluster_df(self, cluster_df):
        self.cluster_df = cluster_df
    def extract_cluster_by_id(self, cluster_id):
        return cluster_df[cluster_df["cluster_id"] == cluster_id]

    def get_location_from_cluster(self, cluster):
        return cluster[["cluster_longitude", "cluster_latitude"]].values[0]

    def set_dist_clolums_from_two_clustes(self, id1, id2, longitude = "cluster_longitude", latitude = "cluster_latitude"):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        cluster_df = self.cluster_df
        selector = list(cluster_df.columns)
        cluster_df['dist1_x'] = (cluster_df["cluster_longitude"] - x1)**2
        cluster_df['dist1_y'] = (cluster_df[latitude] - y1)**2
        cluster_df['dist2_x'] = (cluster_df["cluster_longitude"] - x2)**2
        cluster_df['dist2_y'] = (cluster_df[latitude] - y2)**2

        cluster_df['dist1'] = (cluster_df['dist1_x'] + cluster_df['dist1_y'])**(1/2)
        cluster_df['dist2'] = (cluster_df['dist2_x'] + cluster_df['dist2_y'])**(1/2)
        cluster_df['dist'] = cluster_df['dist1'] + cluster_df['dist2']
        cluster_df['dist'] = cluster_df['dist']*6500000/360
        cluster_df['dist'] = cluster_df['dist'].apply(lambda x : int(x))
        if "dist" not in selector:
            selector.append("dist")
        cluster_df = cluster_df[selector]
        cluster_df = cluster_df.sort_values(by="dist")

    def get_dist(self, id1, id2):
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        x1, y1 = location1
        x2, y2 = location2
        return ((x1-x2)**2+(y1-y2)**2)**(1/2)*6500000/360

    def get_column_filter(self):
        return self.cluster_df.columns
    
    def filter_column(self, column_filter):
        self.cluster_df = self.cluster_df[column_filter]
    
    def get_stay_area_flag_list(self, id1, id2):
        column_filter = self.get_column_filter()
        dist = self.get_dist(id1, id2)
        self.set_dist_clolums_from_two_clustes(id1, id2)
        stay_area_flag_list = self.cluster_df['dist'] <= dist*1.01
        self.filter_column(column_filter)
        return stay_area_flag_list
    
    def get_stay_area_df(self, id1, id2):
        return self.cluster_df[self.get_stay_area_flag_list(id1, id2)]
    
    def get_cluster_map(self):
        return get_cluster_map(self.cluster_df)
    
    def get_cluster_map(self, df):
        center = [df["station_latitude"].mean(), df["station_longitude"].mean()]
        map = folium.Map(location=center, zoom_start=10)

        for i in df.index:
            folium.CircleMarker([df.loc[i, "station_latitude"], df.loc[i, "station_longitude"]], color = 'blue', weight = 5, radius=1).add_to(map)

        return map
    
    def get_stay_area_map(self, id1, id2):
        stay_area_df = self.get_stay_area_df(id1, id2)
        location1 = self.get_location_from_cluster(self.extract_cluster_by_id(id1))
        location2 = self.get_location_from_cluster(self.extract_cluster_by_id(id2))
        map = self.get_cluster_map(stay_area_df)
        folium.CircleMarker([location1[1], location1[0]], color = 'red', weight = 10, radius=3).add_to(map)
        folium.CircleMarker([location2[1], location2[0]], color = 'red', weight = 10, radius=3).add_to(map)
        return map

    def get_set_stay_infor(self, stay_df):
        table = self.cluster_df[[]]
        
        cluster_id_list = self.cluster_df["cluster_id"]
        
        for idx, start in enumerate(tqdm(cluster_id_list)):
            for end in tqdm(cluster_id_list[:idx+1]):
                table[str(start)+"/"+str(end)] = self.get_stay_area_flag_list(start, end)

cm = ClusterManager(cluster_df)
# cm.get_stay_area_df(13, 10)
id1 = 15
id2 = 20
cm.get_cluster_map(dd)
# cm.get_stay_area_flag_list(1, 100)
# cm.get_stay_area_flag_table()
# cm.get_cluster_map()

In [73]:
cluster_df

Unnamed: 0,cluster_id,cluster_group,cluster_target,cluster_longitude,cluster_latitude,tour_geton_usage,regident_geton_usage,tour_getoff_usage,regident_getoff_usage,total_usage
710,0.0,제주시,0.0,126.493047,33.506241,28678.0,382778.0,27997.0,209550.0,649003.0
711,1.0,제주시,1.0,126.514680,33.499755,12349.0,472516.0,7793.0,250057.0,742715.0
712,2.0,제주시,2.0,126.524287,33.511325,3180.0,205014.0,4019.0,161129.0,373342.0
713,3.0,제주시,3.0,126.527887,33.513440,6112.0,176127.0,3558.0,94720.0,280517.0
714,4.0,제주시,4.0,126.532737,33.495390,446.0,133885.0,441.0,86516.0,221288.0
...,...,...,...,...,...,...,...,...,...,...
705,1650.0,서귀포시,705.0,126.607060,33.290130,0.0,2.0,0.0,7.0,9.0
706,1651.0,서귀포시,706.0,126.882770,33.442380,0.0,3.0,0.0,6.0,9.0
707,1652.0,서귀포시,707.0,126.845280,33.374930,0.0,2.0,0.0,6.0,8.0
708,1653.0,서귀포시,708.0,126.759700,33.357750,0.0,2.0,0.0,6.0,8.0
