In [1]:
import os
# Do! 프로젝트 root 경로로 설정
project_path = "D:/workspace/Bus Project"
os.chdir(project_path)

In [2]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

import bus.analyzer as anz

### 1) grouping function

In [3]:
def make_group_column(df):
    df['cluster_group'] = df['station_address'].apply(lambda x : x.split(" ")[1])
    return df

### 2) clustering function

In [4]:
target_s = 'cluster_target'
level_s = 'cluster_level'
earth_radius = 6371.0088 # 단위: km

In [5]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['station_id', 'station_name'], f_cols = ['station_longitude', 'station_latitude']):

    # clustering
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:,nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    # level 부여
    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [6]:
def get_noise_handled_result \
    (df, n, by1='station_id', by2='station_name', f_cols=['station_longitude', 'station_latitude']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df['cluster_target'] = -1
    temp_df['cluster_level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'cluster_target'] = i
            temp_df.loc[idx, 'cluster_level'] = n

    return temp_df

In [7]:
def get_noise_result(df, n):
    df.loc[:, 'cluster_level'] = n
    return df

In [8]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r4 = df_noise_group
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

### main

In [45]:
station_df = anz.load_station_df()

station_df = station_df[station_df['station_id'] != 0] ####################

station_df = make_group_column(station_df)

group_list = list(station_df["cluster_group"].drop_duplicates())
group_df_list = []
for label in group_list:
    group_df_list.append(station_df[station_df["cluster_group"] == label])

    
eps_list = [105, 90]
# eps_list = [300, 200]
cluster_df_list = []

for i, group_df in enumerate(group_df_list):   
    eps = eps_list[i]
    # eps에 따른 dbscan 수행. => label OR noise 생성
    cluster_df = get_spatial_dbscan_result(eps, group_df)

    cluster_df['cluster_level'] = cluster_df['cluster_level'].astype(str)
    cluster_df['cluster_target'] = cluster_df['cluster_target'].astype(str)
    cluster_df['level_target'] = cluster_df['cluster_level'] + '&' + cluster_df['cluster_target']
    
    # noise 군집에 개별 레이블 부여
    cluster_df["cluster_target"] = cluster_df["cluster_target"].apply(lambda x : int(x))
    next_target = int(max(list(cluster_df["cluster_target"]))) + 1
    for j in cluster_df.index:
        if cluster_df.loc[j, "cluster_target"] == -1:
            cluster_df.loc[j, "cluster_target"] = next_target
            next_target += 1
    
    # 군집 위치 구하기
    cluster_df["cluster_longitude"] = 0
    cluster_df["cluster_latitude"] = 0
    target_list = set(cluster_df["cluster_target"])
    for target in target_list:
        target_df = cluster_df[cluster_df["cluster_target"] == target]
        cluster_longitude = target_df["station_longitude"].mean()
        cluster_latitude = target_df["station_latitude"].mean()
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_longitude"] = cluster_longitude
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_latitude"] = cluster_latitude

    cluster_df_list.append(cluster_df)


# 그룹별 데이터를 하나로 묶음
cluster_df = pd.concat(cluster_df_list)
cluster_df = pd.merge(df, cluster_df, on="station_id", sort = "right")
cluster_df = cluster_df[["station_id", "cluster_target", "cluster_level", "cluster_longitude", "cluster_latitude"]]

cluster_df.to_csv("data/analysis/cluster_df.csv", encoding="CP949", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [46]:
cluster_df

Unnamed: 0,station_id,cluster_target,cluster_level,cluster_longitude,cluster_latitude
0,1.0,36,1,126.585702,33.514012
1,2.0,25,1,126.467768,33.493629
2,3.0,25,1,126.467768,33.493629
3,4.0,96,2,126.461295,33.494980
4,5.0,96,2,126.461295,33.494980
...,...,...,...,...,...
3649,6115048.0,840,4,126.954110,33.495580
3650,6115052.0,794,4,126.965670,33.511230
3651,6115059.0,818,4,126.950930,33.507310
3652,6115100.0,1,1,126.471695,33.250647


In [48]:
merged_df = pd.merge(station_df, cluster_df, on='station_id')
a = merged_df[['cluster_group', 'cluster_target', 'cluster_longitude', 'cluster_latitude']].drop_duplicates().sort_values(['cluster_group', 'cluster_target'])
b = merged_df.groupby(by=["cluster_group", "cluster_target"]).sum().reset_index()[['cluster_group', 'cluster_target', 'tour_geton_usage', 'regident_geton_usage', 'tour_getoff_usage', 'regident_getoff_usage', 'total_usage']]
clustered_station_df = pd.merge(a, b, on=['cluster_group', 'cluster_target'])
clustered_station_df.to_csv("data/analysis/clustered_station_df.csv", encoding="CP949", index = False)

In [49]:
clustered_station_df

Unnamed: 0,cluster_group,cluster_target,cluster_longitude,cluster_latitude,tour_geton_usage,regident_geton_usage,tour_getoff_usage,regident_getoff_usage,total_usage
0,서귀포시,0,126.582644,33.263308,6183.0,392471.0,5662.0,281772.0,686088.0
1,서귀포시,1,126.471695,33.250647,3888.0,198089.0,3736.0,125259.0,330972.0
2,서귀포시,2,126.500589,33.323629,6316.0,89538.0,5561.0,58765.0,160180.0
3,서귀포시,3,126.552505,33.251292,2405.0,87230.0,2397.0,48322.0,140354.0
4,서귀포시,4,126.626550,33.282450,1161.0,76412.0,1378.0,56467.0,135418.0
...,...,...,...,...,...,...,...,...,...
1486,제주시,844,126.311810,33.946830,1.0,10.0,0.0,0.0,11.0
1487,제주시,845,126.300110,33.959660,1.0,10.0,0.0,0.0,11.0
1488,제주시,846,126.331340,33.953590,0.0,6.0,0.0,0.0,6.0
1489,제주시,847,126.256080,33.347920,0.0,1.0,0.0,5.0,6.0
