In [14]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [15]:
def make_path(s_date, e_date):
    """Build the station-data CSV file name for a date range.

    Each argument is converted with ``str()`` and is expected to start with
    a ``YYYY-MM-DD`` style date (for example a ``datetime.datetime``). The
    date is compacted to ``YYMMDD`` by fixed character positions.

    Parameters
    ----------
    s_date : start of the period.
    e_date : end of the period.

    Returns
    -------
    str
        ``station_data_<start>_<end>.csv`` (no directory prefix).
    """
    def _compact(date):
        # Positions 2-3 = two-digit year, 5-6 = month, 8-9 = day.
        text = str(date)
        return text[2:4] + text[5:7] + text[8:10]

    root_path = ""
    base_name = "station_data_"
    extender = ".csv"

    return "{}{}{}_{}{}".format(
        root_path, base_name, _compact(s_date), _compact(e_date), extender
    )

#### 1-1 정류장 파일 읽어온 뒤, 'cluster_group'(시) 필드 추가
#### 1-2 제주시, 서귀포시 정류장 dataframe 생성

In [16]:
# Period covered by the station export; it determines the CSV file name.
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)
file_path = make_path(start_date, end_date)
df = pd.read_csv(file_path, encoding="CP949")

# The clustering group is the second space-separated token of the address.
# Original note's example: address '서귀포시 중문동 ...' -> group '서귀포시'.
# NOTE(review): index 1 is taken, so addresses presumably start with one
# extra leading token (e.g. a province name) - confirm against the data.
df['cluster_group'] = df['station_address'].apply(lambda string: string.split(' ')[1])

# Group labels ordered as value_counts() returns them (most frequent first),
# followed by one sub-frame per label in that same order.
group_label_list = df["cluster_group"].value_counts().index.tolist()
grouped_df_list = [df[df["cluster_group"] == label] for label in group_label_list]

In [17]:
# Column that holds the cluster id assigned to a station (-1 = not clustered).
target_s = 'cluster_target'
# Column that holds the pipeline stage (level) that assigned the cluster id.
level_s = 'cluster_level'
# Earth radius used to convert a distance into radians for the haversine metric.
earth_radius = 6371.0088 # unit: km

In [18]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['station_id', 'station_name'], f_cols = ['station_x', 'station_y']):
    """Run one haversine DBSCAN pass over the stations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``nonf_cols`` and ``f_cols``.
    n : label of this clustering level; stored as ``str(n)``.
    eps : neighbourhood radius. It is divided by 1000 and then by
        ``earth_radius`` (km), i.e. it is treated as metres.
    min_pts : ``min_samples`` passed to DBSCAN.
    nonf_cols : identifying columns carried through to the result.
    f_cols : the two coordinate columns fed to DBSCAN (degrees).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``nonf_cols + f_cols``
        plus the ``target_s`` column (DBSCAN label, -1 for noise) and the
        ``level_s`` column (``str(n)`` for clustered rows, ``'-1'`` for noise).
        An empty input yields an empty frame with the same columns.
    """
    feature_cols = list(f_cols)
    # Explicit copy: the columns added below must never write through to `df`
    # and must not raise SettingWithCopyWarning.
    temp_df = df.loc[:, list(nonf_cols) + feature_cols].copy()

    if temp_df.empty:
        # DBSCAN rejects zero samples; return the expected schema instead.
        temp_df[target_s] = np.array([], dtype=int)
        temp_df[level_s] = np.array([], dtype=object)
        return temp_df

    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree',
                    metric='haversine', min_samples=min_pts)

    # NOTE(review): scikit-learn's haversine metric expects (latitude,
    # longitude) in radians. If station_x is the longitude, the default
    # f_cols order is (lon, lat) - confirm and swap at the call site if so.
    temp_df[target_s] = dbscan.fit_predict(np.radians(temp_df[feature_cols]))

    # Boolean mask instead of index-label sets: indexing .loc with a set is
    # rejected by current pandas and is ambiguous with duplicate labels.
    clustered = temp_df[target_s] > -1
    temp_df[level_s] = str(-1)
    temp_df.loc[clustered, level_s] = str(n)

    return temp_df

In [19]:
def get_noise_handled_result \
    (df, n, by1='station_id', by2='station_name', f_cols=['station_x', 'station_y']):
    """Group leftover stations that share the same name.

    Every value of ``by2`` that occurs on at least two rows (counting rows
    with a non-null ``by1``) becomes one cluster. Cluster ids are the
    positions of those names in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``by1``, ``by2`` and the ``f_cols`` columns.
    n : level value written to ``cluster_level`` for grouped rows.
    by1 : column counted to decide whether a name is shared.
    by2 : column whose equal values define a cluster.
    f_cols : coordinate columns carried through to the result.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``[by1, by2] + f_cols``
        plus ``cluster_target`` (cluster id, -1 when the name is not shared)
        and ``cluster_level`` (``n`` for grouped rows, -1 otherwise).
    """
    # Explicit copy so the new columns are never written into a view of `df`.
    temp_df = df.loc[:, [by1, by2] + list(f_cols)].copy()

    # groupby sorts its keys, so the resulting index is the sorted list of
    # names; keep only names that appear on two or more rows.
    name_counts = temp_df.groupby(by2)[by1].count()
    shared_names = name_counts[name_counts >= 2].index

    # Position of each row's name among the shared names, -1 when absent.
    # This replaces a per-name query() built by string formatting, which
    # broke on names containing a double quote.
    temp_df['cluster_target'] = shared_names.get_indexer(temp_df[by2])

    temp_df['cluster_level'] = -1
    temp_df.loc[temp_df['cluster_target'] > -1, 'cluster_level'] = n

    return temp_df

In [20]:
def get_noise_result(df, n):
    """Return a copy of ``df`` with every row's ``cluster_level`` set to ``n``.

    The input frame is left untouched. The original wrote into it in place,
    which triggers SettingWithCopyWarning when ``df`` is a filtered selection.

    Parameters
    ----------
    df : pandas.DataFrame of stations no earlier stage could cluster.
    n : level value to assign to all rows.

    Returns
    -------
    pandas.DataFrame
        Same rows and columns as ``df``, with ``cluster_level`` equal to ``n``
        (the column is created when missing).
    """
    labelled = df.copy()
    labelled['cluster_level'] = n
    return labelled

In [21]:
def get_spatial_dbscan_result(eps, station_df):
    """Cluster one group of stations in four successive stages.

    1. DBSCAN with ``min_pts = 3`` on all stations (level 1).
    2. DBSCAN with ``min_pts = 2`` on the stations level 1 left as noise (level 2).
    3. Name-based grouping of the stations level 2 left as noise (level 3).
    4. Everything still unassigned is tagged as level 4.

    Parameters
    ----------
    eps : DBSCAN radius forwarded to both DBSCAN stages.
    station_df : pandas.DataFrame of stations to cluster.

    Returns
    -------
    pandas.DataFrame
        The assigned rows of stages 1-3 followed by the level-4 leftovers,
        concatenated in that order.
    """
    is_noise = target_s + ' == -1'
    is_clustered = target_s + ' > -1'

    # Each stage only receives the rows the previous stage could not cluster.
    level1 = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    level2 = get_n_level_spatial_dbscan_result(level1.query(is_noise), 2, eps, min_pts = 2)
    name_matched = get_noise_handled_result(level2.query(is_noise), 3)
    leftover = get_noise_result(name_matched.query(is_noise), 4)

    stage_results = [
        level1.query(is_clustered),
        level2.query(is_clustered),
        name_matched.query(is_clustered),
        leftover,
    ]
    return pd.concat(stage_results)

#### 2-1 정류장 클러스터링

In [22]:
# DBSCAN radius for each entry of `grouped_df_list`, matched by position.
# The value is divided by 1000 and by the Earth radius in km downstream,
# i.e. it is treated as metres.
eps_list = [110, 95]
if len(eps_list) < len(grouped_df_list):
    # Fail with a clear message instead of an IndexError in the middle of the loop.
    raise ValueError(
        "eps_list has %d value(s) but there are %d station groups"
        % (len(eps_list), len(grouped_df_list))
    )

list_df_combined = []

for i, station_df in enumerate(grouped_df_list):
    eps = eps_list[i]
    # Run the multi-stage clustering for this group with its own eps.
    df_combined = get_spatial_dbscan_result(eps, station_df)

    # Replace the columns outright: writing strings through .loc[:, col] into
    # an integer column is an in-place set that newer pandas warns about or rejects.
    df_combined['cluster_level'] = df_combined['cluster_level'].astype(str)
    df_combined['cluster_target'] = df_combined['cluster_target'].astype(str)
    # Combined "<level>&<target>" key; kept on `df_combined` for inspection only.
    df_combined['level-target'] = df_combined['cluster_level'] + '&' + df_combined['cluster_target']

    # Only the station id and its cluster assignment are merged back later.
    list_df_combined.append(df_combined[["station_id", "cluster_target", "cluster_level"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [23]:
# Stack the per-group assignments and attach them to the station table.
cluster_df = pd.concat(list_df_combined)

# `sort` is a boolean flag; the string "right" that used to be passed here was
# only accepted because it is truthy. The merge that actually ran is an inner
# join with sorted keys, which is kept explicitly here.
# NOTE(review): if a right join was intended, pass how="right" instead.
# NOTE(review): re-running this cell merges onto the already merged `df`
# and yields suffixed duplicate columns; re-run the loading cell first.
df = pd.merge(df, cluster_df, on="station_id", sort=True)

In [25]:
# Write the clustered station table to the working directory (CP949, the same
# encoding the input was read with). The row index is written as the first column.
# NOTE(review): the output name has no ".csv" extension - confirm that is intended.
df.to_csv(path_or_buf="clustered_station_data", encoding="CP949")