In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

#### 1-1 정류장 파일 읽어온 뒤, 'city(시)' 필드 추가
#### 1-2 제주시, 서귀포시 정류장 dataframe 생성

In [2]:
result_path = r'C:\Users\think\Desktop\버스 정류장 클러스터링'.replace('\\', '/')
path = r'D:\jeju_bus_data_no_leakage\station'.replace('\\', '/')
df = pd.read_csv(path + '/station_final.csv', encoding = 'ansi')
df['city'] = df['STATION_ADDR']
df.loc[:, 'city'] = df.loc[:, 'STATION_ADDR'].apply(lambda string: string.split(' ')[0])
                    # (예) 'STATION_ADDR': 서귀포시 중문동 ... ☞ 'city': 서귀포시

df_jeju_station     = df.query('city == "제주시"')
df_seogwipo_station = df.query('city == "서귀포시"')

list_station_df_per_city = [df_jeju_station, df_seogwipo_station]

In [3]:
target_s = 'target'
level_s = 'level'
earth_radius = 6371.0088 # 단위: km

In [4]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['STATION_ID', 'STATION_NM'], f_cols = ['LOCAL_X', 'LOCAL_Y']):
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:, nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [5]:
def get_noise_handled_result \
    (df, n, by1='STATION_ID', by2='STATION_NM', f_cols=['LOCAL_X', 'LOCAL_Y']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df.loc[:, 'target'] = -1
    temp_df.loc[:, 'level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'target'] = i
            temp_df.loc[idx, 'level'] = n

    return temp_df

In [6]:
def get_noise_result(df, n):
    df.loc[:, 'level'] = n
    return df

In [7]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r4 = df_noise_group
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

### 2-1 정류장 클러스터링

In [9]:
#list_station_cnt_per_group_per_level_per_eps_per_city = []
eps_list = [110, 95]
list_df_combined = []

for i, station_df in enumerate(list_station_df_per_city):
    #list_station_cnt_per_group_per_level_per_eps_per_city.append(pd.DataFrame(columns=[1., 2., 3., 4.]))
    
    eps = eps_list[i]
    # eps에 따른 dbscan 수행.
    df_combined = get_spatial_dbscan_result(eps, station_df)
    df_combined.loc[:, 'level'] = df_combined['level'].astype(str)
    df_combined.loc[:, 'target'] = df_combined['target'].astype(str)
    df_combined['level-target'] = df_combined['level'] + '&' + df_combined['target']

    # lv별 grouping -> count -> 저장.
    list_df_combined.append(df_combined)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
list_df_combined[0].to_csv('jeju_city_clustered.csv', index = False, encoding = 'ansi')

In [11]:
list_df_combined[1].to_csv('seogwipo_city_clustered.csv', index = False, encoding = 'ansi')