In [12]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [13]:
def make_path(s_date, e_date):
    root_path = ""
    base_name = "tag_usage_data_"
    extender = ".csv"
    s_date = str(s_date)
    e_date = str(e_date)
    s_date = s_date[2:4]+s_date[5:7]+s_date[8:10]
    e_date = e_date[2:4]+e_date[5:7]+e_date[8:10]

    file_name = root_path+base_name+s_date+"_"+e_date+extender
    return file_name

#### 1-1 정류장 파일 읽어온 뒤, 'city(시)' 필드 추가
#### 1-2 제주시, 서귀포시 정류장 dataframe 생성

In [19]:
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)
file_path = make_path(start_date, end_date)
df = pd.read_csv(file_path, encoding="CP949")

df['CITY'] = df['STATION_ADDRESS'].apply(lambda string: string.split(' ')[1])
                    # (예) 'STATION_ADDR': 서귀포시 중문동 ... ☞ 'city': 서귀포시

df_jeju_station     = df.query('CITY == "제주시"')
df_seogwipo_station = df.query('CITY == "서귀포시"')

list_station_df_per_city = [df_jeju_station, df_seogwipo_station]
print(df_jeju_station)
print(df_seogwipo_station)

      STATION_ID STATION_NAME  STATION_Y  STATION_X  STATION_USAGE  \
0              1     국제여객선터미널   33.52438  126.54433           6125   
1              2        오광로입구   33.49527  126.45618          10723   
2              3        오광로입구   33.49546  126.45623           1228   
3              4         이호2동   33.49488  126.46137          10098   
4              5         이호2동   33.49508  126.46122           3120   
...          ...          ...        ...        ...            ...   
3615     6115046       여객선대합실   33.96351  126.29717             40   
3616     6115047        우도천진항   33.49334  126.95151           5106   
3617     6115048        우도봉입구   33.49558  126.95411             24   
3618     6115052          비양동   33.51123  126.96567           2213   
3619     6115059         하우목동   33.50731  126.95093            486   

                 STATION_ADDRESS CITY  
0         제주특별자치도 제주시 건입동 908-20  제주시  
1        제주특별자치도 제주시 이호이동 1587-4  제주시  
2        제주특별자치도 제주시 이호이동 1587-1  제주시  

In [20]:
target_s = 'target'
level_s = 'level'
earth_radius = 6371.0088 # 단위: km

In [21]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['STATION_ID', 'STATION_NAME'], f_cols = ['STATION_X', 'STATION_Y']):
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:, nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [22]:
def get_noise_handled_result \
    (df, n, by1='STATION_ID', by2='STATION_NAME', f_cols=['STATION_X', 'STATION_Y']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df.loc[:, 'target'] = -1
    temp_df.loc[:, 'level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'target'] = i
            temp_df.loc[idx, 'level'] = n

    return temp_df

In [23]:
def get_noise_result(df, n):
    df.loc[:, 'level'] = n
    return df

In [24]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r4 = df_noise_group
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

#### 2-1 정류장 클러스터링

In [25]:
eps_list = [110, 95]
list_df_combined = []

for i, station_df in enumerate(list_station_df_per_city):   
    eps = eps_list[i]
    # eps에 따른 dbscan 수행.
    df_combined = get_spatial_dbscan_result(eps, station_df)
    df_combined.loc[:, 'level'] = df_combined['level'].astype(str)
    df_combined.loc[:, 'target'] = df_combined['target'].astype(str)
    df_combined['level-target'] = df_combined['level'] + '&' + df_combined['target']

    # lv별 grouping -> count -> 저장.
    list_df_combined.append(df_combined)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [26]:
list_df_combined[0].to_csv('jeju_city_clustered.csv', index = False, encoding = 'ansi')

In [27]:
list_df_combined[1].to_csv('seogwipo_city_clustered.csv', index = False, encoding = 'ansi')