In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

#### 1-1 정류장 파일 읽어온 뒤, 'city(시)' 필드 추가
#### 1-2 제주시, 서귀포시 정류장 dataframe 생성

In [2]:
result_path = r'C:\Users\think\Desktop\버스 정류장 클러스터링'.replace('\\', '/')
path = r'D:\jeju_bus_data_no_leakage\station'.replace('\\', '/')
df = pd.read_csv(path + '/station_final.csv', encoding = 'ansi')
df['city'] = df['STATION_ADDR']
df.loc[:, 'city'] = df.loc[:, 'STATION_ADDR'].apply(lambda string: string.split(' ')[0])
                    # (예) 'STATION_ADDR': 서귀포시 중문동 ... ☞ 'city': 서귀포시

df_jeju_station     = df.query('city == "제주시"')
df_seogwipo_station = df.query('city == "서귀포시"')

list_station_df_per_city = [df_jeju_station, df_seogwipo_station]

In [3]:
def get_n_level_spatial_dbscan_result \
    (n, eps, df, df_cols=['LOCAL_X','LOCAL_Y'], algorithm='ball_tree', metric='haversine', min_pts=3, eps_measure='m'):
    earth_radius = 6371.0088 # 단위: km
    div = 1000 if eps_measure == 'm' else 1
    dbscan = DBSCAN(eps=eps/div/earth_radius, min_samples=min_pts, algorithm=algorithm, metric=metric)

    temp_df = df.loc[:, df_cols]
    temp_df.loc[:, 'target'] = dbscan.fit_predict(np.radians(temp_df))
    
    success_index = temp_df.query('target > -1').index ; temp_df.loc[success_index, 'level'] = n 
    failed_index = set(temp_df.index) - set(success_index) ; temp_df.loc[failed_index, 'level'] = -1
    
    return temp_df

In [4]:
def get_noise_handled_result(df, df_cols=['LOCAL_X','LOCAL_Y'], n=3, column1='STATION_NM', column2='STATION_ID'):
    df.loc[:, 'target'] = -1 ; df.loc[:, 'level'] = -1
    
    grouped = df.groupby(by = column1, as_index = False)
    list_station_nm = list(grouped.count()[grouped.count()[column2] >= 2][column1])
    target_range = np.arange(len(list_station_nm))
    
    for i in target_range:
        list_idx = df.query('%s == "%s"' % (column1, list_station_nm[i])).index
        for idx in list_idx: df.loc[idx, 'target'] = i ; df.loc[idx, 'level'] = n
    
    return df[df_cols + ['target', 'level']]

In [5]:
def get_noise_result(df, n=4):
    df.loc[:, 'level'] = n ; return df

In [6]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(1, eps, station_df)
    df_lv2_group = get_n_level_spatial_dbscan_result(2, eps, df_lv1_group.query('target == -1'), min_pts=2)

    df_noise_handled_group = get_noise_handled_result(station_df.loc[df_lv2_group.query('target == -1').index])
    df_noise_group = get_noise_result(df_noise_handled_group.query('target == -1'))

    r1 = df_lv1_group.query('target > -1')
    r2 = df_lv2_group.query('target > -1')
    r3 = df_noise_handled_group.query('target > -1')
    r4 = df_noise_group

    combined = pd.concat([r1, r2, r3, r4])
    return combined

In [12]:
#list_station_cnt_per_group_per_level_per_eps_per_city = []
eps_list = [110, 95]
list_df_combined = []

for i, station_df in enumerate(list_station_df_per_city):
    #list_station_cnt_per_group_per_level_per_eps_per_city.append(pd.DataFrame(columns=[1., 2., 3., 4.]))
    
    eps = eps_list[i]
    # eps에 따른 dbscan 수행.
    df_combined = get_spatial_dbscan_result(eps, station_df)
    df_combined['_tmp'] = '-'
    df_combined.loc[:, 'level'] = df_combined['level'].astype(str)
    df_combined.loc[:, 'target'] = df_combined['target'].astype(str)
    
    df_combined['level-target'] = df_combined['level'] + df_combined['_tmp'] + df_combined['target']
    
    del df_combined['_tmp']

    # lv별 grouping -> count -> 저장.
    list_df_combined.append(df_combined)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [18]:
list_df_combined[0]['level-target'].unique()[1:10]

array(['1.0-0', '1.0-1', '1.0-101', '1.0-2', '1.0-3', '1.0-4', '1.0-5',
       '1.0-6', '1.0-7'], dtype=object)

In [22]:
for level_target in list_df_combined[0]['level-target'].unique()[1:10]:
    print(level_target)
    print(list_df_combined[0][ list_df_combined[0]['level-target'] == level_target ])

1.0-0
         LOCAL_X    LOCAL_Y target level level-target
5     126.525833  33.500017      0   1.0        1.0-0
6     126.523759  33.499789      0   1.0        1.0-0
1006  126.524706  33.500095      0   1.0        1.0-0
2678  126.525625  33.500228      0   1.0        1.0-0
3030  126.525892  33.500031      0   1.0        1.0-0
1.0-1
         LOCAL_X    LOCAL_Y target level level-target
17    126.675754  33.534836      1   1.0        1.0-1
18    126.675601  33.534663      1   1.0        1.0-1
1147  126.675961  33.534260      1   1.0        1.0-1
2583  126.675933  33.533639      1   1.0        1.0-1
1.0-101
         LOCAL_X    LOCAL_Y target level level-target
21    126.550757  33.456718    101   1.0      1.0-101
1394  126.551750  33.456267    101   1.0      1.0-101
2063  126.551050  33.456733    101   1.0      1.0-101
1.0-2
         LOCAL_X    LOCAL_Y target level level-target
30    126.455461  33.496234      2   1.0        1.0-2
87    126.456183  33.495267      2   1.0        1.0-2
88

# 

In [9]:
fig_title = '제주시 버스 정류장 군집화 (2019)'

i = 0
instance = list_df_combined[i].query('level != "4.0"')

plt.rcParams['font.size'] = 12
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.figure(figsize=(20, 12))
plt.grid(True, which='both')

list_level_target = instance['level-target'].unique()
for level_target in list_level_target:
    _query = instance[ instance['level-target'] == level_target ]
    #_query = instance.query('level-target == "%s"' % level_target)
    # => 불안정: 많은 도서/온라인에서 실무에서의 .query() 사용을 자제하라는 이유를 알 수 있는 사례였음.
    _x, _y = _query['LOCAL_X'], _query['LOCAL_Y']
    plt.scatter(x = _x, y = _y, marker = 'o', 
                alpha = '0.2', edgecolors = 'white')

#plt.legend()
plt.xlabel('경도')
plt.ylabel('위도')
plt.title(fig_title)

plt.gcf().savefig(result_path + '/' + fig_title)
plt.show()

AttributeError: 'NoneType' object has no attribute 'query'

In [None]:
fig_title = '서귀포시 버스 정류장 군집화 (2019)'

i = 1
instance = list_df_combined[i].query('level != "4.0"')

plt.rcParams['font.size'] = 12
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.figure(figsize=(20, 12))
plt.grid(True, which='both')

list_level_target = instance['level-target'].unique()
for level_target in list_level_target:
    _query = instance[ instance['level-target'] == level_target ]
    #_query = instance.query('level-target == "%s"' % level_target)
    # => 불안정: 많은 도서/온라인에서 실무에서의 .query() 사용을 자제하라는 이유를 알 수 있는 사례였음.
    _x, _y = _query['LOCAL_X'], _query['LOCAL_Y']
    plt.scatter(x = _x, y = _y, marker = 'o', 
                alpha = '0.2', edgecolors = 'white')

#plt.legend()
plt.xlabel('경도')
plt.ylabel('위도')
plt.title(fig_title)

plt.gcf().savefig(result_path + '/' + fig_title)
plt.show()