In [3]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [4]:
def make_path(s_date, e_date):
    root_path = ""
    base_name = "station_data_"
    extender = ".csv"
    s_date = str(s_date)
    e_date = str(e_date)
    s_date = s_date[2:4]+s_date[5:7]+s_date[8:10]
    e_date = e_date[2:4]+e_date[5:7]+e_date[8:10]

    file_name = root_path+base_name+s_date+"_"+e_date+extender
    return file_name

#### 1-1 정류장 파일 읽어온 뒤, 'city(시)' 필드 추가
#### 1-2 제주시, 서귀포시 정류장 dataframe 생성

In [5]:
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

file_path = make_path(start_date, end_date)
df = pd.read_csv(file_path, encoding="CP949")

In [6]:
def make_group_column(df):
    df['cluster_group'] = df['station_address'].apply(lambda string: string.split(' ')[1])
    return df

df = make_group_column(df)

group_list = set(df["cluster_group"])
group_df_list = []
for label in group_list:
    group_df_list.append(df[df["cluster_group"] == label])

In [7]:
target_s = 'cluster_target'
level_s = 'cluster_level'
earth_radius = 6371.0088 # 단위: km

In [8]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['station_id', 'station_name'], f_cols = ['station_x', 'station_y']):
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:, nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [9]:
def get_noise_handled_result \
    (df, n, by1='station_id', by2='station_name', f_cols=['station_x', 'station_y']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df['cluster_target'] = -1
    temp_df['cluster_level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'cluster_target'] = i
            temp_df.loc[idx, 'cluster_level'] = n

    return temp_df

In [10]:
def get_noise_result(df, n):
    df.loc[:, 'cluster_level'] = n
    return df

In [11]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r4 = df_noise_group
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

#### 2-1 정류장 클러스터링

In [12]:
eps_list = [110, 95]
cluster_df_list = []

for i, group_df in enumerate(group_df_list):   
    eps = eps_list[i]
    # eps에 따른 dbscan 수행. => label OR noise 생성
    cluster_df = get_spatial_dbscan_result(eps, group_df)
    cluster_df['cluster_level'] = cluster_df['cluster_level'].astype(str)
    cluster_df['cluster_target'] = cluster_df['cluster_target'].astype(str)
    cluster_df['level-target'] = cluster_df['cluster_level'] + '&' + cluster_df['cluster_target']
    
    # noise 군집에 개별 레이블 부여
    cluster_df["cluster_target"] = cluster_df["cluster_target"].apply(lambda x : int(x))
    next_target = int(max(list(cluster_df["cluster_target"]))) + 1
    for j in cluster_df.index:
        if cluster_df.loc[j, "cluster_target"] == -1:
            cluster_df.loc[j, "cluster_target"] = next_target
            next_target += 1
    
    # 군집 위치 구하기
    cluster_df["cluster_x"] = 0
    cluster_df["cluster_y"] = 0
    target_list = set(cluster_df["cluster_target"])
    for target in target_list:
        target_df = cluster_df[cluster_df["cluster_target"] == target]
        cluster_x = target_df["station_x"].mean()
        cluster_y = target_df["station_y"].mean()
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_x"] = cluster_x
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_y"] = cluster_y
            
    # 필수 컬럼만 추출
    cluster_df = cluster_df[["station_id", "cluster_target", "cluster_level", "cluster_x", "cluster_y"]]
    cluster_df_list.append(cluster_df)

# 그룹별 데이터를 하나로 묶음
cluster_df = pd.concat(cluster_df_list)
df = pd.merge(df, cluster_df, on="station_id", sort = "right")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [14]:
df.sort_values(by="citizen_station_usage", ascending = False)

Unnamed: 0,station_id,station_name,station_y,station_x,citizen_user_usage,tourist_user_usage,total_user_usage,citizen_station_usage,tourist_station_usage,total_station_usage,station_address,cluster_group,cluster_target,cluster_level,cluster_x,cluster_y
2683,3271,제주시청(광양방면),33.49892,126.53035,239964,7480,247444,236477,7218,243695,제주특별자치도 제주시 이도이동 1938-1,제주시,505,2,126.530060,33.499240
2682,3270,제주시청(아라방면),33.49956,126.52977,192611,5901,198512,189613,5680,195293,제주특별자치도 제주시 이도이동 1938-1,제주시,505,2,126.530060,33.499240
1294,1564,제주시외버스터미널,33.49946,126.51479,165061,11955,177016,162692,11434,174126,제주특별자치도 제주시 오라일동 2455-16,제주시,6,1,126.468044,33.475799
179,201,제주대학교,33.46011,126.56166,154142,2093,156235,151976,2027,154003,제주특별자치도 제주시 아라이동 35-4,제주시,64,2,126.423548,33.752810
328,357,한라병원,33.48944,126.48508,136315,5587,141902,133894,5357,139251,제주특별자치도 제주시 연동 2335-4,제주시,73,3,126.553580,33.512023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,753,한원리,33.33400,126.20038,1,5,6,1,5,6,제주특별자치도 제주시 한경면 두모리 1360-4,제주시,225,2,126.200400,33.334050
1180,1429,멍중내,33.50070,126.68860,1,2,3,1,2,3,제주특별자치도 제주시 조천읍 선흘리 3138-2,제주시,18,3,126.575082,33.493768
2518,3063,이스트소프트,33.44722,126.57148,1,1,2,1,1,2,제주특별자치도 제주시 영평동 2187-1,제주시,478,2,126.571545,33.447390
1131,1369,저지리알못,33.33589,126.26369,1,1,2,1,1,2,제주특별자치도 제주시 한경면 저지리 3258,제주시,694,4,126.263690,33.335890


#### 2-1 군집 좌표 구하기

In [171]:
df.to_csv("clustered_station_data.csv", encoding="CP949", index = False)