In [2]:
import os
# Do! 프로젝트 root 경로로 설정
project_path = "D:/Users\workspace/2.jeju-bus-stations-clustering_MH"
os.chdir(project_path)

In [11]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

import bus.analyzer as anz

### 1) grouping function

In [12]:
def make_group_column(df):
    df['cluster_group'] = df['station_address'].apply(lambda string: string.split(' ')[1])
    return df

### 2) clustering function

In [13]:
target_s = 'cluster_target'
level_s = 'cluster_level'
earth_radius = 6371.0088 # 단위: km

In [22]:
def get_n_level_spatial_dbscan_result \
    (df, n, eps, min_pts = 3, nonf_cols = ['station_id', 'station_name'], f_cols = ['station_longitude', 'station_latitude']):

    # clustering
    global earth_radius 
    dbscan = DBSCAN(eps = eps/1000/earth_radius, algorithm='ball_tree', 
                    metric='haversine', min_samples=min_pts)
    
    temp_df = df.loc[:,nonf_cols + f_cols]
    temp_df.loc[:, target_s] = dbscan.fit_predict(np.radians(temp_df[f_cols]))

    # level 부여
    success_index = temp_df.query(target_s + ' > -1').index
    temp_df.loc[success_index, level_s] = str(n)
    
    failed_index = set(temp_df.index) - set(success_index)
    temp_df.loc[failed_index, level_s] = str(-1)

    return temp_df

In [27]:
def get_noise_handled_result \
    (df, n, by1='station_id', by2='station_name', f_cols=['station_longitude', 'station_latitude']):
    temp_df = df.loc[:, [by1, by2] + f_cols]

    temp_df['cluster_target'] = -1
    temp_df['cluster_level'] = -1

    grouped_df = temp_df.groupby(by = by2, as_index = False)
    grouped_df_count = grouped_df.count()
    station_nm_list = list(grouped_df_count[grouped_df_count[by1] >= 2][by2])
    target_range = np.arange(len(station_nm_list))

    for i in target_range:
        list_idx = temp_df.query('%s == "%s"' % (by2, station_nm_list[i])).index
        for idx in list_idx:
            temp_df.loc[idx, 'cluster_target'] = i
            temp_df.loc[idx, 'cluster_level'] = n

    return temp_df

In [28]:
def get_noise_result(df, n):
    df.loc[:, 'cluster_level'] = n
    return df

In [29]:
def get_spatial_dbscan_result(eps, station_df):
    df_lv1_group = get_n_level_spatial_dbscan_result(station_df, 1, eps, min_pts = 3)
    
    df_lv2_group = df_lv1_group.query(target_s + ' == -1')
    df_lv2_group = get_n_level_spatial_dbscan_result(df_lv2_group, 2, eps, min_pts = 2)
    
    df_noise_handled_group = df_lv2_group.query(target_s + ' == -1')
    df_noise_handled_group = get_noise_handled_result(df_noise_handled_group, 3)
    
    df_noise_group = df_noise_handled_group.query(target_s + ' == -1')
    df_noise_group = get_noise_result(df_noise_group, 4)
    
    r1 = df_lv1_group.query(target_s + ' > -1')
    r2 = df_lv2_group.query(target_s + ' > -1')
    r3 = df_noise_handled_group.query(target_s + ' > -1')
    r4 = df_noise_group
    
    combined = pd.concat([r1, r2, r3, r4])
    return combined

### main

In [41]:
df = anz.load_station_df()

df = make_group_column(df)

group_list = set(df["cluster_group"])
group_df_list = []
for label in group_list:
    group_df_list.append(df[df["cluster_group"] == label])

    
eps_list = [105, 90]
# eps_list = [300, 200]
cluster_df_list = []

    
for i, group_df in enumerate(group_df_list):   
    eps = eps_list[i]
    # eps에 따른 dbscan 수행. => label OR noise 생성
    cluster_df = get_spatial_dbscan_result(eps, group_df)
    cluster_df['cluster_level'] = cluster_df['cluster_level'].astype(str)
    cluster_df['cluster_target'] = cluster_df['cluster_target'].astype(str)
    cluster_df['level_target'] = cluster_df['cluster_level'] + '&' + cluster_df['cluster_target']
    
    # noise 군집에 개별 레이블 부여
    cluster_df["cluster_target"] = cluster_df["cluster_target"].apply(lambda x : int(x))
    next_target = int(max(list(cluster_df["cluster_target"]))) + 1
    for j in cluster_df.index:
        if cluster_df.loc[j, "cluster_target"] == -1:
            cluster_df.loc[j, "cluster_target"] = next_target
            next_target += 1
    
    # 군집 위치 구하기
    cluster_df["cluster_longitude"] = 0
    cluster_df["cluster_latitude"] = 0
    target_list = set(cluster_df["cluster_target"])
    for target in target_list:
        target_df = cluster_df[cluster_df["cluster_target"] == target]
        cluster_longitude = target_df["station_longitude"].mean()
        cluster_latitude = target_df["station_latitude"].mean()
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_longitude"] = cluster_longitude
        cluster_df.loc[cluster_df["cluster_target"] == target, "cluster_latitude"] = cluster_latitude

    cluster_df_list.append(cluster_df)


# 그룹별 데이터를 하나로 묶음
cluster_df = pd.concat(cluster_df_list)
cluster_df = pd.merge(df, cluster_df, on="station_id", sort = "right")
cluster_df = cluster_df[["station_id","cluster_group", "cluster_target", "cluster_level", "cluster_longitude", "cluster_latitude"]]

cluster_df
# cluster_df.to_csv("cluster_list.csv", encoding="CP949", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,station_id,cluster_group,cluster_target,cluster_level,cluster_longitude,cluster_latitude
0,1.0,제주시,28,1,126.547504,33.492026
1,2.0,제주시,7,1,126.547328,33.511889
2,3.0,제주시,7,1,126.547328,33.511889
3,4.0,제주시,177,2,126.461295,33.494980
4,5.0,제주시,177,2,126.461295,33.494980
...,...,...,...,...,...,...
3649,6115048.0,제주시,829,4,126.954110,33.495580
3650,6115052.0,제주시,773,4,126.965670,33.511230
3651,6115059.0,제주시,782,4,126.950930,33.507310
3652,6115100.0,서귀포시,3,1,126.571814,33.279697
