In [1]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm


import bus

# Column name holding the boarding-station name. The "stataion" spelling is
# presumably how the input data spells the column -- TODO confirm before "fixing" it.
geton_station_name = 'geton_stataion_name'

### 1. 정류장 추출

In [130]:
def analyze_usage(usage_df):
    """Total the ``user_count`` values per boarding station.

    Parameters
    ----------
    usage_df : pandas.DataFrame
        Must contain the columns ``geton_station_id`` and ``user_count``;
        any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``geton_station_id`` with a single column ``usage`` that
        holds the summed ``user_count`` of each station.
    """
    counts = usage_df[["geton_station_id", "user_count"]]
    totals = counts.groupby("geton_station_id").sum()
    totals = totals.rename_axis("geton_station_id")
    return totals.rename(columns={"user_count": "usage"})

def create_station_df(usage_df):
    """Collect one row per station from the boarding and alighting columns.

    The boarding (``geton_*``) and alighting (``getoff_*``) station columns of
    ``usage_df`` are stacked under common ``station_*`` names. Rows with any
    missing value are dropped, and when a ``station_id`` occurs with several
    distinct name/coordinate combinations only its first occurrence is kept.

    Two diagnostic counts are printed before incomplete rows are removed: the
    number of distinct station rows and the number of distinct station ids.

    Parameters
    ----------
    usage_df : pandas.DataFrame
        Must contain ``geton_station_id``, ``geton_stataion_name``,
        ``geton_station_longitude``, ``geton_station_latitude`` and the
        matching ``getoff_station_*`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_id``, ``station_name``, ``station_longitude`` and
        ``station_latitude``; ``station_id`` is unique and no value is missing.
        Rows are in first-occurrence order (boarding stations first).
    """
    station_columns = ['station_id', 'station_name', 'station_longitude', 'station_latitude']
    # 'geton_stataion_name' (sic) is the column name this function has always
    # selected; presumably it matches the input data's spelling.
    geton_station_columns = ['geton_station_id', 'geton_stataion_name', 'geton_station_longitude', 'geton_station_latitude']
    getoff_station_columns = ['getoff_station_id', 'getoff_station_name', 'getoff_station_longitude', 'getoff_station_latitude']

    geton_station_df = (
        usage_df[geton_station_columns]
        .drop_duplicates()
        .rename(columns=dict(zip(geton_station_columns, station_columns)))
    )
    getoff_station_df = (
        usage_df[getoff_station_columns]
        .drop_duplicates()
        .rename(columns=dict(zip(getoff_station_columns, station_columns)))
    )

    station_df = pd.concat([geton_station_df, getoff_station_df]).drop_duplicates()
    # Diagnostics: distinct station rows vs. distinct station ids.
    print(len(station_df))
    print(len(station_df['station_id'].drop_duplicates()))

    # Fresh 0..n-1 row labels; the two source frames may share labels.
    station_df = station_df.reset_index(drop=True)
    station_df = station_df.dropna()

    # Exception handling: one id may carry several longitude/latitude values.
    # Keep only the first row per id. This does not depend on the column names
    # produced by value_counts(), which changed in pandas 2.0.
    station_df = station_df.drop_duplicates(subset='station_id', keep='first')
    return station_df

3662
3656


4103.0    1
3553.0    1
1415.0    1
2831.0    1
3226.0    1
         ..
1014.0    1
899.0     1
361.0     1
2062.0    1
26.0      1
Name: station_id, Length: 3654, dtype: int64

In [7]:

    
# def create_station_df(usage_df, user_df):
#     # extract necessary columns
#     station_columns = ["geton_station_id", geton_station_name, "geton_station_latitude", "geton_station_longitude"]
#     used_station = usage_df[station_columns].drop_duplicates()
#     station_id_df = used_station[["geton_station_id"]]
    
    
#     # set tourist column
#     usage_df = pd.merge(usage_df, user_df, on = "user_id")
#     usage_df = usage_df[["geton_station_id", "user_count", "tourist"]]

    
#     # counting users among citizens and tourists
#     citizen_user_count = pd.DataFrame(analyze_usage(usage_df[usage_df["tourist"] == 0]))
#     tourist_user_count = pd.DataFrame(analyze_usage(usage_df[usage_df["tourist"] == 1]))
    
    
#     # counting tags among citizens and tourists
#     usage_df["user_count"] = 1
#     citizen_tag_count = pd.DataFrame(analyze_usage(usage_df[usage_df["tourist"] == 0]))
#     tourist_tag_count = pd.DataFrame(analyze_usage(usage_df[usage_df["tourist"] == 1]))

    
#     # create df_list
#     df_list = [citizen_user_count, tourist_user_count, citizen_tag_count, tourist_tag_count]
    
    
#     # renaming columns(include index)
#     usage_column_list = ["citizen_user_count", "tourist_user_count", "citizen_tag_count", "tourist_tag_count"]
#     for i in range(len(df_list)):
#         df_list[i] = df_list[i].reset_index().rename(columns = {"index":"geton_station_id", "usage": usage_column_list[i]})
    
    
#     # processing NaN data
#     for i in range(len(df_list)):
#         df_list[i] = pd.merge(station_id_df, df_list[i], on = "geton_station_id", how="left")
#         df_list[i] = df_list[i].fillna(0)
    
    
#     # merging each result (count - user, tag) df
#     for i in range(len(df_list)):
#         used_station= pd.merge(used_station, df_list[i], on="geton_station_id", how="left")
    
    
#     # set total count columns
#     used_station["total_user_count"] = used_station["citizen_user_count"] + used_station["tourist_user_count"]
#     used_station["total_tag_count"] = used_station["citizen_tag_count"] + used_station["tourist_tag_count"]
    
    
#     # processing missing values
# #     print(used_station.count())
#     used_station = used_station.dropna(axis = 0);
    
    
#     # renaming columns
#     used_station = used_station[(used_station["geton_station_longitude"] != "nan") & (used_station["geton_station_latitude"] != "nan")]
#     used_station = used_station.sort_values(by="geton_station_id", ascending = False)
#     used_station = used_station.rename(columns = {"geton_station_id" :'station_id',
#                              geton_station_name: "station_name", 
#                              "geton_station_latitude":"station_y",
#                              "geton_station_longitude":"station_x",})
    
#     return used_station



### 2. 주소 추가

In [68]:
import xml.etree.ElementTree as elemTree
import requests

def set_address_column(df):
    """Return a copy of ``df`` with a ``station_address`` column filled in.

    For every row label of the frame, ``get_address`` is called with that
    row's ``station_longitude`` and ``station_latitude`` (both converted to
    float) and the result is stored in ``station_address``. The input frame
    itself is left untouched. Progress is reported through ``tqdm``.
    """
    stations = df.copy()
    stations["station_address"] = ""
    for label in tqdm(stations.index):
        longitude = float(stations.loc[label, "station_longitude"])
        latitude = float(stations.loc[label, "station_latitude"])
        stations.loc[label, "station_address"] = get_address(longitude, latitude)
    return stations

def _step_toward(value, target):
    """Truncate ``value`` to 5 decimal places and move it 0.00002 toward ``target``."""
    scaled = int(value * 100000)
    # Decrease when the value lies above the target, otherwise increase.
    scaled += -2 if target < value else 2
    # Plain arithmetic keeps leading zeros of the fractional part and carries
    # correctly across an integer boundary.
    return scaled / 100000


# NOTE(review): a service key is hardcoded as the default of ``key``. It is kept
# so existing callers keep working, but it should come from configuration or the
# environment instead of living in the notebook.
def get_address(loc_x, loc_y, min_x = 126.531891, min_y = 33.399409, key = "E20F6493-C13D-3F6F-AC90-D5BB2F239901", max_retries = 1000):
    """Reverse-geocode a coordinate through the vworld address API.

    The coordinate is rounded to 7 decimals and sent to ``api.vworld.kr``. The
    text of the first ``item`` under ``result`` in the XML response is returned
    (presumably the lot-number address -- TODO confirm). When the response has
    no such element, both coordinates are moved 0.00002 toward
    ``(min_x, min_y)`` and the lookup is repeated with the same ``key``.

    Parameters
    ----------
    loc_x, loc_y : float or str
        Longitude and latitude, passed to the API as ``point=x,y`` (EPSG:4326).
    min_x, min_y : float
        Reference point the search moves toward on every retry. The defaults
        are presumably a point on Jeju island -- TODO confirm.
    key : str
        API key sent with every request, including retries.
    max_retries : int
        Maximum number of nudged lookups after the initial one.

    Returns
    -------
    str or None
        Text of the first address item found (``None`` if that element is empty).

    Raises
    ------
    RuntimeError
        If no address is found within ``max_retries`` retries.
    """
    url_form = "http://api.vworld.kr/req/address?service=address&request=getAddress&version=2.0&crs=epsg:4326&point={},{}&format=xml&type=both&zipcode=true&simple=false&key={}"
    for _ in range(max_retries + 1):
        loc_x = round(float(loc_x), 7)
        loc_y = round(float(loc_y), 7)
        response = requests.get(url_form.format(loc_x, loc_y, key))
        tree = elemTree.fromstring(response.text)
        try:
            # item[1] is the first <item>; item[2] would be the second entry.
            return tree.find("result").find("item[1]").find("text").text
        except AttributeError:
            # find() returned None somewhere: no address at this point.
            pass
        loc_x = _step_toward(loc_x, min_x)
        loc_y = _step_toward(loc_y, min_y)
    raise RuntimeError(
        "no address found near ({}, {}) after {} retries".format(loc_x, loc_y, max_retries)
    )

### main

In [135]:
# Date range (start, end) of the usage data to process.
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

# Load usage data. The `bus` helpers are project-local; presumably they build
# the input file paths for the period and load them into one DataFrame -- TODO confirm.
input_path_list = bus.make_input_path(start_date, end_date)
usage_df = bus.load_total_usage_data(input_path_list)

# Extract the unique stations from the usage records.
# The printed label is Korean for "number of extracted stations".
station_df = create_station_df(usage_df)
print("추출된 정류장 개수: ",len(station_df))

# Add the `station_address` column.
station_df = set_address_column(station_df)

# Sort by station id.
station_df = station_df.sort_values(by="station_id")

# Store the result as CSV, CP949-encoded and without the index column.
station_df.to_csv("station_list.csv", encoding="CP949", index=False)

  0%|                                                                                 | 2/3654 [00:00<04:04, 14.97it/s]

3662
3656
추출된 정류장 개수:  3654


100%|██████████████████████████████████████████████████████████████████████████████| 3654/3654 [04:35<00:00, 13.28it/s]


### 결과

In [133]:
# Show the final station table as the cell's output.
station_df

Unnamed: 0,station_id,station_name,station_longitude,station_latitude,station_address
529,1.0,국제여객선터미널,126.54433,33.52438,제주특별자치도 제주시 건입동 908-20
108,2.0,오광로입구,126.45618,33.49527,제주특별자치도 제주시 이호이동 1587-4
1803,3.0,오광로입구,126.45623,33.49546,제주특별자치도 제주시 이호이동 1587-1
949,4.0,이호2동,126.46137,33.49488,제주특별자치도 제주시 이호이동 803-6
1582,5.0,이호2동,126.46122,33.49508,제주특별자치도 제주시 이호이동 803-5
...,...,...,...,...,...
3415,6115048.0,우도봉입구,126.95411,33.49558,제주특별자치도 제주시 우도면 연평리 1666-2
1539,6115052.0,비양동,126.96567,33.51123,제주특별자치도 제주시 우도면 연평리 133-1
2265,6115059.0,하우목동,126.95093,33.50731,제주특별자치도 제주시 우도면 연평리 2164-2
1496,6115100.0,서귀포시외버스터미널,126.50799,33.24873,제주특별자치도 서귀포시 법환동 843
