In [1]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm


import bus

# Name of the boarding-station-name column; create_station_df uses it to select and rename that column.
# NOTE(review): "stataion" looks misspelled, but it presumably mirrors the raw data's column header -- confirm before correcting.
geton_station_name = 'geton_stataion_name'

### 1. 정류장 추출

In [2]:
def analyze_usage(df):
    """Sum ``user_count`` for every boarding station.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``geton_station_id`` and ``user_count``;
        any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``geton_station_id`` with a single column ``usage`` holding
        the per-station sum of ``user_count``.
    """
    totals = (
        df[["geton_station_id", "user_count"]]
        .groupby("geton_station_id")
        .sum()
        .rename(columns={"user_count": "usage"})
    )
    totals.index.name = "geton_station_id"
    return totals

def create_station_df(usage_df, user_df):
    """Build a station table with citizen/tourist boarding statistics.

    Parameters
    ----------
    usage_df : pandas.DataFrame
        Boarding records. Must contain ``user_id``, ``geton_station_id``, the
        column named by the module-level ``geton_station_name``,
        ``geton_station_latitude``, ``geton_station_longitude`` and
        ``user_count``.
    user_df : pandas.DataFrame
        Must contain ``user_id`` and ``tourist``; rows with ``tourist == 1``
        are counted as tourists and rows with ``tourist == 0`` as citizens.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (id, name, latitude, longitude) combination found
        in ``usage_df``, sorted by ``station_id`` in descending order, with the
        columns ``station_id``, ``station_name``, ``station_y``, ``station_x``,
        ``citizen_user_count``, ``tourist_user_count``, ``citizen_tag_count``,
        ``tourist_tag_count``, ``total_user_count`` and ``total_tag_count``.
        ``*_user_count`` is the sum of ``user_count``; ``*_tag_count`` is the
        number of records. Rows with any missing value are dropped.

    Notes
    -----
    Usage records are inner-joined with ``user_df`` on ``user_id``, so records
    of users absent from ``user_df`` do not contribute to any count.
    """
    # distinct station descriptions that appear in the usage records
    station_columns = ["geton_station_id", geton_station_name, "geton_station_latitude", "geton_station_longitude"]
    used_station = usage_df[station_columns].drop_duplicates()

    # Unique ids only. A station recorded under several names/coordinates would
    # otherwise appear several times here, and every merge below would multiply
    # its rows (many-to-many join on geton_station_id).
    station_id_df = used_station[["geton_station_id"]].drop_duplicates()

    # attach the tourist flag to every boarding record
    tagged_df = pd.merge(usage_df, user_df, on="user_id")
    tagged_df = tagged_df[["geton_station_id", "user_count", "tourist"]]
    citizen_df = tagged_df[tagged_df["tourist"] == 0]
    tourist_df = tagged_df[tagged_df["tourist"] == 1]

    # user counts sum the recorded user_count; tag counts weigh every record as 1.
    # assign() builds a new frame instead of writing into a slice of tagged_df.
    count_frames = [
        ("citizen_user_count", analyze_usage(citizen_df)),
        ("tourist_user_count", analyze_usage(tourist_df)),
        ("citizen_tag_count", analyze_usage(citizen_df.assign(user_count=1))),
        ("tourist_tag_count", analyze_usage(tourist_df.assign(user_count=1))),
    ]

    for column_name, counts in count_frames:
        counts = counts.reset_index().rename(columns={"usage": column_name})
        # stations without any record in this group get a count of 0
        counts = pd.merge(station_id_df, counts, on="geton_station_id", how="left").fillna(0)
        used_station = pd.merge(used_station, counts, on="geton_station_id", how="left")

    # set total count columns
    used_station["total_user_count"] = used_station["citizen_user_count"] + used_station["tourist_user_count"]
    used_station["total_tag_count"] = used_station["citizen_tag_count"] + used_station["tourist_tag_count"]

    # drop stations with missing name / coordinates
    used_station = used_station.dropna(axis=0)

    # Also drop coordinates stored as the literal text "nan". Comparing on the
    # string form keeps the same rows as before while avoiding a str-vs-number
    # comparison when the coordinate columns are numeric.
    has_longitude = used_station["geton_station_longitude"].astype(str) != "nan"
    has_latitude = used_station["geton_station_latitude"].astype(str) != "nan"
    used_station = used_station[has_longitude & has_latitude]

    # sort and rename to the output schema
    used_station = used_station.sort_values(by="geton_station_id", ascending=False)
    used_station = used_station.rename(columns={
        "geton_station_id": "station_id",
        geton_station_name: "station_name",
        "geton_station_latitude": "station_y",
        "geton_station_longitude": "station_x",
    })

    return used_station

### 2. 주소 추가

In [3]:
import xml.etree.ElementTree as elemTree
import requests

def set_address_column(df):
    """Return a copy of ``df`` with a ``station_address`` column filled in.

    For every row, ``station_x`` and ``station_y`` are converted to ``float``
    and passed to ``get_address``; the returned value is stored in
    ``station_address``. The input frame is left untouched. Progress is shown
    through ``tqdm``.
    """
    with_address = df.copy()
    with_address["station_address"] = ""
    for row_label in tqdm(with_address.index):
        station_x = float(with_address.loc[row_label, "station_x"])
        station_y = float(with_address.loc[row_label, "station_y"])
        with_address.loc[row_label, "station_address"] = get_address(station_x, station_y)
    return with_address

def get_address(loc_x, loc_y, min_x = 126.531891, min_y = 33.399409, key = "E20F6493-C13D-3F6F-AC90-D5BB2F239901"):
    """Look up the address of a coordinate through the vworld address API.

    The text of the first ``item`` under ``result`` in the XML response is
    returned. When the response contains no such item, the point is moved
    0.00002 degrees toward ``(min_x, min_y)`` on both axes and the request is
    repeated with the same ``key``.

    Parameters
    ----------
    loc_x, loc_y : float or str
        Coordinate sent as ``point=loc_x,loc_y`` (the request uses
        ``crs=epsg:4326``). Values are converted with ``float`` and rounded to
        7 decimals.
    min_x, min_y : float
        Anchor point that retries drift toward.
    key : str
        API key sent with every request.

    Returns
    -------
    str
        The address text, or ``""`` if no address was found within the retry
        budget (the former recursive implementation ended in ``RecursionError``
        in that situation).

    Raises
    ------
    Whatever ``requests.get`` or ``ElementTree.fromstring`` raise (network
    failures, malformed XML); those are not retried.
    """
    # NOTE(review): the default ``key`` is a credential committed with the
    # notebook. It is kept so existing callers keep working; prefer passing a
    # key loaded from the environment.
    url_form = "http://api.vworld.kr/req/address?service=address&request=getAddress&version=2.0&crs=epsg:4326&point={},{}&format=xml&type=both&zipcode=true&simple=false&key={}"
    step = 0.00002        # size of one nudge, in degrees
    max_attempts = 1000   # about as deep as the former recursive retry could go

    loc_x = round(float(loc_x), 7)
    loc_y = round(float(loc_y), 7)

    for _ in range(max_attempts):
        response = requests.get(url_form.format(loc_x, loc_y, key))
        tree = elemTree.fromstring(response.text)
        try:
            # The first item is used; the original author's commented-out code
            # read item[2] as the road address.
            return tree.find("result").find("item[1]").find("text").text
        except AttributeError:
            # One of the find() calls returned None: no address at this point.
            # Nudge numerically; the previous string arithmetic dropped leading
            # zeros of the fraction (126.00500 -> 126.502).
            loc_x = round(loc_x - step if min_x < loc_x else loc_x + step, 5)
            loc_y = round(loc_y - step if min_y < loc_y else loc_y + step, 5)

    return ""

### main

In [4]:
# period to analyse (start, end)
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)

# load the usage records for that period and clean up missing data
input_path_list = bus.make_input_path(start_date, end_date)
usage_df = bus.preprocessing_missing_data_from_usage_df(
    bus.load_total_usage_data(input_path_list)
)

# load user data
user_df = bus.create_user_df()

# build the station table and report how many stations were extracted
station_df = create_station_df(usage_df, user_df)
print("추출된 정류장 개수: ", len(station_df))

# look up an address for every station, then order by station id
station_df = set_address_column(station_df).sort_values(by="station_id")

# store the result
station_df.to_csv("station_list.csv", encoding="CP949", index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [04:21<00:00,  2.93s/it]
  res_values = method(rvalues)
  0%|                                                                                 | 1/3952 [00:00<07:56,  8.29it/s]

추출된 정류장 개수:  3952


100%|██████████████████████████████████████████████████████████████████████████████| 3952/3952 [05:20<00:00, 12.33it/s]


### 결과

In [5]:
# Display the final station table (bare last expression -> rich DataFrame output).
station_df

Unnamed: 0,station_id,station_name,station_y,station_x,citizen_user_count,tourist_user_count,citizen_tag_count,tourist_tag_count,total_user_count,total_tag_count,station_address
529,1,국제여객선터미널,33.52438,126.54433,5375.0,750.0,5098.0,689.0,6125.0,5787.0,제주특별자치도 제주시 건입동 908-20
108,2,오광로입구,33.49527,126.45618,8646.0,2077.0,8354.0,1961.0,10723.0,10315.0,제주특별자치도 제주시 이호이동 1587-4
1922,3,오광로입구,33.49546,126.45623,955.0,273.0,926.0,258.0,1228.0,1184.0,제주특별자치도 제주시 이호이동 1587-1
1026,4,이호2동,33.49488,126.46137,9913.0,185.0,9845.0,170.0,10098.0,10015.0,제주특별자치도 제주시 이호이동 803-6
1701,5,이호2동,33.49508,126.46122,3012.0,108.0,2984.0,104.0,3120.0,3088.0,제주특별자치도 제주시 이호이동 803-5
...,...,...,...,...,...,...,...,...,...,...,...
3534,6115048,우도봉입구,33.49558,126.95411,23.0,1.0,22.0,1.0,24.0,23.0,제주특별자치도 제주시 우도면 연평리 1666-2
1658,6115052,비양동,33.51123,126.96567,1885.0,328.0,1660.0,300.0,2213.0,1960.0,제주특별자치도 제주시 우도면 연평리 133-1
2384,6115059,하우목동,33.50731,126.95093,338.0,148.0,275.0,133.0,486.0,408.0,제주특별자치도 제주시 우도면 연평리 2164-2
1615,6115100,서귀포시외버스터미널,33.24873,126.50799,3496.0,207.0,3459.0,198.0,3703.0,3657.0,제주특별자치도 서귀포시 법환동 843
