In [1]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm

In [2]:
# Column-name constants.
# Each variable holds the header text of one column, so the rest of the
# notebook can refer to columns by identifier instead of repeating literals.
user_id = 'user_id'
base_date = 'base_date'
route_id = 'route_id'
route_name = 'route_name'
route_no = 'route_no'
geton_datetime = 'geton_datetime'
geton_station_id = 'geton_station_id'
# NOTE: "stataion" is misspelled, but the DataFrame printed from the source CSV
# later in this notebook shows the same spelling in its header, so the literal
# has to stay as it is.
geton_station_name = 'geton_stataion_name'
geton_station_longitude = 'geton_station_longitude'
geton_station_latitude = 'geton_station_latitude'
getoff_datetime = 'getoff_datetime'
getoff_station_id = 'getoff_station_id'
getoff_station_name = 'getoff_station_name'
getoff_station_longitude = 'getoff_station_longitude'
getoff_station_latitude = 'getoff_station_latitude'
user_type = 'user_type'
user_count = 'user_count'
input_date = 'input_date'
usage = 'usage' # not a source column; names the aggregated value column of the result DataFrames
key = 'key' # not a source column; names the composite "name@longitude@latitude" station key

In [3]:
def make_path(date, root_path="C:/tb_bus_user_usage"):
    """Build the path of the daily usage CSV for ``date``.

    Parameters
    ----------
    date : object
        Anything whose ``str()`` starts with ``YYYY-MM-DD`` (for example
        ``datetime.date``, ``datetime.datetime`` or an ISO date string).
    root_path : str, optional
        Directory that holds the daily files. Defaults to the original
        hard-coded location, so existing callers are unaffected; pass another
        directory to read the data from elsewhere.

    Returns
    -------
    str
        ``<root_path>/tb_bus_user_usage_<YYMMDD>.csv``
    """
    base_name = "tb_bus_user_usage_"
    extender = ".csv"

    # Slice the ISO text: chars 2-3 = two-digit year, 5-6 = month, 8-9 = day.
    date = str(date)
    yymmdd = date[2:4] + date[5:7] + date[8:10]

    return root_path + "/" + base_name + yymmdd + extender

In [4]:
def _read_with_station_key(file_path):
    """Load one daily usage CSV and add the composite boarding-station key.

    The boarding-station name, longitude and latitude columns are converted to
    strings (so a missing value becomes the text "nan") and joined with "@"
    into the ``key`` column: "<name>@<longitude>@<latitude>".
    """
    df = pd.read_csv(file_path, encoding='utf-8')

    for column in (geton_station_name, geton_station_longitude, geton_station_latitude):
        df[column] = df[column].apply(str)

    df[key] = (
        df[geton_station_name]
        + "@" + df[geton_station_longitude]
        + "@" + df[geton_station_latitude]
    )
    return df


def analyze_how_many_is_station_used(file_path):
    """Count how many rows of the file refer to each boarding station.

    Parameters
    ----------
    file_path : str
        Path of a daily usage CSV.

    Returns
    -------
    pandas.DataFrame
        Indexed by the station key (index name ``key``), with a single
        ``usage`` column holding the number of rows for that station.
    """
    df = _read_with_station_key(file_path)

    # Name the counts explicitly instead of renaming a column afterwards:
    # value_counts() labels its result with the source column name on
    # pandas < 2.0 but with "count" on pandas >= 2.0, so a
    # rename(columns={key: usage}) silently does nothing on newer versions.
    counts = df[key].value_counts().rename(usage).to_frame()
    counts.index.name = key
    return counts


def analyze_how_many_did_user_use_station(file_path):
    """Sum the ``user_count`` column of the file per boarding station.

    Parameters
    ----------
    file_path : str
        Path of a daily usage CSV.

    Returns
    -------
    pandas.DataFrame
        Indexed by the station key (index name ``key``), with a single
        ``usage`` column holding the summed ``user_count`` for that station.
    """
    df = _read_with_station_key(file_path)

    totals = df[[key, user_count]].groupby(key).sum()
    totals.index.name = key
    return totals.rename(columns={user_count: usage})

In [5]:
def combine_result(df_a, df_b):
    """Add two per-station usage frames over the union of their indexes.

    Each frame is first extended with zero-valued ``usage`` rows for the keys
    that occur only in the other frame, then the two are added element-wise.
    """
    keys_a = set(df_a.index)
    keys_b = set(df_b.index)

    # Padding rows: keys the other frame has and this one lacks.
    padding_for_a = pd.DataFrame(index = keys_b - keys_a)
    padding_for_b = pd.DataFrame(index = keys_a - keys_b)
    for padding in (padding_for_a, padding_for_b):
        padding[usage] = 0

    padded_a = pd.concat([df_a, padding_for_a])
    padded_b = pd.concat([df_b, padding_for_b])
    return padded_a.add(padded_b)

In [54]:
def analyze_all(start_date, end_date, scale="station"):  # scale = ["station" | "user"]
    """Aggregate boarding-station usage over every day from start_date to end_date.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        First and last day to include (both inclusive). One file per day is
        located with ``make_path``.
    scale : {"station", "user"}
        "station" counts rows per station
        (``analyze_how_many_is_station_used``); "user" sums ``user_count`` per
        station (``analyze_how_many_did_user_use_station``).

    Returns
    -------
    pandas.DataFrame
        Columns: station name, longitude, latitude (all as strings) and
        ``usage``, sorted by ``usage`` in descending order. Rows whose
        longitude or latitude is the text "nan" are removed.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the supported values.
    """
    analyzers = {
        "station": analyze_how_many_is_station_used,
        "user": analyze_how_many_did_user_use_station,
    }
    # Fail early with a clear message; an unknown scale used to leave the
    # analyzer unbound and surface later as an UnboundLocalError.
    if scale not in analyzers:
        raise ValueError(
            "scale must be one of {}, got {!r}".format(sorted(analyzers), scale)
        )
    analyze_func = analyzers[scale]

    result = analyze_func(make_path(start_date))

    # Fold in the remaining days: start_date + 1 day ... end_date.
    for day in tqdm(range((end_date - start_date).days)):
        date = start_date + datetime.timedelta(days=day + 1)
        result = combine_result(result, analyze_func(make_path(date)))

    # The combined frame may come back with an unnamed index, so name it
    # before turning it into the key column.
    result = result.rename_axis(key).reset_index()

    # Split the key from the right: longitude and latitude are always the last
    # two "@"-separated fields, so a station name that itself contains "@"
    # stays intact.
    columns = [geton_station_name, geton_station_longitude, geton_station_latitude]
    pieces = result[key].apply(str).str.rsplit("@", n=2)
    for position, column in enumerate(columns):
        result[column] = pieces.str[position]

    result = result[columns + [usage]]
    result = result.dropna(axis=0)
    # Coordinates were stringified upstream, so missing ones appear as "nan".
    has_coordinates = (
        (result[geton_station_longitude] != "nan")
        & (result[geton_station_latitude] != "nan")
    )
    result = result[has_coordinates]
    return result.sort_values(by=usage, ascending=False)

In [55]:
def make_new_path(start_date, end_date):
    """Return the output CSV name for the analysed period.

    The name has the form ``station_total_usage_<YYMMDD>_<YYMMDD>.csv``, where
    each date part is sliced out of ``str()`` of the corresponding argument.
    """
    def _yymmdd(value):
        # chars 2-3 = two-digit year, 5-6 = month, 8-9 = day of the ISO text
        text = str(value)
        return text[2:4] + text[5:7] + text[8:10]

    return "station_total_usage_{}_{}.csv".format(_yymmdd(start_date), _yymmdd(end_date))
    
# Analysis window; both endpoints are included.
start_date = datetime.datetime(year=2019, month=6, day=1)
end_date = datetime.datetime(year=2019, month=8, day=29)
period = (end_date - start_date).days + 1  # number of days in the window

# Aggregate per-station usage by summed user_count and switch to the export
# column names in one step.
# Per-day averaging is currently disabled:
# usage_df[usage] = usage_df[usage].apply(lambda x : int(x/period))
usage_df = analyze_all(start_date, end_date, scale="user").rename(
    columns={
        geton_station_name: "STATION_NM",
        geton_station_latitude: "LOCAL_Y",
        geton_station_longitude: "LOCAL_X",
        usage: "USAGE",
    }
)
usage_df.to_csv(make_new_path(start_date, end_date), index=False, encoding="CP949")

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [01:01<00:00,  1.45it/s]


In [41]:
# Spot-check one day's raw file: show the boarding-station name and coordinate
# columns, first as loaded and then with rows containing missing values dropped.
a = make_path(datetime.datetime(year=2019, month=6, day=1))
df = pd.read_csv(a, encoding='utf-8')[
    [geton_station_name, geton_station_latitude, geton_station_longitude]
]
print(df)
print(df.dropna(axis="index"))

       geton_stataion_name  geton_station_latitude  geton_station_longitude
0                   납읍리사무소                33.43944                126.32944
1                   납읍리사무소                33.43944                126.32944
2             한림환승정류장(한림리)                33.41453                126.26427
3                      학원동                33.47493                126.39766
4                     연대마을                33.49282                126.42858
...                    ...                     ...                      ...
130975                 봉개동                33.49154                126.59275
130976                 봉개동                33.49154                126.59275
130977                 큰동네                33.48923                126.59697
130978        제주교통방송아라아이파크                33.47783                126.54931
130979        제주교통방송아라아이파크                33.47783                126.54931

[130980 rows x 3 columns]
       geton_stataion_name  geton_station_latitude  geton_sta