In [1]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm

In [2]:
# Column-name constants.
# Each variable holds the header string used to address a column of the
# bus-usage data; the functions in this notebook index DataFrames with them.
user_id = 'user_id'
base_date = 'base_date'
route_id = 'route_id'
route_name = 'route_name'
route_no = 'route_no'
geton_datetime = 'geton_datetime'
geton_station_id = 'geton_station_id'
# NOTE(review): the value is spelled 'stataion' — presumably this mirrors the
# header in the CSV files as-is; confirm against the data before "correcting" it.
geton_station_name = 'geton_stataion_name'
geton_station_longitude = 'geton_station_longitude'
geton_station_latitude = 'geton_station_latitude'
getoff_datetime = 'getoff_datetime'
getoff_station_id = 'getoff_station_id'
getoff_station_name = 'getoff_station_name'
getoff_station_longitude = 'getoff_station_longitude'
getoff_station_latitude = 'getoff_station_latitude'
user_type = 'user_type'
user_count = 'user_count'
input_date = 'input_date'
usage = 'usage' # not a source-file column; names the derived value column of the result DataFrames

In [3]:
def make_path(date, root_path="C:/tb_bus_user_usage",
              base_name="tb_bus_user_usage_", extender=".csv"):
    """Build the path of the daily usage CSV for ``date``.

    The file name is ``<base_name><yymmdd><extender>`` inside ``root_path``.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or str
        Anything whose ``str()`` starts with ``YYYY-MM-DD``.
    root_path : str, optional
        Directory that holds the daily files. Defaults to the location the
        notebook was originally written for.
    base_name : str, optional
        File-name prefix placed before the date.
    extender : str, optional
        File extension, including the leading dot.

    Returns
    -------
    str
        ``root_path + "/" + base_name + yymmdd + extender``.
    """
    # Slice the ISO text instead of calling strftime so that plain strings
    # such as "2019-06-01" are accepted as well as date objects.
    date = str(date)
    y = date[2:4]   # two-digit year
    m = date[5:7]
    d = date[8:10]

    return root_path + "/" + base_name + y + m + d + extender

In [4]:
def analyze_how_many_is_station_used(file_path):
    """Count how many records each boarding station has in one CSV file.

    Parameters
    ----------
    file_path : str
        Path of a UTF-8 CSV containing the ``geton_station_name`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by boarding-station name (index named ``geton_station_name``)
        with a single ``usage`` column holding the number of rows per station.
    """
    df = pd.read_csv(file_path, encoding='utf-8')
    # Name the count column explicitly. The Series returned by value_counts()
    # is named after the source column in older pandas but "count" from
    # pandas 2.0 on, so renaming by the source column name silently leaves a
    # "count" column (and no ``usage`` column) on newer versions.
    df = df[geton_station_name].value_counts().to_frame(name=usage)
    df.index.name = geton_station_name
    return df

def analyze_how_many_did_user_use_station(file_path):
    """Sum the ``user_count`` column per boarding station for one CSV file.

    Parameters
    ----------
    file_path : str
        Path of a UTF-8 CSV containing the ``geton_station_name`` and
        ``user_count`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by boarding-station name with a single ``usage`` column
        holding the summed ``user_count`` of that station.
    """
    records = pd.read_csv(file_path, encoding='utf-8')
    totals = (
        records.loc[:, [geton_station_name, user_count]]
        .groupby(geton_station_name)
        .sum()
        .rename(columns={user_count: usage})
    )
    totals.index.name = geton_station_name
    return totals

In [5]:
def combine_result(df_a, df_b):
    """Add two per-station result frames, treating absent stations as 0.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Frames indexed by station name with numeric value columns.

    Returns
    -------
    pandas.DataFrame
        Element-wise sum over the union of both indexes. A station present
        in only one frame contributes 0 from the other one.
    """
    # Extend both frames to the same label set first. Filling with 0 during
    # the reindex (rather than adding NaN-aligned frames) keeps integer
    # columns integer, and avoids building padding frames from unordered
    # sets and concatenating possibly-empty frames.
    all_labels = df_a.index.union(df_b.index)
    padded_a = df_a.reindex(all_labels, fill_value=0)
    padded_b = df_b.reindex(all_labels, fill_value=0)
    return padded_a.add(padded_b)

In [6]:
def analyze_all(start_date, end_date, scale = "station"):# scale = ["station" | "user"]
    """Accumulate per-station usage over every day from start_date to end_date.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        First and last day to load (both inclusive). One CSV per day is
        located through ``make_path``.
    scale : {"station", "user"}
        "station" counts records per boarding station;
        "user" sums ``user_count`` per boarding station.

    Returns
    -------
    pandas.DataFrame
        Sum of the daily results, combined with ``combine_result``.

    Raises
    ------
    ValueError
        If ``scale`` is neither "station" nor "user".
    """
    # Pick the per-file analysis. Reject unknown values up front; otherwise
    # analyze_func would be unbound and fail later with an opaque error.
    if scale == "station":
        analyze_func = analyze_how_many_is_station_used
    elif scale == "user":
        analyze_func = analyze_how_many_did_user_use_station
    else:
        raise ValueError('scale must be "station" or "user", got ' + repr(scale))

    # The first day seeds the running total.
    result = analyze_func(make_path(start_date))

    # Remaining days: offsets 1 .. (end_date - start_date).days from start_date.
    for day in tqdm(range((end_date-start_date).days)):
        date = start_date + datetime.timedelta(days = day+1)
        print(date)
        result = combine_result(result, analyze_func(make_path(date)))
    return result

In [7]:
# Analysis window; both end points are included, hence the +1 on the day count.
start_date = datetime.datetime(year=2019, month=6, day=1)
end_date = datetime.datetime(year=2019, month=6, day=3)
period = (end_date - start_date).days + 1

# Totals over the whole window, once per record and once per user_count.
station_usage = analyze_all(start_date, end_date, scale="station")
station_user_usage = analyze_all(start_date, end_date, scale="user")

# Turn the totals into per-day averages, truncated to whole numbers.
for frame in (station_usage, station_user_usage):
    frame[usage] = frame[usage].apply(lambda total: int(total / period))

# Show both tables ordered from least to most used station.
for frame in (station_usage, station_user_usage):
    print(frame.sort_values(by=usage))

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

2019-06-02 00:00:00


 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  2.78it/s]

2019-06-03 00:00:00


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.24it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

2019-06-02 00:00:00


 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  2.83it/s]

2019-06-03 00:00:00


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.27it/s]

             usage
위미정수장            0
굴집터              0
이화농장             0
이스트힐스            0
이스트소프트           0
...            ...
동문로터리(동문시장)   2184
제주시청(아라방면)    2430
제주대학교         2670
제주시청(광양방면)    2853
한라병원          2859

[1840 rows x 1 columns]
             usage
새소망요양병원          0
선린지              0
선인동사거리           0
선흘1리운동장          0
관광단지입구           0
...            ...
동문로터리(동문시장)   2240
제주시청(아라방면)    2480
제주대학교         2710
제주시청(광양방면)    2911
한라병원          2913

[1840 rows x 1 columns]





In [9]:
# Count how many stations fall into each order-of-magnitude bucket of daily usage.
scale_value = [0, 10, 100, 1000, 10000, 100000, 1000000]
scale_column = usage
result = []
# Pair every edge with its successor -> half-open buckets [low, high), so the
# number of buckets always follows the edge list instead of a hard-coded 6.
for low, high in zip(scale_value[:-1], scale_value[1:]):
    condition = str(low)+" <= "+scale_column+" and " + scale_column +" < "+str(high)
    # len() gives the number of matching rows directly. int() on the Series
    # returned by DataFrame.count() is deprecated in recent pandas and fails
    # outright if the frame has more than one column.
    result.append(len(station_usage.query(condition)))
result

[804, 766, 251, 19, 0, 0]