In [1]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm

In [2]:
geton_station_id = 'geton_station_id'
geton_station_name = 'geton_stataion_name'
geton_station_longitude = 'geton_station_longitude'
geton_station_latitude = 'geton_station_latitude'
user_count = 'user_count'
tourist = "tourist"
usage = 'usage' # not excel column, for new DataFrame object

#### 1. 파일 경로 설정 def
* 해당 날짜에 대한 파일 경로 반환


In [3]:
def make_path(date):
    root_path = "C:/tb_bus_user_usage"
    base_name = "tb_bus_user_usage_"
    extender = ".csv"
    
    date = str(date)
    y = date[2:4]
    m = date[5:7]
    d = date[8:10]
    
    file_name = root_path+"/"+base_name+y+m+d+extender
    return file_name

#### 2. 태그 데이터 분석 - def

In [4]:
def analyze_usage(df):
    df = df[[geton_station_id, user_count]].groupby(geton_station_id).sum()
    df.index.name = geton_station_id
    df = df.rename(columns={user_count:usage})
    return df

def combine_result(df_a, df_b):
    df_a_ = pd.DataFrame(index = set(df_b.index)-set(df_a.index))
    df_b_ = pd.DataFrame(index = set(df_a.index)-set(df_b.index))

    df_a_[usage] = 0
    df_b_[usage] = 0
    
    df_a = pd.concat([df_a, df_a_])
    df_b = pd.concat([df_b, df_b_])
    
    return df_a.add(df_b)

def analyze_total_usage(start_date, end_date):# scale = ["station" | "user"]
    station_columns = [geton_station_id, geton_station_name, geton_station_latitude, geton_station_longitude]

    df = pd.read_csv(make_path(start_date), encoding='cp949')  
    
    used_station = df[station_columns]
    
    citizen_user_usage    = pd.DataFrame(analyze_usage(df[df["tourist"] == 0]))
    tourist_user_usage    = pd.DataFrame(analyze_usage(df[df["tourist"] == 1]))
    
    df["user_count"] = 1
    citizen_station_usage = pd.DataFrame(analyze_usage(df[df["tourist"] == 0]))
    tourist_station_usage = pd.DataFrame(analyze_usage(df[df["tourist"] == 1]))
    
    for day in tqdm(range((end_date-start_date).days)):
        date = start_date + datetime.timedelta(days = day+1)
        df = pd.read_csv(make_path(date), encoding='cp949')
        
        used_station = used_station.append(df[station_columns])
        used_station = used_station.drop_duplicates(station_columns)
        
        citizen_user_usage = combine_result(citizen_user_usage, analyze_usage(df[df["tourist"] == 0]))
        tourist_user_usage = combine_result(tourist_user_usage, analyze_usage(df[df["tourist"] == 1]))
        
        df["user_count"] = 1
        citizen_station_usage = combine_result(citizen_station_usage, analyze_usage(df[df["tourist"] == 0]))
        tourist_station_usage = combine_result(tourist_station_usage, analyze_usage(df[df["tourist"] == 1]))
    
    citizen_user_usage = citizen_user_usage.reset_index().rename(columns = {"index": geton_station_id, "usage": "citizen_user_usage"})
    tourist_user_usage = tourist_user_usage.reset_index().rename(columns = {"index": geton_station_id, "usage": "tourist_user_usage"})
    citizen_station_usage = citizen_station_usage.reset_index().rename(columns = {"index": geton_station_id, "usage": "citizen_station_usage"})
    tourist_station_usage = tourist_station_usage.reset_index().rename(columns = {"index": geton_station_id, "usage": "tourist_station_usage"})
    
    result = used_station
    result = pd.merge(result, citizen_user_usage, on=geton_station_id)
    result = pd.merge(result, tourist_user_usage, on=geton_station_id)
    result["total_user_usage"] = result["citizen_user_usage"] + result["tourist_user_usage"]
    
    result = pd.merge(result, citizen_station_usage, on=geton_station_id)
    result = pd.merge(result, tourist_station_usage, on=geton_station_id)
    result["total_station_usage"] = result["citizen_station_usage"] + result["tourist_station_usage"]
    
    result = result.dropna(axis = 0)
    result = result[(result[geton_station_longitude] != "nan") & (result[geton_station_latitude] != "nan")]
    result = result.sort_values(by=geton_station_id, ascending = False)
    return result

In [5]:
def make_new_path(start_date, end_date):
    s_d = str(start_date)
    e_d = str(end_date)

    s_y, s_m, s_d = s_d[2:4], s_d[5:7], s_d[8:10]
    e_y, e_m, e_d = e_d[2:4], e_d[5:7], e_d[8:10]
    
    return "station_data_"+s_y+s_m+s_d+"_"+e_y+e_m+e_d+".csv"

#### 3. 주소 컬럼추가 - def

In [6]:
import xml.etree.ElementTree as elemTree
import requests
def get_address(loc_x, loc_y, min_x = 126.531891, min_y = 33.399409, key = "E20F6493-C13D-3F6F-AC90-D5BB2F239901"):
    loc_x = round(float(loc_x), 7)
    loc_y = round(float(loc_y), 7)
    url_form = "http://api.vworld.kr/req/address?service=address&request=getAddress&version=2.0&crs=epsg:4326&point={},{}&format=xml&type=both&zipcode=true&simple=false&key={}"
    url = url_form.format(loc_x, loc_y, key)
    response = requests.get(url)
    tree = elemTree.fromstring(response.text)
    branch = ""
    try:
#     road = tree.find("result").find("item[2]").find("text").text
        branch = tree.find("result").find("item[1]").find("text").text
    except:
        print(loc_x, loc_y)
        x_left = str(int(loc_x))
        y_left = str(int(loc_y))
        
        
        if min_x < loc_x:
            x_right = str((int(loc_x*100000)-2)%100000)
        else:
            x_right = str((int(loc_x*100000)+2)%100000)
            
        if min_y < loc_y:
            y_right = str((int(loc_y*100000)-2)%100000)
        else:
            y_right = str((int(loc_y*100000)+2)%100000)
            
        loc_x = float(x_left+"."+x_right)
        loc_y = float(y_left+"."+y_right)
        return get_address(loc_x, loc_y)
            
    return branch

# loc_x = 126.30207
# loc_y = 33.2253
# print(get_address(loc_x, loc_y))

In [7]:
def set_address_column(df):
    df = pd.DataFrame.copy(df)
    df["station_address"] = ""
    for i in tqdm(df.index):
        df.loc[i, "station_address"] = get_address(float(df.loc[i, "station_x"]), float(df.loc[i, "station_y"]))
    return df
# usage_df = set_address_column(usage_df)
# usage_df

#### main

In [8]:
### setting clasification period(start, end)
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)


### accumulating
usage_df = analyze_total_usage(start_date, end_date)


### calculate mean of usage
# period = int((end_date-start_date).days)+1
# usage_df[usage] = usage_df[usage].apply(lambda x : int(x/period))


### new naming
usage_df = usage_df.rename(columns = {geton_station_id :'station_id',
                          geton_station_name: "station_name", 
                          geton_station_latitude:"station_y",
                          geton_station_longitude:"station_x",})

### setting address column
usage_df = set_address_column(usage_df)


### sorting
usage_df = usage_df.sort_values(by="station_id")


### storing
usage_df.to_csv(make_new_path(start_date, end_date), encoding="CP949", index=False)

  if (await self.run_code(code, result,  async_=asy)):
100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:55<00:00,  1.61it/s]
  res_values = method(rvalues)
  0%|▎                                                                               | 10/3105 [00:01<05:58,  8.64it/s]

126.29527 33.96109
126.29529 33.96107


  0%|▎                                                                               | 11/3105 [00:01<05:46,  8.92it/s]

126.30382 33.95284
126.30384 33.95282
126.30386 33.9528
126.30388 33.95278
126.3039 33.95275
126.30392 33.95273
126.30394 33.95271
126.30396 33.95269
126.30398 33.95266
126.304 33.95264
126.30402 33.95262
126.30404 33.9526
126.30406 33.95257
126.30408 33.95255
126.3041 33.95253


  0%|▎                                                                               | 13/3105 [00:02<13:57,  3.69it/s]

126.30412 33.95251
126.30414 33.95248


  1%|▉                                                                               | 35/3105 [00:04<04:18, 11.89it/s]

126.29452 33.96141
126.29454 33.96139


  2%|█▋                                                                              | 67/3105 [00:06<04:15, 11.88it/s]

126.29979 33.22527
126.29981 33.22529
126.29983 33.22531
126.29985 33.22533
126.29987 33.22535
126.29989 33.22537
126.29991 33.22539
126.29993 33.2254
126.29995 33.22542
126.29997 33.22544
126.29999 33.22546
126.30001 33.22548
126.30003 33.22549
126.30005 33.22551
126.30007 33.22553
126.30009 33.22555
126.30011 33.22557
126.30013 33.22558
126.30015 33.2256
126.30017 33.22562
126.30019 33.22564
126.30021 33.22566
126.30023 33.22568
126.30025 33.22569
126.30027 33.22571
126.30029 33.22573
126.30031 33.22575
126.30033 33.22577
126.30035 33.22578
126.30037 33.2258
126.30039 33.22582
126.30041 33.22584
126.30043 33.22586
126.30045 33.22587
126.30047 33.22589
126.30049 33.22591
126.30051 33.22593
126.30053 33.22595
126.30055 33.22596
126.30057 33.22598
126.30059 33.226
126.30061 33.22602
126.30063 33.22604
126.30065 33.22605
126.30067 33.22607
126.30069 33.22609
126.30071 33.22611
126.30073 33.22613
126.30075 33.22615
126.30077 33.22616
126.30079 33.22618
126.30081 33.2262
126.30083 33.22622

 11%|█████████                                                                      | 357/3105 [00:37<03:52, 11.80it/s]

126.61122 33.26048
126.6112 33.2605
126.61118 33.26052
126.61116 33.26054


 12%|█████████▏                                                                     | 359/3105 [00:37<06:16,  7.29it/s]

126.61114 33.26056


 32%|█████████████████████████▎                                                     | 997/3105 [01:26<02:33, 13.75it/s]

126.78144 33.30731
126.78142 33.30733
126.7814 33.30735


 35%|███████████████████████████▌                                                  | 1099/3105 [01:35<02:55, 11.46it/s]

126.23817 33.23178


 44%|██████████████████████████████████▏                                           | 1361/3105 [01:55<02:17, 12.72it/s]

126.61324 33.2642


 51%|███████████████████████████████████████▉                                      | 1589/3105 [02:12<02:07, 11.85it/s]

126.59505 33.25965


 52%|████████████████████████████████████████▎                                     | 1605/3105 [02:13<02:04, 12.05it/s]

126.59404 33.24436


 88%|████████████████████████████████████████████████████████████████████▍         | 2723/3105 [03:42<00:28, 13.34it/s]

126.49945 33.51938
126.49947 33.51936
126.49949 33.51934
126.49951 33.51932
126.49953 33.5193
126.49955 33.51928
126.49957 33.51926
126.49959 33.51924
126.49961 33.51922
126.49963 33.51919
126.49965 33.51917
126.49967 33.51915
126.49969 33.51913
126.49971 33.5191
126.49973 33.51908
126.49975 33.51906
126.49977 33.51904
126.49979 33.51901
126.49981 33.51899
126.49983 33.51897
126.49985 33.51895
126.49987 33.51892
126.49989 33.5189
126.49991 33.51888
126.49993 33.51886
126.49995 33.51883
126.49997 33.51881
126.49999 33.51879
126.50001 33.51877
126.50003 33.51875
126.50005 33.51872
126.50007 33.5187
126.50009 33.51868
126.50011 33.51866


100%|██████████████████████████████████████████████████████████████████████████████| 3105/3105 [04:12<00:00, 12.28it/s]
