In [62]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm

In [141]:
geton_station_id = 'geton_station_id'
geton_station_name = 'geton_stataion_name'
geton_station_longitude = 'geton_station_longitude'
geton_station_latitude = 'geton_station_latitude'
user_count = 'user_count'

usage = 'usage' # not excel column, for new DataFrame object

#### 1. 입력 기간 사이에 테그 데이터 누계

In [64]:
def make_path(date):
    root_path = "C:/tb_bus_user_usage"
    base_name = "tb_bus_user_usage_"
    extender = ".csv"
    
    date = str(date)
    y = date[2:4]
    m = date[5:7]
    d = date[8:10]
    
    file_name = root_path+"/"+base_name+y+m+d+extender
    return file_name

In [95]:
def analyze_how_many_is_station_used(df):
    df = pd.DataFrame(df[geton_station_id].value_counts())
    df.index.name = geton_station_id
    df = df.rename(columns={geton_station_id:usage})
    return df

def analyze_how_many_did_user_use_station(df):
    df = df[[geton_station_id, user_count]].groupby(geton_station_id).sum()
    df.index.name = geton_station_id
    df = df.rename(columns={user_count:usage})
    return df

In [66]:
def combine_result(df_a, df_b):
    df_a_ = pd.DataFrame(index = set(df_b.index)-set(df_a.index))
    df_b_ = pd.DataFrame(index = set(df_a.index)-set(df_b.index))

    df_a_[usage] = 0
    df_b_[usage] = 0
    
    df_a = pd.concat([df_a, df_a_])
    df_b = pd.concat([df_b, df_b_])
    
    return df_a.add(df_b)

In [150]:
######################
def analyze_total_usage(start_date, end_date, scale = "station"):# scale = ["station" | "user"]
    if(scale == "station"):# select analyze function and scale column
        scale = geton_station_name
        analyze_func = analyze_how_many_is_station_used
    elif(scale == "user"):
        scale = user_count
        analyze_func = analyze_how_many_did_user_use_station

    extraction_columns = [geton_station_id, geton_station_name, geton_station_latitude, geton_station_longitude]
        
    df = pd.read_csv(make_path(start_date), encoding='utf-8')
    used_station = df[extraction_columns]
    station_usage = pd.DataFrame(analyze_func(df))

    for day in tqdm(range((end_date-start_date).days)):
        date = start_date + datetime.timedelta(days = day+1)
        df = pd.read_csv(make_path(date), encoding='utf-8')
        station_usage = combine_result(station_usage, analyze_func(df))
        used_station = used_station.append(df[extraction_columns])
        used_station = used_station.drop_duplicates(used_station.columns)
    station_usage = station_usage.reset_index().rename(columns = {"index": geton_station_id})
    result = pd.merge(used_station, station_usage, on=geton_station_id)
    print(result)
    result = result.dropna(axis = 0)
    result = result[(result[geton_station_longitude] != "nan") & (result[geton_station_latitude] != "nan")]
    result = result.sort_values(by=usage, ascending = False)
    return result

In [68]:
def make_new_path(start_date, end_date):
    s_d = str(start_date)
    e_d = str(end_date)

    s_y, s_m, s_d = s_d[2:4], s_d[5:7], s_d[8:10]
    e_y, e_m, e_d = e_d[2:4], e_d[5:7], e_d[8:10]
    
    return "tag_usage_data_"+s_y+s_m+s_d+"_"+e_y+e_m+e_d+".csv"

#### 2. 주소 컬럼추가

In [69]:
import xml.etree.ElementTree as elemTree
import requests
def get_address(loc_x, loc_y, min_x = 126.531891, min_y = 33.399409, key = "E20F6493-C13D-3F6F-AC90-D5BB2F239901"):
    loc_x = round(float(loc_x), 7)
    loc_y = round(float(loc_y), 7)
    url_form = "http://api.vworld.kr/req/address?service=address&request=getAddress&version=2.0&crs=epsg:4326&point={},{}&format=xml&type=both&zipcode=true&simple=false&key={}"
    url = url_form.format(loc_x, loc_y, key)
    response = requests.get(url)
    tree = elemTree.fromstring(response.text)
    branch = ""
    try:
#     road = tree.find("result").find("item[2]").find("text").text
        branch = tree.find("result").find("item[1]").find("text").text
    except:
        print(loc_x, loc_y)
        x_left = str(int(loc_x))
        y_left = str(int(loc_y))
        
        
        if min_x < loc_x:
            x_right = str((int(loc_x*100000)-2)%100000)
        else:
            x_right = str((int(loc_x*100000)+2)%100000)
            
        if min_y < loc_y:
            y_right = str((int(loc_y*100000)-2)%100000)
        else:
            y_right = str((int(loc_y*100000)+2)%100000)
            
        loc_x = float(x_left+"."+x_right)
        loc_y = float(y_left+"."+y_right)
        return get_address(loc_x, loc_y)
            
    return branch

# loc_x = 126.30207
# loc_y = 33.2253
# print(get_address(loc_x, loc_y))

In [158]:
def set_address_column(df):
    df = pd.DataFrame.copy(df)
    df["STATION_ADDRESS"] = ""
    for i in tqdm(df.index):
        df.loc[i, "STATION_ADDRESS"] = get_address(float(df.loc[i, "STATION_X"]), float(df.loc[i, "STATION_Y"]))
    return df
# usage_df = set_address_column(usage_df)
# usage_df

#### main

In [159]:
### setting clasification period(start, end)
start_date = datetime.datetime(2019, 6, 1)
end_date = datetime.datetime(2019, 8, 29)


### accumulating
usage_df = analyze_total_usage(start_date, end_date, scale = "user")


### calculate mean of usage
# period = int((end_date-start_date).days)+1
# usage_df[usage] = usage_df[usage].apply(lambda x : int(x/period))


### new naming
usage_df = usage_df.rename(columns = {geton_station_id :'STATION_ID',
                          geton_station_name: "STATION_NAME", 
                          geton_station_latitude:"STATION_Y",
                          geton_station_longitude:"STATION_X",
                          usage:"STATION_USAGE"})


### setting address column
usage_df = set_address_column(usage_df)


### sorting
usage_df = usage_df.sort_values(by="STATION_ID")


### stroing
usage_df.to_csv(make_new_path(start_date, end_date), encoding="CP949", index=False)

  if (await self.run_code(code, result,  async_=asy)):
100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:39<00:00,  2.24it/s]
  res_values = method(rvalues)
  0%|                                                                                 | 1/3622 [00:00<08:03,  7.48it/s]

      geton_station_id geton_stataion_name  geton_station_latitude  \
0                  988              납읍리사무소                33.43944   
1                 1130        한림환승정류장(한림리)                33.41453   
2                   47                 학원동                33.47493   
3                   20                연대마을                33.49282   
4                  174                명주주택                33.49579   
...                ...                 ...                     ...   
3621              4250               추사유배지                33.24910   
3622              3314                 산수동                33.30309   
3623              4206            크라운골프연습장                33.33965   
3624              2148             세화3리웃동네                33.32318   
3625              3559              두문포노린당                33.49630   

      geton_station_longitude  usage  
0                   126.32944   3234  
1                   126.26427  38821  
2                   126.39766   2135  
3  

  8%|██████▎                                                                        | 289/3622 [00:21<04:31, 12.27it/s]

126.61324 33.2642


 25%|████████████████████                                                           | 917/3622 [01:07<03:07, 14.46it/s]

126.78144 33.30731
126.78142 33.30733
126.7814 33.30735


 34%|██████████████████████████▎                                                   | 1223/3622 [01:30<03:17, 12.15it/s]

126.59505 33.25965


 37%|████████████████████████████▉                                                 | 1345/3622 [01:39<03:36, 10.52it/s]

126.29452 33.96141
126.29454 33.96139


 40%|███████████████████████████████▍                                              | 1457/3622 [01:48<03:07, 11.55it/s]

126.59404 33.24436


 54%|██████████████████████████████████████████▎                                   | 1963/3622 [02:26<02:17, 12.03it/s]

126.23817 33.23178


 65%|██████████████████████████████████████████████████▋                           | 2355/3622 [02:55<01:33, 13.55it/s]

126.49945 33.51938
126.49947 33.51936
126.49949 33.51934
126.49951 33.51932
126.49953 33.5193
126.49955 33.51928
126.49957 33.51926
126.49959 33.51924
126.49961 33.51922
126.49963 33.51919
126.49965 33.51917
126.49967 33.51915
126.49969 33.51913
126.49971 33.5191
126.49973 33.51908
126.49975 33.51906
126.49977 33.51904
126.49979 33.51901
126.49981 33.51899
126.49983 33.51897
126.49985 33.51895
126.49987 33.51892
126.49989 33.5189
126.49991 33.51888
126.49993 33.51886
126.49995 33.51883
126.49997 33.51881
126.49999 33.51879
126.50001 33.51877
126.50003 33.51875
126.50005 33.51872
126.50007 33.5187
126.50009 33.51868
126.50011 33.51866


 80%|██████████████████████████████████████████████████████████████▋               | 2909/3622 [03:38<00:53, 13.23it/s]

126.29979 33.22527
126.29981 33.22529
126.29983 33.22531
126.29985 33.22533
126.29987 33.22535
126.29989 33.22537
126.29991 33.22539
126.29993 33.2254
126.29995 33.22542
126.29997 33.22544
126.29999 33.22546
126.30001 33.22548
126.30003 33.22549
126.30005 33.22551
126.30007 33.22553
126.30009 33.22555
126.30011 33.22557
126.30013 33.22558
126.30015 33.2256
126.30017 33.22562
126.30019 33.22564
126.30021 33.22566
126.30023 33.22568
126.30025 33.22569
126.30027 33.22571
126.30029 33.22573
126.30031 33.22575
126.30033 33.22577
126.30035 33.22578
126.30037 33.2258
126.30039 33.22582
126.30041 33.22584
126.30043 33.22586
126.30045 33.22587
126.30047 33.22589
126.30049 33.22591
126.30051 33.22593
126.30053 33.22595
126.30055 33.22596
126.30057 33.22598
126.30059 33.226
126.30061 33.22602
126.30063 33.22604
126.30065 33.22605
126.30067 33.22607
126.30069 33.22609
126.30071 33.22611
126.30073 33.22613
126.30075 33.22615
126.30077 33.22616
126.30079 33.22618
126.30081 33.2262
126.30083 33.22622

 86%|██████████████████████████████████████████████████████████████████▋           | 3099/3622 [04:00<00:41, 12.47it/s]

126.30382 33.95284
126.30384 33.95282
126.30386 33.9528
126.30388 33.95278
126.3039 33.95275
126.30392 33.95273
126.30394 33.95271
126.30396 33.95269
126.30398 33.95266
126.304 33.95264
126.30402 33.95262
126.30404 33.9526
126.30406 33.95257
126.30408 33.95255
126.3041 33.95253
126.30412 33.95251
126.30414 33.95248


 86%|██████████████████████████████████████████████████████████████████▉           | 3108/3622 [04:02<01:13,  7.02it/s]

126.29527 33.96109
126.29529 33.96107


 86%|███████████████████████████████████████████████████████████████████▏          | 3118/3622 [04:03<00:42, 11.97it/s]

126.61122 33.26048
126.6112 33.2605
126.61118 33.26052


 86%|███████████████████████████████████████████████████████████████████▏          | 3120/3622 [04:03<01:08,  7.30it/s]

126.61116 33.26054
126.61114 33.26056


100%|█████████████████████████████████████████████████████████████████████████████▊| 3614/3622 [04:40<00:00, 13.57it/s]

126.61747 33.26697
126.61745 33.26699
126.61743 33.26701
126.61741 33.26703
126.61739 33.26705
126.61737 33.26706
126.61735 33.26708
126.61733 33.2671
126.61731 33.26712
126.61729 33.26714
126.61727 33.26716
126.61725 33.26717
126.61723 33.26719
126.61721 33.26721
126.61719 33.26723


100%|█████████████████████████████████████████████████████████████████████████████▊| 3616/3622 [04:42<00:01,  4.04it/s]

126.61717 33.26725


100%|██████████████████████████████████████████████████████████████████████████████| 3622/3622 [04:42<00:00, 12.82it/s]
