In [1]:
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [28]:
# 날짜 범위 생성 / 검은날 vs 빨간날+파란날+공휴일 구분
start_datetime = '2019-06-01'
end_datetime = '2019-06-30'
official_rest_range = set(['20190606'])
# official_rest_range => 국가 공휴일을 따로 입력해주어야 함.
datetime_range = set(str(datetime.date()).replace('-', '') 
                     for datetime 
                     in pd.date_range(start_datetime, end_datetime, freq='D'))
business_datetime_range = set(str(datetime.date()).replace('-', '') 
                          for datetime 
                          in pd.date_range(start_datetime, end_datetime, freq='B'))
business_datetime_range = business_datetime_range - official_rest_range
rest_datetime_range = datetime_range - business_datetime_range

business_datetime_range, rest_datetime_range = \
    list(business_datetime_range), list(rest_datetime_range)

In [4]:
path_postfix = '_정류장별시간대별노선별유형별승차인원분석.csv'
ratio = 0.8

# 데이터 파악

In [5]:
test_file = '20190601' + path_postfix
origin_df = pd.read_csv(test_file, engine='python', encoding='ms949')

del origin_df['Unnamed: 0']

# set(origin_df.columns)

In [18]:
target_col = 'user_count'
del_cols = ['route_no', 'geton_stataion_name']
feature_cols = list(set(origin_df.columns) - set([target_col]) - set(del_cols))

temp = origin_df[feature_cols]

In [19]:
categorized_cols = []
scaled_cols = []

for col in feature_cols:
    domain = origin_df[col].unique()
    
    if len(domain) <= 7:
        categorized_cols.append(col)
    else:
        scaled_cols.append(col)

print('categorizing needed... \n', categorized_cols)
print('scaling needed... \n', scaled_cols)

categorizing needed... 
 ['user_type']
scaling needed... 
 ['geton_station_id', 'route_id', 'geton_hour']


# 날짜 범위 생성하기

In [20]:
# 날짜 범위 생성 / 검은날 vs 빨간날+파란날+공휴일 구분
start_datetime = '2019-06-01'
end_datetime = '2019-06-30'
official_rest_range = set(['20190606'])
# official_rest_range => 국가 공휴일을 따로 입력해주어야 함.
datetime_range = set(str(datetime.date()).replace('-', '') 
                     for datetime 
                     in pd.date_range(start_datetime, end_datetime, freq='D'))
business_datetime_range = set(str(datetime.date()).replace('-', '') 
                          for datetime 
                          in pd.date_range(start_datetime, end_datetime, freq='B'))
business_datetime_range = business_datetime_range - official_rest_range
rest_datetime_range = datetime_range - business_datetime_range

# [평일] 파일 불러오며 데이터 정규화...
# 데이터프레임에 합치기

In [32]:
business_df = None

for path_infix in business_datetime_range:
    print(path_infix)
    df = pd.read_csv(test_file, engine='python', encoding='ms949')
    df = df[feature_cols + [target_col]]
    
    for col in categorized_cols:
        domain = df[col].unique()
        
        if len(domain) <= 2:
            df[col] = (df[col] == domain[0]).astype(int)
        else:
            dummies = pd.get_dummies(df[col], prefix=col)
        df.drop(columns=[col], inplace=True)
        df = pd.concat([df, dummies], axis=1)
        
    df.loc[:, scaled_cols] = scaler.fit_transform(df[scaled_cols])
    
    if business_df is None:
        business_df = df
    else:
        business_df = pd.concat([business_df, df])

business_df['is_business'] = 1

temp = business_df[target_col]
del business_df[target_col]
business_df[target_col] = temp

20190612
20190626
20190624
20190613
20190604
20190614
20190621
20190603
20190617
20190620
20190618
20190607
20190627
20190611
20190605
20190628
20190619
20190610
20190625


# 평일-기계학습 ... 의사결정트리 회귀 모형

In [34]:
num_df = len(business_df)
ratio = 0.8
df = df.sample(frac=1)
train_df = business_df[:int(num_df * ratio)]
test_df= business_df[int(num_df * ratio):]

loop_cnt = 1

In [39]:
for _ in range(loop_cnt):
    # train_df = train_df[train_df.columns[:-1]]
    
    train_x = train_df[train_df.columns[:-1]]
    train_y = train_df[train_df.columns[-1]]

    test_x = test_df[test_df.columns[:-1]]
    test_y = test_df[test_df.columns[-1]]
    
    model = DecisionTreeRegressor()
    model.fit(X=train_x, y=train_y)
    
#     pred = model.predict(test_x)
    
    print(model.score(test_x, test_y))
    
#     accuracy_score()
    
    #print(res)

0.9997670835769082


In [48]:
model.predict([[0.0, 0.856741, 0.617021, 0, 0, 0, 1, 0, 0, 0, 1]])
# 0.856741	0.914894

array([3.])

In [43]:
business_df

Unnamed: 0,geton_station_id,route_id,geton_hour,user_type_경로,user_type_어린이,user_type_유공 일반,user_type_일반,user_type_장애 동반,user_type_장애 일반,user_type_청소년,is_business,user_count
0,0.0,0.661035,0.617021,1,0,0,0,0,0,0,1,1
1,0.0,0.661035,0.638298,1,0,0,0,0,0,0,1,1
2,0.0,0.661035,0.276596,0,1,0,0,0,0,0,1,1
3,0.0,0.661035,0.255319,0,0,0,1,0,0,0,1,5
4,0.0,0.661035,0.276596,0,0,0,1,0,0,0,1,15
5,0.0,0.661035,0.340426,0,0,0,1,0,0,0,1,1
6,0.0,0.661035,0.510638,0,0,0,1,0,0,0,1,1
7,0.0,0.661035,0.553191,0,0,0,1,0,0,0,1,3
8,0.0,0.661035,0.617021,0,0,0,1,0,0,0,1,2
9,0.0,0.661035,0.765957,0,0,0,1,0,0,0,1,1
