### 반복을 위한 Sacred
 - 목적  
    - 머신러닝 모델링에서 사용되는 config, parameter등을 손쉽게 저장할 수 있도록 도와주는 도구  

 - 이유  
    - 다양한 실험을 빠르게 진행하며, 손으로 기록하지 않고 자동으로 기록될 수 있도록 도와주는 도구  
    
 - 특징  
    - 환경설정을 다시 reproduce할 수 있도록 도와주는 도구
        - 실험의 모든 파라미터 추적
        - 여러 설정에 대해 쉽게 실험을 할 수 있음
        - DB에 각 실행의 설정을 저장
        - 결과를 reproduce  
        
    - 옴니보드 등을 통해 저장된 값을 시각화할 수 있음
        - 옴니보드 : sacred에서 저장된 파라미터를 테이블로 보여주고 성능을 표시

#### sacred의 main mechanisms
- ConfigScopes : 함수의 local변수를 편리하게 다룰 수 있음. @ex.config 데코레이터로 사용
- Config Injection : 어떤 함수에서든 설정(config) 값을 인자로 주입받아 사용할 수 있음
- Command-line interface : 커맨드 라인으로 파라미터를 바꿔서 실행할 수 있음
- Observers : 실험의 모든 정보를 Observers에게 제공해 저장
- Automatic seeding : 실험의 무작위성(random seed)을 제어할 수 있도록 도와줌

### sacred

In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import interact
from numpy.random import permutation
from sacred import Experiment
from sacred.observers import FileStorageObserver
from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Notebook-wide settings: ggplot plotting style, and silence all warnings.
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

# Google Cloud project ID handed to the BigQuery call; put your own project ID here.
PROJECT_ID = 'new-york-taxi-309501'

### 기존 데이터와 통합

In [17]:
# Sacred experiment object; interactive=True is what Sacred requires when the
# experiment is defined in an interactive session such as a notebook.
ex = Experiment('nyc-demand-prediction', interactive=True)

# Every run is recorded on disk under ./experiments.
experiment_dir = os.path.join('./', 'experiments')
# exist_ok=True replaces the isdir() check-then-create, which could race with
# another process creating the same directory between the check and the call.
os.makedirs(experiment_dir, exist_ok=True)
# NOTE(review): newer Sacred releases appear to deprecate `.create(...)` in
# favour of `FileStorageObserver(experiment_dir)` -- confirm against the
# installed version before switching.
ex.observers.append(FileStorageObserver.create(experiment_dir))

### 데이터 전처리

In [4]:
# BigQuery standard-SQL query producing hourly pickup counts per zip code.
#   * base_data: January rows of the 2015 yellow-taxi trips table with a valid
#     pickup latitude, joined to NY zip-code polygons that contain the pickup point.
#   * final SELECT: one row per (zip_code, pickup hour) with calendar features
#     (month, day, weekday = '%u' day number minus 1, hour, is_weekend when the
#     '%u' day number is 6 or 7) and the trip count `cnt`.
base_query = """
WITH base_data AS 
(
  SELECT nyc_taxi.*, gis.* EXCEPT (zip_code_geom)
  FROM (
    SELECT *
    FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`
    WHERE 
        EXTRACT(MONTH from pickup_datetime) = 1
        and pickup_latitude  <= 90 and pickup_latitude >= -90
    ) AS nyc_taxi
  JOIN (
    SELECT zip_code, state_code, state_name, city, county, zip_code_geom
    FROM `bigquery-public-data.geo_us_boundaries.zip_codes`
    WHERE state_code='NY'
    ) AS gis 
  ON ST_CONTAINS(zip_code_geom, st_geogpoint(pickup_longitude, pickup_latitude))
)

SELECT 
    zip_code,
    DATETIME_TRUNC(pickup_datetime, hour) as pickup_hour,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    EXTRACT(DAY FROM pickup_datetime) AS day,
    CAST(format_datetime('%u', pickup_datetime) AS INT64) -1 AS weekday,
    EXTRACT(HOUR FROM pickup_datetime) AS hour,
    CASE WHEN CAST(FORMAT_DATETIME('%u', pickup_datetime) AS INT64) IN (6, 7) THEN 1 ELSE 0 END AS is_weekend,
    COUNT(*) AS cnt
FROM base_data 
GROUP BY zip_code, pickup_hour, month, day, weekday, hour, is_weekend
ORDER BY pickup_hour
"""

# Run the query under PROJECT_ID and load the result into a DataFrame.
base_df = pd.read_gbq(query=base_query, dialect='standard', project_id=PROJECT_ID)

Downloading: 100%|██████████████████████████████████████████████████████████| 87020/87020 [00:06<00:00, 14080.12rows/s]


In [5]:
# Display the query result (one row per zip code and pickup hour).
base_df

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,is_weekend,cnt
0,10280,2015-01-01 00:00:00,1,1,3,0,0,54
1,10040,2015-01-01 00:00:00,1,1,3,0,0,13
2,10033,2015-01-01 00:00:00,1,1,3,0,0,33
3,10035,2015-01-01 00:00:00,1,1,3,0,0,92
4,10007,2015-01-01 00:00:00,1,1,3,0,0,191
...,...,...,...,...,...,...,...,...
87015,10454,2015-01-31 23:00:00,1,31,5,23,1,4
87016,10452,2015-01-31 23:00:00,1,31,5,23,1,1
87017,11415,2015-01-31 23:00:00,1,31,5,23,1,1
87018,11223,2015-01-31 23:00:00,1,31,5,23,1,1


### feature engineering

In [None]:
# One-hot encode the zip code and append the indicator columns to the base frame.
enc = OneHotEncoder(handle_unknown='ignore')
ohe_output = enc.fit_transform(base_df[['zip_code']]).toarray()

# Column labels are the fitted categories prefixed with 'zip_code_'.
ohe_columns = 'zip_code_' + enc.categories_[0]
ohe_df = pd.concat(
    [base_df, pd.DataFrame(ohe_output, columns=ohe_columns)],
    axis=1,
)

# Base-10 log of the trip count, kept as an alternative regression target.
ohe_df['log_cnt'] = np.log10(ohe_df['cnt'])

### train/ test 데이터 나누기

In [7]:
def split_train_test(df, date):
    """Split a frame chronologically on its ``pickup_hour`` column.

    Args:
        df: DataFrame with a ``pickup_hour`` column that can be compared
            against ``date``.
        date: Cut-off value. Rows strictly before it form the training set;
            rows at or after it form the test set.

    Returns:
        A ``(train_df, test_df)`` tuple of the two row subsets.
    """
    pickup_hour = df['pickup_hour']
    is_train = pickup_hour < date
    is_test = pickup_hour >= date
    return df[is_train], df[is_test]

In [8]:
# Rows before 2015-01-24 are used for training; rows from that date on are held out for testing.
train_df, test_df = split_train_test(df=ohe_df, date='2015-01-24')

In [10]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,is_weekend,cnt,zip_code_10001,zip_code_10002,...,zip_code_12729,zip_code_12771,zip_code_13029,zip_code_13118,zip_code_13656,zip_code_13691,zip_code_14072,zip_code_14527,zip_code_14801,log_cnt
0,10280,2015-01-01,1,1,3,0,0,54,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.732394
1,10040,2015-01-01,1,1,3,0,0,13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.113943
2,10033,2015-01-01,1,1,3,0,0,33,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.518514
3,10035,2015-01-01,1,1,3,0,0,92,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.963788
4,10007,2015-01-01,1,1,3,0,0,191,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.281033


In [11]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,is_weekend,cnt,zip_code_10001,zip_code_10002,...,zip_code_12729,zip_code_12771,zip_code_13029,zip_code_13118,zip_code_13656,zip_code_13691,zip_code_14072,zip_code_14527,zip_code_14801,log_cnt
65118,10171,2015-01-24,1,24,5,0,1,13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.113943
65119,10172,2015-01-24,1,24,5,0,1,19,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.278754
65120,11205,2015-01-24,1,24,5,0,1,36,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.556303
65121,11206,2015-01-24,1,24,5,0,1,68,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.832509
65122,11201,2015-01-24,1,24,5,0,1,143,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.155336


In [12]:
# Remove the raw zip code and the timestamp from both splits, in place.
for _frame in (train_df, test_df):
    for _column in ('zip_code', 'pickup_hour'):
        del _frame[_column]

In [13]:
# Move both target columns (raw count and its log) out of the feature frames.
# pop() removes the column from the frame and returns it.
y_train_raw, y_train_log = train_df.pop('cnt'), train_df.pop('log_cnt')
y_test_raw, y_test_log = test_df.pop('cnt'), test_df.pop('log_cnt')

In [14]:
# Independent copies of the feature frames used as model inputs.
x_train = train_df.copy(deep=True)
x_test = test_df.copy(deep=True)

# Copy of the raw test targets as a plain array of values.
y_true = y_test_raw.values.copy()

In [15]:
def evaluation(y_true, y_pred):
    """Score predictions against the ground truth with MAPE, MAE and MSE.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``'score'`` with the
        columns ``'mape'`` (in percent), ``'mae'`` and ``'mse'``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # MAPE divides by the observed values, so a zero in y_true yields inf/nan.
    relative_error = np.abs((actual - predicted) / actual)
    metrics = {
        'mape': np.mean(relative_error) * 100,
        'mae': mean_absolute_error(actual, predicted),
        'mse': mean_squared_error(actual, predicted),
    }

    # Build a single 'score' column, then transpose so metrics become columns.
    return pd.DataFrame(
        list(metrics.values()), index=list(metrics), columns=['score']
    ).transpose()

### 실험 설정

In [18]:
# Sacred config scope: the local variables assigned in this function become the
# experiment's configuration entries (here: fit_intercept and normalize, which
# get_model receives as arguments).
@ex.config
def config():
    fit_intercept=True
    normalize=False

In [19]:
@ex.capture
def get_model(fit_intercept, normalize):
    """Build the linear regression model from the experiment configuration.

    Both arguments are filled in by Sacred from the config when the function
    is called without them.

    Args:
        fit_intercept: Whether the model should fit an intercept term.
        normalize: Value for ``LinearRegression``'s ``normalize`` option.

    Returns:
        An unfitted ``LinearRegression`` instance.
    """
    # Pass options by keyword: LinearRegression's constructor parameters are
    # keyword-only in current scikit-learn, so positional arguments fail.
    params = {'fit_intercept': fit_intercept}
    # Only forward `normalize` when it is enabled. Leaving it out is equivalent
    # to normalize=False and keeps the default config working on scikit-learn
    # releases that no longer accept the parameter.
    if normalize:
        params['normalize'] = normalize
    return LinearRegression(**params)

In [20]:
@ex.main
def run(_log, _run):
    """Train the configured model, evaluate it on the test split and record the run.

    Args:
        _log: Logger supplied by Sacred for this run.
        _run: Sacred run object used to record info and metrics.

    Returns:
        The evaluation scores as a nested dict (``score.to_dict()``), which
        Sacred stores as the run's result.
    """
    lr_reg = get_model()
    lr_reg.fit(x_train, y_train_raw)
    pred = lr_reg.predict(x_test)
    # Write a message to the run's log.
    _log.info("Predict End")
    score = evaluation(y_test_raw, pred)

    # The model name is a string, not a numeric metric, so it is stored in the
    # run's info dict instead of being logged as a scalar.
    _run.info['model_name'] = lr_reg.__class__.__name__

    # To store values under Metrics, log each one as its own numeric scalar
    # (log_scalar expects a single number, not a whole DataFrame).
    for metric_name, metric_value in score.iloc[0].items():
        _run.log_scalar(metric_name, float(metric_value))

    # To store values under Result, return them from the main function.
    return score.to_dict()
