# Exp 4. Simple modeling with almost no preprocessing (pp) / feature engineering (fe)

In [80]:
# Import Modules
import os
import sys
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

from ipywidgets import interact

# Ignore sklearn warnings by swapping `warnings.warn` for a function that does nothing.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; installed in place of `warnings.warn`."""
    return None


warnings.warn = warn

# Enable plotly offline plotting inside the notebook.
# Import the `offline` submodule explicitly, since that is what the call below uses.
import plotly.offline

plotly.offline.init_notebook_mode(connected=True)

# Init project path
# PROJECT_DIR points two levels above the current working directory; putting it
# on sys.path is what lets the `src.…` imports in the following cells resolve.
PROJECT_DIR = os.getcwd() + "/../../"
# Guard the insert so re-running this cell does not stack duplicate entries.
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
from src.exp.exp4_simple.step0_set_global_variables import *
from src.exp.exp4_simple.step1_load_data import load_data
from src.exp.exp4_simple.step2_preprocessing import preprocessing
from src.exp.exp4_simple.step3_rename_non_ascii_cols import rename_non_ascii_cols

# Step 1. Load train.csv, test.csv
# The data is loaded twice: one pair is transformed by the steps below, the
# `original_*` pair is left as loaded (original_train_df is passed to
# plot_result in a later cell).
with timer("load data"):
    train_df, test_df = load_data(DATA_DIR=DATA_DIR)
    original_train_df, original_test_df = load_data(DATA_DIR=DATA_DIR)

# Step 2. Preprocessing
with timer("preprocessing"):
    train_df = preprocessing(df=train_df)
    test_df = preprocessing(df=test_df)

# Step 3. Rename non-ascii columns as lightGBM doesn't support them.
# The test frame goes through the same renaming so that its column names stay
# aligned with the training frame.
with timer("rename non-ascii cols"):
    train_df = rename_non_ascii_cols(df=train_df)
    test_df = rename_non_ascii_cols(df=test_df)

2019-10-31 09:23:41,572 - INFO - [load data] done in 2.13 s
2019-10-31 09:23:58,542 - INFO - [preprocessing] done in 16.97 s
2019-10-31 09:23:58,554 - INFO - [rename non-ascii cols] done in 0.01 s


In [67]:
# LightGBM parameters (passed to cross_validation as `params` for "lightgbm").
lgb_params = dict(
    task="train",
    boosting_type="gbdt",
    objective="regression",
    metric={"l2"},
    num_leaves=1000,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=20,
    verbose=-1,
)

# xgb parameters (passed to cross_validation as `params` for "xgboost").
xgb_params = dict(objective="reg:squarederror", eval_metric="rmse")

In [68]:
# Step 4. Cross Validation
from src.exp.exp4_simple.step4_cv import cross_validation

# Each model's scores are bound to their own name. Assigning all three calls to
# a single `scores` variable would leave only the last model's scores available.
with timer("cv with linear regression"):
    lr_result, lr_scores = cross_validation(
        train_df=train_df, model_name="linearRegression", params=None
    )
with timer("cv with xgboost"):
    xgb_result, xgb_scores = cross_validation(
        train_df=train_df, model_name="xgboost", params=xgb_params
    )
with timer("cv with lightgbm"):
    lgb_result, lgb_scores = cross_validation(
        train_df=train_df, model_name="lightgbm", params=lgb_params
    )

# Keep the `scores` name available for any cell that reads it; it refers to the
# lightgbm scores, i.e. the result of the final cross_validation call.
scores = lgb_scores

---------------- linearRegression ----------------
n_fold: 0 Score: 32927.17395126365
n_fold: 1 Score: 24331.8640655277
n_fold: 2 Score: 26734.197174381203
n_fold: 3 Score: 22056.815322410202


2019-10-31 09:24:00,440 - INFO - [cv with linear regression] done in 1.72 s


n_fold: 4 Score: 25821.2029931744
---------------- linearRegression  END ----------------

---------------- xgboost ----------------
n_fold: 0 Score: 24757.262606995544
n_fold: 1 Score: 21314.254189367835
n_fold: 2 Score: 28340.92489733778
n_fold: 3 Score: 18449.231681568883


2019-10-31 09:24:20,395 - INFO - [cv with xgboost] done in 19.95 s


n_fold: 4 Score: 21694.853633174596
---------------- xgboost  END ----------------

---------------- lightgbm ----------------
n_fold: 0 Score: 24623.181835665888
n_fold: 1 Score: 23771.844037072842
n_fold: 2 Score: 28239.62718834958
n_fold: 3 Score: 19410.471720051122


2019-10-31 09:24:32,941 - INFO - [cv with lightgbm] done in 12.54 s


n_fold: 4 Score: 21273.068348048982
---------------- lightgbm  END ----------------



In [90]:
from src.exp.exp4_simple.step5_plot_result import plot_result
@interact(n_fold=list(range(N_FOLDS)))
def plot_results(n_fold):
    """Plot the linear regression, xgboost and lightgbm CV results for one fold.

    n_fold is the fold index selected through the interact widget; it is
    forwarded unchanged to plot_result along with original_train_df.
    """
    labelled_results = (
        (lr_result, "lr"),
        (xgb_result, "xgb"),
        (lgb_result, "lgb"),
    )
    for cv_result, label in labelled_results:
        plot_result(original_train_df, cv_result, n_fold, label)

interactive(children=(Dropdown(description='n_fold', options=(0, 1, 2, 3, 4), value=0), Output()), _dom_classe…