# Exp 4. Simple modeling with almost no preprocessing (pp) / feature engineering (fe)

In [1]:
# Import Modules
import os
import sys
import warnings
from pathlib import Path

import pandas as pd
# Enable plotly offline plotting
import plotly
import plotly.graph_objects as go
import seaborn as sns
from ipywidgets import interact


# Silence sklearn warnings
def warn(*_args, **_kwargs):
    """No-op replacement for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them, and
    returns ``None``.
    """
    return None



pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
warnings.warn = warn


plotly.offline.init_notebook_mode(connected=True)

# Init project path
PROJECT_DIR = os.getcwd() + "/../../"
sys.path.insert(0, PROJECT_DIR)
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from src.exp.common.setup import *
from src.exp.common.load_data import load_data
from src.exp.common.correct_invalid_data import correct_invalid_data
from src.exp.common.preprocessing import preprocessing
from src.exp.common.rename_non_ascii_cols import rename_non_ascii_cols

# Step 1. Load train.csv, test.csv
# `timer` and `DATA_DIR` are not defined in this notebook; presumably they are
# provided by the star import of src.exp.common.setup above — TODO confirm.
with timer("load data"):
    train_df, test_df = load_data(DATA_DIR=DATA_DIR)
    # Second, independent load: the `original_*` frames are not passed through the
    # preprocessing / renaming steps below. original_train_df is later handed to
    # plot_result in the plotting cell.
    original_train_df, original_test_df = load_data(DATA_DIR=DATA_DIR)
    # Only the two train frames go through correct_invalid_data; the test frames
    # are left as loaded.
    train_df = correct_invalid_data(train_df)
    original_train_df = correct_invalid_data(original_train_df)

# Step 2. Preprocessing
# The same preprocessing function is applied to train and test separately.
with timer("preprocessing"):
    train_df = preprocessing(df=train_df)
    test_df = preprocessing(df=test_df)

# Step 3. Rename non-ascii columns as lightGBM doesn't support them.
# NOTE(review): only train_df is renamed; test_df is never passed through
# rename_non_ascii_cols — confirm this is intended before predicting on test_df.
with timer("rename non-ascii cols"):
    train_df = rename_non_ascii_cols(df=train_df)

2019-11-02 04:14:44,397 - INFO - logger set up
2019-11-02 04:14:44,400 - INFO - seed=42
2019-11-02 04:14:46,685 - INFO - [load data] done in 2.14 s
2019-11-02 04:15:04,574 - INFO - [preprocessing] done in 17.89 s
2019-11-02 04:15:04,605 - INFO - [rename non-ascii cols] done in 0.03 s


In [3]:
# Parameters passed to cv() for the "lightgbm" model.
lgb_params = dict(
    task="train",
    boosting_type="gbdt",
    objective="regression",
    metric={"l2"},
    num_leaves=1000,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=20,
    verbose=-1,
)

# Parameters passed to cv() for the "xgboost" model.
xgb_params = dict(objective="reg:squarederror", eval_metric="rmse")

In [4]:
# Step 4. Cross Validation
# NOTE(review): the saved output of this cell is an ImportError for `cv`;
# confirm that src/exp/exp4_simple/cv.py actually exports a `cv` callable.
from src.exp.exp4_simple.cv import cv

# Each model keeps its own (result, scores) pair. Previously all three calls
# unpacked into the same `scores` name, so the linear-regression and xgboost
# scores were silently overwritten by the lightgbm ones.
with timer("cv with linear regression"):
    lr_result, lr_scores = cv(
        train_df=train_df, model_name="linearRegression", params=None
    )
with timer("cv with xgboost"):
    xgb_result, xgb_scores = cv(
        train_df=train_df, model_name="xgboost", params=xgb_params
    )
with timer("cv with lightgbm"):
    lgb_result, lgb_scores = cv(
        train_df=train_df, model_name="lightgbm", params=lgb_params
    )

# Backward-compatible alias: `scores` used to end up holding the lightgbm scores.
scores = lgb_scores

ImportError: cannot import name 'cv' from 'src.exp.exp4_simple.cv' (/app/notebook/exp/../../src/exp/exp4_simple/cv.py)

In [None]:
from src.exp.common.plot_result import plot_result


@interact(n_fold=list(range(N_FOLDS)))
def plot_results(n_fold):
    """Plot the chosen fold's result for each of the three models.

    Args:
        n_fold: fold index, picked in the widget from ``range(N_FOLDS)``.
    """

    def _show(model_result, label):
        # Every plot shares the same reference frame and fold index.
        plot_result(original_train_df, model_result, n_fold, label)

    _show(lr_result, "lr")
    _show(xgb_result, "xgb")
    _show(lgb_result, "lgb")