In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgm
%matplotlib inline
import matplotlib.pyplot as plt
from feature_engineering.feature_selector import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from feature_engineering.separate_str_num import * 
from model_selection.regressor_model_factory import RegressorModelFactory
from model_selection.cv import k_fold_regressor

  from ._conv import register_converters as _register_converters


In [2]:
# Load the training frame and the test-A frame from ./input
# (paths are relative to the notebook's working directory).
train = pd.read_csv('./input/fusai_train_20180117.csv')
testa = pd.read_csv('./input/fusai_testA_20180117.csv')

In [3]:
# Test-A answer file. header=None makes pandas treat the first row as data,
# and the two columns are named 'id' and 'values' explicitly.
answer_testa = pd.read_csv('./input/fusai_answer_a_20180127.csv', header=None, names=['id', 'values'])

In [4]:
# Keep only rows whose last column (assigned to train_Y in the next cell) exceeds 2.25.
# The filtered frame keeps its original row labels (no reset_index).
# NOTE(review): 2.25 is an unexplained cutoff — presumably an outlier filter; record the rationale.
train = train[train.iloc[:, -1] > 2.25]

In [5]:
# Positional split of the training frame: the first column is skipped,
# the last column becomes the target, and every column in between is a feature.
train_X, train_Y = train.iloc[:, 1:-1], train.iloc[:, -1]

In [6]:
# Column pruning, step 1: project helper brought in by one of the star imports.
# Presumably drops columns holding a single constant value — confirm in the helper's source.
train_X = delete_constant(train_X)

In [7]:
# Column pruning, step 2: project helper brought in by one of the star imports.
# Presumably removes columns dominated by missing values — the criterion is not visible here; confirm.
train_X = delete_nan(train_X)

In [8]:
# Column pruning, step 3: project helper brought in by one of the star imports.
# Presumably removes duplicated columns — TODO confirm whether it acts on columns or rows.
train_X = delete_duplicates(train_X)

In [33]:
# Split the pruned training features into a numeric part and a string part
# (project helper; the unpacking order is numeric first, string second).
data_num, data_str = separate_num_str(train_X)

# The test frame was never sliced or pruned, so restrict it to the columns that
# survived on the training side before splitting. This keeps both string frames
# on the same columns, so the one-hot encoding further down cannot pick up
# test-only columns. A KeyError here means the test file lacks a training column.
test_num, test_str = separate_num_str(testa[train_X.columns])

In [11]:
# Score every numeric feature against the target with the project helper
# (presumably a per-column Pearson correlation — confirm in the helper's source).
pearsons = calculate_pearson(data_num, train_Y)

In [28]:
# Keep entries whose score is above 0.2 or below -0.2, i.e. |score| > 0.2.
# The mask is built from raw .values arrays, so it is applied by position rather than by label.
# NOTE(review): no later cell shown here reads `pearsons`; feature selection uses mics_15 instead.
pearsons = pearsons[(pearsons > 0.2).values | (pearsons < -0.2).values]

In [34]:
# Score every numeric feature against the target with the project helper
# (presumably the maximal information coefficient — confirm in the helper's source).
mics = calculate_mic(data_num, train_Y)

In [35]:
# Features whose score exceeds 0.15; mics_15.index is what later cells use
# to pick the numeric columns of the train and test matrices.
# NOTE(review): 0.15 is an unexplained threshold.
mics_15 = mics[mics > 0.15]

In [36]:
# Stack the train and test string columns row-wise (train rows first) so the
# one-hot encoding in the next cell sees the categories of both sets and
# produces the same dummy columns for each.
all_str = pd.concat([data_str, test_str], axis=0)

In [37]:
# One-hot encode the stacked string frame; the name is rebound to the encoded result.
all_str = pd.get_dummies(all_str)

In [39]:
# Undo the row-wise stacking: the leading len(data_str) rows belong to the
# training set, everything after them is test A.
n_train_rows = len(data_str)
train_str = all_str.iloc[:n_train_rows]
testa_str = all_str.iloc[n_train_rows:]

(799, 45)

In [48]:
# Numeric-only matrices restricted to the MIC-selected columns.
# NOTE(review): the In [52] cell below rebuilds train_data/testa_data with the
# one-hot columns appended, so these assignments are overwritten on a top-to-bottom run.
train_data = data_num.loc[:, mics_15.index]
testa_data = test_num.loc[:, mics_15.index]

In [10]:
# data_num.fillna(data_num.median(axis=0), inplace=True)

In [52]:
# Final design matrices: the MIC-selected numeric columns followed by the
# one-hot encoded string columns, joined side by side on the row index.
#
# Column labels are deliberately preserved (no ignore_index). With axis=1,
# ignore_index=True renumbers the columns 0..N-1, which throws away the feature
# names. The importance cells further down look columns up by the names that
# get_score() reports (train_data[importance.index]), and that lookup cannot
# succeed against renumbered columns.
train_data = pd.concat([data_num.loc[:, mics_15.index], train_str], axis=1)
testa_data = pd.concat([test_num.loc[:, mics_15.index], testa_str], axis=1)

In [42]:
# Sanity check: (rows, columns) of the training matrix.
train_data.shape

(799, 144)

In [43]:
# Sanity check: (rows, columns) of the test-A matrix; the column count should match train_data.
testa_data.shape

(300, 144)

In [30]:
# testa_data.fillna(train_data.median(axis=0), inplace=True)

In [44]:
# Build the XGBoost regressor through the project's model factory.
# Note that `xgb` is the wrapper object returned by the factory, not the xgboost module.
rmf = RegressorModelFactory()
xgb = rmf.create_model(RegressorModelFactory.MODEL_XGBOOST)

In [53]:
# Fit on the training matrix. The test-A matrix and its answers are passed as the
# second and fourth arguments; the log below reports a 'val-mse' that is used for
# early stopping, so they appear to serve as the validation set.
# NOTE(review): if so, the test answers steer early stopping (and the importance-based
# feature cut that follows), which makes the reported val-mse optimistic. Prefer a
# hold-out split or k-fold on the training data for model selection.
xgb.fit(train_data, testa_data, train_Y, answer_testa['values'])

[0]	train-mse:5.41617	val-mse:5.10141
Multiple eval metrics have been passed: 'val-mse' will be used for early stopping.

Will train until val-mse hasn't improved in 100 rounds.
[100]	train-mse:1.36609	val-mse:1.24325
[200]	train-mse:0.359787	val-mse:0.32396
[300]	train-mse:0.102283	val-mse:0.110593
[400]	train-mse:0.033223	val-mse:0.058025
[500]	train-mse:0.013797	val-mse:0.042819
[600]	train-mse:0.008024	val-mse:0.038681
[700]	train-mse:0.006128	val-mse:0.037431
[800]	train-mse:0.005417	val-mse:0.036936
[900]	train-mse:0.005155	val-mse:0.036804
[1000]	train-mse:0.005002	val-mse:0.036735
[1100]	train-mse:0.004873	val-mse:0.036592
Stopping. Best iteration:
[1093]	train-mse:0.004873	val-mse:0.036591



In [15]:
# Per-feature scores from the fitted model's `xgbr` attribute, sorted highest first.
# Presumably `xgbr` is an xgboost Booster, whose get_score() defaults to split counts
# ('weight') — the integer values in the output are consistent with that; confirm in the wrapper.
importance = pd.Series(xgb.xgbr.get_score()).sort_values(ascending=False)

In [27]:
# Display the raw score mapping for inspection.
# NOTE(review): this prints every feature; importance.head(n) would give a shorter, sorted view.
xgb.xgbr.get_score()

{'210X1': 78,
 '210X104': 60,
 '210X106': 34,
 '210X11': 95,
 '210X119': 42,
 '210X133': 33,
 '210X14': 29,
 '210X151': 55,
 '210X156': 62,
 '210X16': 23,
 '210X160': 36,
 '210X162': 41,
 '210X167': 56,
 '210X176': 35,
 '210X187': 24,
 '210X190': 38,
 '210X206': 43,
 '210X209': 23,
 '210X216': 18,
 '210X36': 139,
 '210X51': 20,
 '210X6': 58,
 '210X89': 13,
 '220X13': 31,
 '220X166': 92,
 '220X223': 36,
 '220X3': 97,
 '220X32': 43,
 '220X54': 35,
 '220X8': 54,
 '300X1': 47,
 '310X151': 25,
 '311X154': 14,
 '311X182': 71,
 '311X183': 82,
 '311X184': 20,
 '311X189': 59,
 '311X195': 47,
 '311X196': 30,
 '311X197': 15,
 '311X211': 113,
 '311X25': 56,
 '311X26': 19,
 '311X4': 31,
 '311X52': 31,
 '311X53': 19,
 '311X74': 25,
 '311X86': 32,
 '311X87': 45,
 '311X88': 44,
 '312X424': 40,
 '312X51': 82,
 '312X558': 52,
 '312X57': 53,
 '312X690': 34,
 '330X1147': 66,
 '330X291': 49,
 '330X422': 72,
 '340X109': 47,
 '340X13': 48,
 '340X15': 33,
 '344X19': 37,
 '344X44': 41,
 '360X1006': 43,
 '360X5

In [16]:
# Keep only the features whose score is strictly greater than 10.
# NOTE(review): 10 is an unexplained threshold — consider naming it as a constant.
importance = importance[importance>10]

In [17]:
# Narrow both matrices to the features that passed the importance cut,
# in the same (descending-score) order for train and test.
kept_features = importance.index
important_train = train_data[kept_features]
important_testa = testa_data[kept_features]

In [18]:
# Refit the same model object on the reduced feature set, with the same target and
# the same test-A answers in the validation position as in the first fit.
# NOTE(review): the feature cut was derived from a model validated on these answers,
# so this val-mse is not an unbiased estimate.
xgb.fit(important_train, important_testa, train_Y, answer_testa['values'])

[0]	train-mse:5.41617	val-mse:5.10141
Multiple eval metrics have been passed: 'val-mse' will be used for early stopping.

Will train until val-mse hasn't improved in 100 rounds.
[100]	train-mse:1.36678	val-mse:1.24445
[200]	train-mse:0.360396	val-mse:0.317477
[300]	train-mse:0.103313	val-mse:0.097185
[400]	train-mse:0.034369	val-mse:0.047897
[500]	train-mse:0.014889	val-mse:0.036899
[600]	train-mse:0.009041	val-mse:0.035179
Stopping. Best iteration:
[599]	train-mse:0.009077	val-mse:0.035155

