In [2]:
%%time
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn import model_selection
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA 
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb 
import xgboost as xgb 
from catboost import CatBoostRegressor 

from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

#Read train data file
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

#Training set
print("Training set:")
n_data = len(train_df)
n_features = train_df.shape[1]
print("Number of Records: {}".format(n_data))
print("Number of Features: {}".format(n_features))

#train_df.head(n=10)
#train_df.info()

#Check for Missing Data
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size) )

if(train_df.columns[train_df.isnull().sum() != 0].size):
	print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))


#Test set
print("Test set:")
n_test = len(test_df)
n_features = test_df.shape[1]
print("Number of Records: {}".format(n_test))
print("Number of Features: {}".format(n_features))

Training set:
Number of Records: 4459
Number of Features: 4993
Total Train Features with NaN Values = 0
Test set:
Number of Records: 49342
Number of Features: 4992
CPU times: user 1min 25s, sys: 10.7 s, total: 1min 35s
Wall time: 1min 39s


In [3]:
train_df.head(n=10)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
5,002dbeb22,2800000.0,0.0,0,0.0,0,0,0,0,0,...,12000.0,5600000.0,20000000.0,0,0,0,0,0,0,11000
6,003925ac6,164000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,40000,0,0,0
7,003eb0261,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
8,004b92275,979000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,4000000.0,0,0,0,0,0,0,0
9,0067b4fef,460000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,400000


In [5]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 1. Prepare Data

## 1.1 Check and Remove Constant Features

In [3]:
%%time
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = train_df["target"].values
X_test = test_df.drop(["ID"], axis=1)

colsToRemove = []
for col in X_train.columns:
    if X_train[col].std() == 0:
        colsToRemove.append(col)

X_train.drop(colsToRemove, axis=1, inplace=True)
X_test.drop(colsToRemove, axis=1, inplace=True)

print("Remove '{}' Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)


print("Training set:")
n_data = len(X_train)
n_features = X_train.shape[1]
print("Number of Records: {}".format(n_data))
print("Number of Features: {}".format(n_features))

Remove '256' Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a74

## 1.2 Split Dev/Val Set

In [4]:
X_dev, X_val, y_dev, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

## 1.3 Check and Remove duplicate Columns

In [19]:
'''
colsToRemove = []
dupList = {}

columns = X_train.columns

for i in range(len(columns)-1):
    if columns[i] not in colsToRemove:
        v = X_train[columns[i]].values
        dupCols = []
        for j in range(i+1, len(columns)):
            if np.array_equal(v, X_train[columns[j]].values):
                colsToRemove.append(columns[j])
                dupCols.append(columns[j])
                dupList[columns[i]] = dupCols
    
X_train.drop(colsToRemove, axis=1, inplace=True)

print("Remove '{}' Duplicate Columns\n".format(len(colsToRemove)))
print(dupList)
'''

Remove '4' Duplicate Columns

{'34ceb0081': ['d60ddde1b'], '8d57e2749': ['acc5b709d', 'f333a5f60'], '168b3e5bc': ['f8d75792f'], 'a765da8bc': ['912836770']}


## 1.4 Stardardize features

In [27]:
%%time
from sklearn.preprocessing import StandardScaler
#X_train_scaled = StandardScaler().fit_transform(X_train)
sc_X = StandardScaler()
X_dev_scaled = sc_X.fit_transform(X_dev)
X_val_scaled = sc_X.transform(X_val)
X_test_scaled = sc_X.transform(X_test)


sc_y = StandardScaler() 
y_dev_scaled = sc_y.fit_transform(y_dev.reshape((len(y_dev), 1)))

y_dev_log1p = np.log1p(y_dev)
y_val_log1p = np.log1p(y_val)

CPU times: user 5.03 s, sys: 8.77 s, total: 13.8 s
Wall time: 12 s


# 2. Dimension Reduciton

## 2.1 PCA

In [56]:
pca_x = PCA(0.85).fit(X_train_scaled)
print("{} componets explain 95% of the variation in data".format(pca_x.n_components_))

994 componets explain 95% of the variation in data


In [57]:
pca = PCA(n_components=pca_x.n_components_)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)

## 2.2 TSVD

It is said TSVD doesn't need to performed on standardised data. Why?

In [36]:
from sklearn.decomposition import TruncatedSVD

#randomly pick n_components as 1500 and later find out those components which attribute for 95% of variation in the data
svd_x = TruncatedSVD(n_components=1500, n_iter=20, random_state=42)
svd_x.fit(X_train)

#find out those components which attribute for 95% of variance in data
count = 0
for index, cumsum in enumerate(np.cumsum(svd_x.explained_variance_ratio_)):
    if cumsum <= 0.95:
        count+=1
    else:
        break
        
print(count)

#for index, cumsum in enumerate(np.cumsum(svd_x.explained_variance_ratio_)):
#    print(index, cumsum)

601


In [53]:
#svd = TruncatedSVD(n_components=count, random_state=42)
svd = TruncatedSVD(n_components=500, random_state=42)
svd.fit(X_train)
X_train_svd = svd.transform(X_train)

# 3. Model Selection

## 3.1 RandomForestRegressor

### 3.1.1 Try on Sandaridized Data

In [25]:

model_rf = RandomForestRegressor(random_state = 42)
model_rf.fit(X_dev, y_dev)
print(model_rf)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [30]:
y_pred = model_rf.predict(X_val)
print("R2 score is {}".format(metrics.r2_score(y_val, y_pred)))
print("Explained Variance is {}".format(metrics.explained_variance_score(y_val, y_pred)))

R2 score is 0.19790930023910536
Explained Variance is 0.20134266727983696


### 3.1.2 Try on PCA Data

In [58]:
X_dev_pca, X_val_pca, y_dev_pca, y_val_pca = train_test_split(X_train_pca, y_train, test_size = 0.2, random_state = 42)
model_rf_pca = RandomForestRegressor(random_state = 42) 
model_rf_pca.fit(X_dev_pca, y_dev_pca)
print(model_rf_pca)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [59]:
y_pred_pca = model_rf_pca.predict(X_val_pca)
print("R2 score is {}".format(metrics.r2_score(y_val_pca, y_pred_pca)))
print("Explained Variance is {}".format(metrics.explained_variance_score(y_val_pca, y_pred_pca)))

R2 score is 0.11942628834214941
Explained Variance is 0.12083605623580707


### 3.1.3 Try on TSVD Data

In [54]:
X_dev_svd, X_val_svd, y_dev_svd, y_val_svd = train_test_split(X_train_svd, y_train, test_size = 0.2, random_state = 42)
model_rf_svd = RandomForestRegressor(random_state = 42) 
model_rf_svd.fit(X_dev_svd, y_dev_svd)
print(model_rf_svd)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [55]:
y_pred_svd = model_rf_svd.predict(X_val_svd)
print("R2 score is {}".format(metrics.r2_score(y_val_svd, y_pred_svd)))
print("Explained Variance is {}".format(metrics.explained_variance_score(y_val_svd, y_pred_svd)))

R2 score is 0.13377217032882027
Explained Variance is 0.13855342419530314


## 3.2 CatBoost

In [28]:
%%time
cb_model = CatBoostRegressor(iterations = 500,
                            learning_rate = 0.05,
                            depth = 10,
                            eval_metric = 'RMSE',
                            random_seed = 42,
                            bagging_temperature = 0.2,
                            od_type = 'Iter',
                            metric_period = 50,
                            od_wait = 20)
cb_model.fit(X_dev, y_dev_log1p,
            eval_set=(X_val, y_val_log1p),
            use_best_model=True,
            verbose=True)

0:	learn: 13.8849206	test: 13.8784128	best: 13.8784128 (0)	total: 5.05s	remaining: 42m
50:	learn: 2.0342737	test: 2.0171864	best: 2.0171864 (50)	total: 5m 11s	remaining: 45m 45s
100:	learn: 1.6029678	test: 1.6073706	best: 1.6073706 (100)	total: 10m 15s	remaining: 40m 31s
150:	learn: 1.5250946	test: 1.5581991	best: 1.5581991 (150)	total: 15m 38s	remaining: 36m 9s
200:	learn: 1.4736060	test: 1.5357500	best: 1.5356146 (196)	total: 21m 29s	remaining: 31m 57s
250:	learn: 1.3910444	test: 1.5095707	best: 1.5095707 (250)	total: 28m 9s	remaining: 27m 55s
300:	learn: 1.3377256	test: 1.4954333	best: 1.4953918 (298)	total: 32m 44s	remaining: 21m 38s
350:	learn: 1.3047366	test: 1.4877374	best: 1.4877374 (350)	total: 36m 48s	remaining: 15m 37s
400:	learn: 1.2748954	test: 1.4846130	best: 1.4846130 (400)	total: 40m 41s	remaining: 10m 2s
450:	learn: 1.2454797	test: 1.4808275	best: 1.4807827 (449)	total: 44m 41s	remaining: 4m 51s

bestTest = 1.475326417
bestIteration = 499

Shrink model to first 500 ite

In [29]:
pred_test_cat = np.expm1(cb_model.predict(X_test))

In [30]:
submission = pd.DataFrame()
submission['ID'] = test_df['ID']
submission['target'] = pred_test_cat
submission.to_csv('submission.csv', index=False)

## 3.3 LightGBM

In [32]:
def run_lgb(X_dev, y_dev, X_val, y_val, X_test):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "baggging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed" : 42
    }
    
    lgbdev = lgb.Dataset(X_dev, label=y_dev)
    lgbval = lgb.Dataset(X_val, label=y_val)
    evals_result = {}
    model = lgb.train(params,
                     lgbdev,
                     5000,
                     valid_sets=[lgbdev, lgbval],
                     early_stopping_rounds = 100,
                     verbose_eval=150,
                     evals_result=evals_result)
    pred_test_y = np.expm1(model.predict(X_test, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

# Train LGB
pred_test_lgb, lgb_model, evals_result_lgb = run_lgb(X_dev, y_dev_log1p, X_val, y_val_log1p, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.46025	valid_1's rmse: 1.51496
[300]	training's rmse: 1.2833	valid_1's rmse: 1.44614
[450]	training's rmse: 1.16789	valid_1's rmse: 1.42235
[600]	training's rmse: 1.08366	valid_1's rmse: 1.4136
[750]	training's rmse: 1.01814	valid_1's rmse: 1.4113
[900]	training's rmse: 0.966116	valid_1's rmse: 1.4108
Early stopping, best iteration is:
[908]	training's rmse: 0.963643	valid_1's rmse: 1.41069
LightGBM Training Completed...


In [33]:
submission_lgb = pd.DataFrame()
submission_lgb['ID'] = test_df['ID']
submission_lgb['target'] = pred_test_lgb
submission_lgb.to_csv('submission_lgb.csv', index=False)

## 3.4 Xgboost

In [35]:
def run_xgb(X_dev, y_dev, X_val, y_val, X_test):
    params = {
        'objective' : 'reg:linear',
        'eval_metric' : 'rmse',
        'eta' : 0.001,
        'max_depth' : 10,
        'subsample' : 0.6,
        'colsample_bytree' : 0.6,
        'alpha' : 0.001,
        'random_state' : 42,
        'silent' : True
    }
    
    xgbdev = xgb.DMatrix(X_dev, y_dev)
    xgbval = xgb.DMatrix(X_val, y_val)
    watchlist = [(xgbdev, 'train'), (xgbval, 'valid')]
    xgb_model = xgb.train(params,
                         xgbdev,
                         5000,
                         watchlist,
                         maximize=False,
                         early_stopping_rounds=100,
                         verbose_eval=100)
    
    xgbtest = xgb.DMatrix(X_test)
    pred_test_xgb = np.expm1(xgb_model.predict(xgbtest, ntree_limit=xgb_model.best_ntree_limit))
    return pred_test_xgb, xgb_model

# Train XGB
pred_test_xgb, xgb_model = run_xgb(X_dev, y_dev_log1p, X_val, y_val_log1p, X_test)
print("XGB Training Completed...")
    

[0]	train-rmse:14.0877	valid-rmse:14.0769
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7685	valid-rmse:12.7564
[200]	train-rmse:11.5766	valid-rmse:11.5632
[300]	train-rmse:10.4999	valid-rmse:10.4853
[400]	train-rmse:9.52768	valid-rmse:9.51285
[500]	train-rmse:8.65058	valid-rmse:8.6359
[600]	train-rmse:7.85824	valid-rmse:7.84463
[700]	train-rmse:7.14349	valid-rmse:7.13189
[800]	train-rmse:6.49876	valid-rmse:6.48996
[900]	train-rmse:5.91707	valid-rmse:5.91197
[1000]	train-rmse:5.39237	valid-rmse:5.39154
[1100]	train-rmse:4.91949	valid-rmse:4.92438
[1200]	train-rmse:4.49361	valid-rmse:4.50471
[1300]	train-rmse:4.10984	valid-rmse:4.12821
[1400]	train-rmse:3.76504	valid-rmse:3.79227
[1500]	train-rmse:3.45482	valid-rmse:3.49131
[1600]	train-rmse:3.17624	valid-rmse:3.22403
[1700]	train-rmse:2.92645	valid-rmse:2.98643
[1800]	train-rmse:2.70319	valid-rmse:2.77623
[1900]	train

In [36]:
submission_xgb = pd.DataFrame()
submission_xgb['ID'] = test_df['ID']
submission_xgb['target'] = pred_test_xgb
submission_xgb.to_csv('submission_xgb.csv', index=False)