In [16]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn import model_selection
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA 
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb 
import xgboost as xgb 

from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

# Load the training data from the CSV that sits next to the notebook.
train_df = pd.read_csv('./train.csv')

# Summarise the size of the loaded frame. Note that the "feature" count is the
# raw column count of the CSV, i.e. it includes every column that was read.
n_data, n_features = train_df.shape
print("Training set:")
print("Number of Records: {}".format(n_data))
print("Number of Features: {}".format(n_features))

# Missing-data check: select the columns holding at least one NaN once and
# reuse that selection for both the count and the optional listing.
nan_columns = train_df.columns[train_df.isnull().sum() != 0]
print("Total Train Features with NaN Values = " + str(nan_columns.size))

if nan_columns.size:
    print("Features with NaN => {}".format(list(nan_columns)))



Training set:
Number of Records: 4459
Number of Features: 4993
Total Train Features with NaN Values = 0


In [17]:
# Preview the first ten rows of the training frame.
train_df.head(10)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
5,002dbeb22,2800000.0,0.0,0,0.0,0,0,0,0,0,...,12000.0,5600000.0,20000000.0,0,0,0,0,0,0,11000
6,003925ac6,164000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,40000,0,0,0
7,003eb0261,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
8,004b92275,979000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,4000000.0,0,0,0,0,0,0,0
9,0067b4fef,460000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,400000


# 1. Prepare Data

## 1.1 Check and Remove Constant Features

In [18]:
# Split the frame into predictors (everything except the identifier and the
# label) and the log1p-transformed target.
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

# A column is constant when it holds exactly one distinct value. Counting
# distinct values is exact, whereas testing `std() == 0` relies on float
# equality: for a constant non-zero float column the computed mean can carry
# rounding error, which makes the standard deviation tiny but not exactly zero.
constant_mask = X_train.nunique() == 1
colsToRemove = X_train.columns[constant_mask].tolist()

# Rebind instead of mutating in place so re-running the cell is harmless.
X_train = X_train.drop(colsToRemove, axis=1)

print("Remove '{}' Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)


Remove '256' Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a74

## 1.2 Check and Remove duplicate Columns

In [19]:
# Find columns whose values exactly repeat an earlier column. The first
# occurrence is kept; every later copy is recorded in `colsToRemove` and in
# `dupList` (kept column -> list of its duplicates), then dropped.
colsToRemove = []
dupList = {}

columns = X_train.columns

# Extract each column's ndarray once, by position. The pairwise scan below
# would otherwise build a fresh Series for every one of its comparisons.
col_values = [X_train[col].values for col in columns]

# Set mirror of `colsToRemove` for constant-time membership tests.
removed = set()

for i, keeper in enumerate(columns[:-1]):
    # A column already flagged as a duplicate never acts as a reference.
    if keeper in removed:
        continue

    # Columns flagged earlier equal a different kept column, so they cannot
    # also equal this one; skipping them avoids pointless comparisons without
    # changing the result.
    dupCols = [
        columns[j]
        for j in range(i + 1, len(columns))
        if columns[j] not in removed
        and np.array_equal(col_values[i], col_values[j])
    ]

    if dupCols:
        dupList[keeper] = dupCols
        colsToRemove.extend(dupCols)
        removed.update(dupCols)

X_train.drop(colsToRemove, axis=1, inplace=True)

print("Remove '{}' Duplicate Columns\n".format(len(colsToRemove)))
print(dupList)

Remove '4' Duplicate Columns

{'34ceb0081': ['d60ddde1b'], '8d57e2749': ['acc5b709d', 'f333a5f60'], '168b3e5bc': ['f8d75792f'], 'a765da8bc': ['912836770']}


## 1.3 Standardize features

In [23]:
from sklearn.preprocessing import StandardScaler

# Standardize every remaining feature column. The scaler is fitted on the
# full X_train and immediately applied to it; the fitted scaler object itself
# is not kept.
X_train_scaled = StandardScaler().fit(X_train).transform(X_train)

# 2. Dimension Reduction

## 2.1 PCA

In [32]:
# Passing a fraction as n_components asks PCA to keep as many components as
# are needed to reach that share of explained variance; the resulting count
# is exposed as `n_components_` after fitting.
pca_x = PCA(n_components=0.95)
pca_x.fit(X_train_scaled)

n_pca_components = pca_x.n_components_
print("{} componets explain 95% of the variation in data".format(n_pca_components))

1528 componets explain 95%% of the variation in data


In [33]:
# Project the standardized features onto the components that explain 95% of
# the variance. The component count is read from the already-fitted `pca_x`
# instead of being copied by hand from its printed output, so it stays correct
# if the data or the preprocessing changes.
# random_state is pinned because, with an integer n_components, PCA's default
# solver selection may pick a randomized SVD, which would otherwise make the
# projection differ from run to run.
pca = PCA(n_components=int(pca_x.n_components_), random_state=42)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)

## 2.2 TSVD

It is said that TSVD doesn't need to be performed on standardised data. Why?

In [36]:
from sklearn.decomposition import TruncatedSVD

# Fit a TSVD with an arbitrarily generous 1500 components on the unscaled
# features; the cumulative explained-variance ratio is then used to see how
# many of them are needed for 95% of the variance.
svd_x = TruncatedSVD(n_components=1500, n_iter=20, random_state=42)
svd_x.fit(X_train)

# `count` is the number of leading components whose cumulative ratio is still
# <= 0.95, i.e. the position of the first component that breaks that
# condition (or all components if none does).
cumulative_ratio = np.cumsum(svd_x.explained_variance_ratio_)
first_break = np.flatnonzero(~(cumulative_ratio <= 0.95))
count = int(first_break[0]) if first_break.size else len(cumulative_ratio)

print(count)

601


In [53]:
# Reduce the unscaled features to 500 TSVD components. The 500 is hard-coded;
# using n_components=count (the value computed in the previous TSVD cell) is
# the alternative that was left disabled.
svd = TruncatedSVD(n_components=500, random_state=42).fit(X_train)
X_train_svd = svd.transform(X_train)

# 3. Model Selection

## 3.1 RandomForestRegressor

### 3.1.1 Try on Standardized Data

In [25]:
# Hold out 20% of the standardized data for validation (fixed seed), then fit
# a random forest with default hyper-parameters on the remaining 80%.
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)
model_rf = RandomForestRegressor(random_state=42).fit(X_dev, y_dev)
print(model_rf)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [30]:
# Score the standardized-data forest on its validation split.
y_pred = model_rf.predict(X_val)

r2_scaled = metrics.r2_score(y_val, y_pred)
explained_var_scaled = metrics.explained_variance_score(y_val, y_pred)

print("R2 score is {}".format(r2_scaled))
print("Explained Variance is {}".format(explained_var_scaled))

R2 score is 0.19790930023910536
Explained Variance is 0.20134266727983696


### 3.1.2 Try on PCA Data

In [34]:
# Same 80/20 split and default random forest as before, this time on the
# PCA-projected features.
X_dev_pca, X_val_pca, y_dev_pca, y_val_pca = train_test_split(
    X_train_pca,
    y_train,
    test_size=0.2,
    random_state=42,
)
model_rf_pca = RandomForestRegressor(random_state=42).fit(X_dev_pca, y_dev_pca)
print(model_rf_pca)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [29]:
# Score the PCA-feature forest on its validation split.
y_pred_pca = model_rf_pca.predict(X_val_pca)

r2_pca = metrics.r2_score(y_val_pca, y_pred_pca)
explained_var_pca = metrics.explained_variance_score(y_val_pca, y_pred_pca)

print("R2 score is {}".format(r2_pca))
print("Explained Variance is {}".format(explained_var_pca))

R2 score is 0.16939224777182715
Explained Variance is 0.1724516812903465


### 3.1.3 Try on TSVD Data

In [54]:
# Same 80/20 split and default random forest as before, this time on the
# TSVD-projected features.
X_dev_svd, X_val_svd, y_dev_svd, y_val_svd = train_test_split(
    X_train_svd,
    y_train,
    test_size=0.2,
    random_state=42,
)
model_rf_svd = RandomForestRegressor(random_state=42).fit(X_dev_svd, y_dev_svd)
print(model_rf_svd)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [55]:
# Score the TSVD-feature forest on its validation split.
y_pred_svd = model_rf_svd.predict(X_val_svd)

r2_svd = metrics.r2_score(y_val_svd, y_pred_svd)
explained_var_svd = metrics.explained_variance_score(y_val_svd, y_pred_svd)

print("R2 score is {}".format(r2_svd))
print("Explained Variance is {}".format(explained_var_svd))

R2 score is 0.13377217032882027
Explained Variance is 0.13855342419530314
