In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
import xgboost as xgb

%matplotlib inline 
np.set_printoptions(threshold=np.inf)

import warnings
warnings.filterwarnings('ignore')

In [2]:
training_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('test.csv')

In [3]:
print('Traning Dataframe Shape: {}'.format(training_df.shape))
print('Validation Dataframe Shape: {}'.format(validation_df.shape))

Traning Dataframe Shape: (4209, 378)
Validation Dataframe Shape: (4209, 377)


In [4]:
training_df_d = training_df.copy()
validation_df_d = validation_df.copy()

In [5]:
training_df_d = training_df_d.drop(['ID'], axis=1)
validation_df_d = validation_df_d.drop(['ID'], axis=1)

In [6]:
training_df_d.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
validation_df_d.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
print('Traning Duplicate Dataframe Shape: {}'.format(training_df_d.shape))
print('Validation Duplicate Dataframe Shape: {}'.format(validation_df_d.shape))

Traning Duplicate Dataframe Shape: (4209, 377)
Validation Duplicate Dataframe Shape: (4209, 376)


In [11]:
for col_train in training_df_d.columns:
    if training_df_d[col_train].dtype == 'object':        
        lableEncoderTrain = LabelEncoder()
        training_df_d[col_train] = lableEncoderTrain.fit_transform(list(training_df_d[col_train].values))

In [12]:
for col_validate in validation_df_d.columns:
    if validation_df_d[col_validate].dtype == 'object':
        lableEncoderValidate = LabelEncoder()
        validation_df_d[col_validate] = lableEncoderValidate.fit_transform(list(validation_df_d[col_validate].values))

In [13]:
validation_df_d.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0


In [14]:
training_df_d.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,32,23,17,0,3,24,9,14,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,32,21,19,4,3,28,11,14,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,20,24,34,2,3,27,9,23,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,20,21,34,5,3,27,11,4,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,20,23,34,5,3,12,3,13,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#Checking For Null Values
training_df_d.isnull()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
validation_df_d.isnull()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
for train_col in training_df.columns:
    if train_col not in ['ID', 'y']:
        print("{}: {}".format(train_col, training_df[train_col].unique()))

X0: ['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
X1: ['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
X2: ['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
X3: ['a' 'e' 'c' 'f' 'd' 'b' 'g']
X4: ['d' 'b' 'c' 'a']
X5: ['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
X6: ['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']
X8: ['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c']
X10: [0 1]
X11: [0]
X12: [0 1]
X13: [1 0]
X14: [0 1]
X15: [0 1]
X16: [0 1]
X17: [0 1]
X18: [1 0]
X19: [0 1]
X20:

In [22]:
for test_col in validation_df.columns:
    if test_col not in 'ID':
        print("{}: {}".format(test_col, validation_df[test_col].unique()))

X0: ['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' 'h' 'z' 'aj' 'd' 'v' 'ak'
 'ba' 'n' 'j' 's' 'af' 'ax' 'at' 'aq' 'av' 'm' 'k' 'a' 'e' 'ai' 'i' 'ag'
 'b' 'am' 'aw' 'as' 'r' 'ao' 'u' 'l' 'c' 'ad' 'au' 'bc' 'g' 'an' 'ae' 'p'
 'bb']
X1: ['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' 'm' 'z' 'e' 'h' 'w' 'g' 'k'
 'y' 't' 'u' 'd' 'j' 'q' 'n' 'f' 'ab']
X2: ['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' 'aq' 'ag' 'r' 'k' 'aj' 'ay'
 'ao' 'an' 'ac' 'af' 'ax' 'h' 'i' 'f' 'ap' 'p' 'au' 't' 'z' 'y' 'aw' 'd'
 'at' 'g' 'am' 'j' 'x' 'ab' 'w' 'q' 'ah' 'ad' 'al' 'av' 'u']
X3: ['f' 'a' 'c' 'e' 'd' 'g' 'b']
X4: ['d' 'b' 'a' 'c']
X5: ['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac'
 'ad' 'ae' 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
X6: ['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b']
X8: ['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' 'h' 'c' 'k' 'p' 'u' 'd' 'g'
 'b' 'q' 'e' 'l' 'f' 'i' 'x']
X10: [0 1]
X11: [0 1]
X12: [0 1]
X13: [0 1]
X14: [0 1]
X15: [0 1]
X16: [0 1]
X17: [

In [24]:
x = training_df_d.iloc[:, 1:].values
y = training_df_d.iloc[:, 0].values

In [25]:
x.shape

(4209, 376)

In [26]:
y.shape

(4209,)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

In [28]:
print("X_train Shape: {} X_test Shape {}".format(X_train.shape, X_test.shape))
print("y_train Shape: {} y_test Shape {}".format(y_train.shape, y_test.shape))

X_train Shape: (3367, 376) X_test Shape (842, 376)
y_train Shape: (3367,) y_test Shape (842,)


In [29]:
stdScaller = StandardScaler()

In [None]:
X_train_Scaled = stdScaller.fit_transform(X_train)

In [None]:
X_test_Scaled = stdScaller.transform(X_test)

In [30]:
n_comp=15
pca = PCA(n_components= n_comp)

In [31]:
X_train_pca_Us = pca.fit_transform(X_train)
X_test_pca_Us = pca.transform(X_test)

In [32]:
model = xgb.XGBRegressor()

In [33]:
model.fit(X_train_pca_Us, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [34]:
y_pred = model.predict(X_test_pca_Us)

In [37]:
for yp, y in zip(y_pred, y_test):
    print("Y^: {} Y: {}".format(yp, y))

Y^: 106.21578979492188 Y: 96.49
Y^: 96.5357666015625 Y: 96.93
Y^: 105.2703857421875 Y: 114.22
Y^: 96.09687042236328 Y: 88.1
Y^: 97.06352233886719 Y: 92.63
Y^: 96.65774536132812 Y: 93.83
Y^: 103.33512878417969 Y: 109.79
Y^: 103.6824722290039 Y: 89.03
Y^: 110.51071166992188 Y: 109.38
Y^: 98.75154113769531 Y: 103.9
Y^: 94.41645812988281 Y: 93.59
Y^: 93.14576721191406 Y: 91.03
Y^: 97.59732055664062 Y: 93.5
Y^: 110.47480010986328 Y: 110.7
Y^: 100.50263214111328 Y: 108.92
Y^: 78.12870025634766 Y: 75.88
Y^: 99.52096557617188 Y: 89.34
Y^: 99.38542938232422 Y: 118.44
Y^: 108.12872314453125 Y: 114.88
Y^: 101.17237854003906 Y: 92.21
Y^: 97.55756378173828 Y: 89.75
Y^: 95.09654235839844 Y: 108.01
Y^: 110.23529815673828 Y: 108.42
Y^: 101.79460144042969 Y: 100.21
Y^: 102.39891815185547 Y: 109.09
Y^: 96.17451477050781 Y: 112.97
Y^: 100.32978820800781 Y: 95.57
Y^: 93.09314727783203 Y: 84.76
Y^: 97.28694152832031 Y: 92.34
Y^: 102.37555694580078 Y: 101.89
Y^: 96.89611053466797 Y: 92.83
Y^: 112.2037811279

In [38]:
X_val = pca.transform(validation_df_d.values)

In [39]:
X_val[0]

array([15.05770702, 12.34553945, 16.8495866 , -0.26885661, 10.57794974,
        6.7997893 , -2.04217781,  2.31077483,  1.26556363, -1.53174262,
        1.48351782,  3.38563688, -0.79337205, -0.40559774, -0.60113543])

In [43]:
#model.predict(X_val)
print(model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)


In [45]:
model.predict(X_val)

array([ 77.187775,  95.44889 ,  84.82961 ,  76.45038 , 114.58659 ,
        90.235954, 112.80874 , 102.10282 , 117.38312 ,  98.36348 ,
       117.47454 ,  99.242096,  97.949394, 101.5381  ,  98.88054 ,
        97.949394, 117.463165,  97.949394,  94.74356 ,  98.86562 ,
        97.949394,  97.949394,  94.85569 ,  97.68742 ,  92.97952 ,
       117.39405 ,  98.795555,  97.0913  ,  93.57566 ,  85.36801 ,
       106.00217 , 103.86159 , 106.35214 ,  98.97386 , 102.186646,
       103.24299 ,  98.93925 ,  98.11998 ,  95.02345 ,  99.37936 ,
        99.96051 ,  92.67098 , 101.12376 ,  96.6955  , 116.82655 ,
        92.95331 ,  98.11998 ,  98.137856, 105.41012 , 110.360596,
        97.68742 , 104.03414 , 100.88127 , 100.3155  , 102.07538 ,
        95.893745, 110.41921 ,  91.62593 , 102.5126  , 101.46703 ,
       103.80359 , 104.03414 ,  95.5895  , 108.48419 ,  98.121   ,
       105.27674 , 103.80674 ,  91.38055 , 111.24271 , 104.03414 ,
        94.87477 , 109.220245,  89.88881 , 103.15463 , 108.031