In [1]:
# Create an ML algorithm that can accurately predict the time a car will spend on the test bench 
# based on the vehicle configuration
# Credentials - kasham1991@gmail.com / karan sharma


# Agenda
# 1. If for any column(s), the variance is equal to zero, then you need to remove those variable(s)
# 2. Check for null and unique values for test and train sets
# 3. Apply label encoder for categorical variables
# 4. Perform dimensaionlity reduction with PCA
# 5. Predict the test_df values using xgboost

In [2]:
# Importing the required libraries
# Loading the train/test data
# The lowercase alphabets are categorical variables
import numpy as np
import pandas as pd

train = pd.read_csv('C://Datasets//MERCtrain.csv')
train.head()
# train.info()
# print('Size of training set')
# train.shape

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separating y column as this is for pediction output
y_train = train['y'].values
y_train

array([130.81,  88.53,  76.26, ..., 109.22,  87.48, 110.85])

In [4]:
# A lot of columns that have an X 
# Let's check for the same 
# 376 features with X
colums_x = [c for c in train.columns if 'X' in c]
# colums_x
print(len(colums_x))
print(train[colums_x].dtypes.value_counts())

376
int64     368
object      8
dtype: int64


In [5]:
# Looking at the test datset for simiilar features
test = pd.read_csv('C://Datasets//MERCtest.csv')
test.head()
# train.info()
# print('Size of training set')
# train.shape

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# Creating the final dataset
# Removing unwanted columns (ID); y has been removed earlier
final_column = list(set(train.columns) - set(['ID', 'y']))

x_train = train[final_column]
# x_train
x_test = test[final_column]
# x_test

In [7]:
# Searching for null values
# Creating a function for the same
# There are no missin values
def detect(df):
    if df.isnull().any().any():
        print("Yes")
    else:
        print("No")

detect(x_train)
detect(x_test)

No
No


In [8]:
# Removal of columns with a variance of 0
# Column with a variance of 1 is irrelevant so we drop it

for column in final_column:
    check = len(np.unique(x_train[column]))
    if check == 1:
        x_train.drop(column, axis = 1) 
        x_test.drop(column, axis = 1)
    if check > 2: # Column is categorical; hence mapping to ordinal measure of value
        mapit = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapit)
        x_test[column] = x_test[column].apply(mapit)

x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapit)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(mapit)


Unnamed: 0,X253,X348,X250,X142,X84,X146,X338,X96,X373,X383,...,X158,X332,X336,X164,X366,X186,X178,X362,X23,X75
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,1,1,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,1,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [9]:
# Performing dimensionality reduction with principal components analysis
from sklearn.decomposition import PCA
n_comp = 12
pca = PCA(n_components = n_comp, random_state = 42)
pca_result_train = pca.fit_transform(x_train)
pca_result_test = pca.transform(x_test)
# print(pca_result_train)
# print(pca_result_test)

In [10]:
# ML Modeling with XGboost
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Splitting the data by 80/20
x_train, x_valid, y_train, y_valid = train_test_split(pca_result_train, y_train, test_size = 0.2, random_state = 42)

In [11]:
# Building the final feature set
f_train = xgb.DMatrix(x_train, label = y_train)
f_valid = xgb.DMatrix(x_valid, label = y_valid)
f_test = xgb.DMatrix(x_test)
f_test = xgb.DMatrix(pca_result_test)

In [12]:
# Setting the parameters for XGB
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

In [13]:
# Predicting the score
# Creating a function for the same

def scorer(m, w):
    labels = w.get_label()
    return 'r2', r2_score(labels, m)

final_set = [(f_train, 'train'), (f_valid, 'valid')]

P = xgb.train(params, f_train, 1000, final_set, early_stopping_rounds=50, feval=scorer, maximize=True, verbose_eval=10)

[0]	train-rmse:98.99695	valid-rmse:98.88884	train-r2:-59.49733	valid-r2:-61.82690
Multiple eval metrics have been passed: 'valid-r2' will be used for early stopping.

Will train until valid-r2 hasn't improved in 50 rounds.
[10]	train-rmse:81.14410	valid-rmse:81.07848	train-r2:-39.64492	valid-r2:-41.23399
[20]	train-rmse:66.59753	valid-rmse:66.55611	train-r2:-26.37844	valid-r2:-27.45948
[30]	train-rmse:54.75785	valid-rmse:54.73342	train-r2:-17.50910	valid-r2:-18.24670
[40]	train-rmse:45.14008	valid-rmse:45.13794	train-r2:-11.57816	valid-r2:-12.08984
[50]	train-rmse:37.34669	valid-rmse:37.35502	train-r2:-7.60987	valid-r2:-7.96497
[60]	train-rmse:31.04961	valid-rmse:31.08432	train-r2:-4.95120	valid-r2:-5.20775
[70]	train-rmse:25.98429	valid-rmse:26.03124	train-r2:-3.16787	valid-r2:-3.35352
[80]	train-rmse:21.93689	valid-rmse:21.99638	train-r2:-1.97059	valid-r2:-2.10852
[90]	train-rmse:18.73373	valid-rmse:18.81611	train-r2:-1.16641	valid-r2:-1.27463
[100]	train-rmse:16.22580	valid-rmse:16.

In [14]:
# Predicting on test set
p_test = P.predict(f_test)
p_test

array([ 79.28774,  96.29256,  81.22566, ...,  98.70664, 107.1244 ,
        95.05699], dtype=float32)

In [15]:
Predicted_Data = pd.DataFrame()
Predicted_Data['y'] = p_test
Predicted_Data.head()

Unnamed: 0,y
0,79.287743
1,96.292557
2,81.225662
3,77.318939
4,109.973053


In [16]:
# Thank You :) 