## Mercedes Benz Greener Manufacturing

Since the first automobile, the Benz Patent Motor Car in 1886, Mercedes-Benz has stood for important automotive innovations. These include the passenger safety cell with a crumple zone, the airbag, and intelligent assistance systems. Mercedes-Benz applies for nearly 2000 patents per year, making the brand the European leader among premium carmakers. Mercedes-Benz is the leader in the premium car industry. With a huge selection of features and options, customers can choose the customized Mercedes-Benz of their dreams.
To ensure the safety and reliability of every unique car configuration before they hit the road, the company’s engineers have developed a robust testing system. As one of the world’s biggest manufacturers of premium cars, safety and efficiency are paramount on Mercedes-Benz’s production lines. However, optimizing the speed of their testing system for many possible feature combinations is complex and time-consuming without a powerful algorithmic approach.


In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

In [2]:
#load the datasets
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')

### Exploratory Data Analysis

In [3]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [7]:
#Determining the int64 and object column names
l_obj=[]
l_int=[]
for i in df_train.drop(['y'],axis=1).columns:
    if df_train[i].dtype=='O':
        l_obj.append(i)
    else:
        l_int.append(i)

In [8]:
print(l_obj)

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [9]:
print(l_int)

['ID', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107', 'X108', 'X109', 'X110', 'X111', 'X112', 'X113', 'X114', 'X115', 'X116', 'X117', 'X118', 'X119', 'X120', 'X122', 'X123', 'X124', 'X125', 'X126', 'X127', 'X128', 'X129', 'X130', 'X131', 'X132', 'X133', 'X134', 'X135', 'X136', 'X137', 'X138', 'X139', 'X140', 'X141', 'X142', 'X143', 'X144', 'X145', 'X146', 'X147', '

In [10]:
#columns that have zero variance
l_rem=[]
for i in l_int:
    if df_train[i].var()==0:
        l_rem.append(i)
print(l_rem)

['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [11]:
#removing the columns with zero variance
df_train=df_train.drop(l_rem,axis=1)
df_test=df_test.drop(l_rem,axis=1)

In [12]:
print(df_train.shape,df_test.shape)

(4209, 366) (4209, 365)


In [13]:
#checking for null values 
l_tr_null=[]
for j in df_train.columns:
    if df_train[j].isnull().any()==True:
        l_tr_null.append(j)
print(l_tr_null)

[]


In [14]:
l_ts_null=[]
for j in df_test.columns:
    if df_test[j].isnull().any()==True:
        l_ts_null.append(j)
print(l_ts_null)

[]


In [15]:
#printing the unique values
for i in l_obj:
    print(i,':\n',df_train[i].unique(),'\n')

X0 :
 ['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab'] 

X1 :
 ['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab'] 

X2 :
 ['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar'] 

X3 :
 ['a' 'e' 'c' 'f' 'd' 'b' 'g'] 

X4 :
 ['d' 'b' 'c' 'a'] 

X5 :
 ['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa'] 

X6 :
 ['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b'] 

X8 :
 ['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c'] 



In [16]:
for i in l_int:
    if (i not in l_rem) & (i not in ['ID']):
        print(i,':\n',df_train[i].unique(),'\n')

X10 :
 [0 1] 

X12 :
 [0 1] 

X13 :
 [1 0] 

X14 :
 [0 1] 

X15 :
 [0 1] 

X16 :
 [0 1] 

X17 :
 [0 1] 

X18 :
 [1 0] 

X19 :
 [0 1] 

X20 :
 [0 1] 

X21 :
 [1 0] 

X22 :
 [0 1] 

X23 :
 [0 1] 

X24 :
 [0 1] 

X26 :
 [0 1] 

X27 :
 [0 1] 

X28 :
 [0 1] 

X29 :
 [0 1] 

X30 :
 [0 1] 

X31 :
 [1 0] 

X32 :
 [0 1] 

X33 :
 [0 1] 

X34 :
 [0 1] 

X35 :
 [1 0] 

X36 :
 [0 1] 

X37 :
 [1 0] 

X38 :
 [0 1] 

X39 :
 [0 1] 

X40 :
 [0 1] 

X41 :
 [0 1] 

X42 :
 [0 1] 

X43 :
 [0 1] 

X44 :
 [0 1] 

X45 :
 [0 1] 

X46 :
 [1 0] 

X47 :
 [0 1] 

X48 :
 [0 1] 

X49 :
 [0 1] 

X50 :
 [0 1] 

X51 :
 [0 1] 

X52 :
 [0 1] 

X53 :
 [0 1] 

X54 :
 [0 1] 

X55 :
 [0 1] 

X56 :
 [0 1] 

X57 :
 [0 1] 

X58 :
 [1 0] 

X59 :
 [0 1] 

X60 :
 [0 1] 

X61 :
 [0 1] 

X62 :
 [0 1] 

X63 :
 [0 1] 

X64 :
 [0 1] 

X65 :
 [0 1] 

X66 :
 [0 1] 

X67 :
 [0 1] 

X68 :
 [1 0] 

X69 :
 [0 1] 

X70 :
 [1 0] 

X71 :
 [0 1] 

X73 :
 [0 1] 

X74 :
 [1 0] 

X75 :
 [0 1] 

X76 :
 [0 1] 

X77 :
 [0 1] 

X78 :
 [0 1] 

X79 :
 [0 

In [17]:
for i in l_obj:
    print(i,':\n',df_test[i].unique(),'\n')

X0 :
 ['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' 'h' 'z' 'aj' 'd' 'v' 'ak'
 'ba' 'n' 'j' 's' 'af' 'ax' 'at' 'aq' 'av' 'm' 'k' 'a' 'e' 'ai' 'i' 'ag'
 'b' 'am' 'aw' 'as' 'r' 'ao' 'u' 'l' 'c' 'ad' 'au' 'bc' 'g' 'an' 'ae' 'p'
 'bb'] 

X1 :
 ['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' 'm' 'z' 'e' 'h' 'w' 'g' 'k'
 'y' 't' 'u' 'd' 'j' 'q' 'n' 'f' 'ab'] 

X2 :
 ['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' 'aq' 'ag' 'r' 'k' 'aj' 'ay'
 'ao' 'an' 'ac' 'af' 'ax' 'h' 'i' 'f' 'ap' 'p' 'au' 't' 'z' 'y' 'aw' 'd'
 'at' 'g' 'am' 'j' 'x' 'ab' 'w' 'q' 'ah' 'ad' 'al' 'av' 'u'] 

X3 :
 ['f' 'a' 'c' 'e' 'd' 'g' 'b'] 

X4 :
 ['d' 'b' 'a' 'c'] 

X5 :
 ['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac'
 'ad' 'ae' 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa'] 

X6 :
 ['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b'] 

X8 :
 ['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' 'h' 'c' 'k' 'p' 'u' 'd' 'g'
 'b' 'q' 'e' 'l' 'f' 'i' 'x'] 



In [18]:
for i in l_int:
    if (i not in l_rem) & (i not in ['ID']):
        print(i,':\n',df_test[i].unique(),'\n')


X10 :
 [0 1] 

X12 :
 [0 1] 

X13 :
 [0 1] 

X14 :
 [0 1] 

X15 :
 [0 1] 

X16 :
 [0 1] 

X17 :
 [0 1] 

X18 :
 [0 1] 

X19 :
 [0 1] 

X20 :
 [0 1] 

X21 :
 [0 1] 

X22 :
 [0 1] 

X23 :
 [0 1] 

X24 :
 [0 1] 

X26 :
 [0 1] 

X27 :
 [1 0] 

X28 :
 [1 0] 

X29 :
 [1 0] 

X30 :
 [0 1] 

X31 :
 [1 0] 

X32 :
 [0 1] 

X33 :
 [0 1] 

X34 :
 [0 1] 

X35 :
 [1 0] 

X36 :
 [0 1] 

X37 :
 [1 0] 

X38 :
 [0 1] 

X39 :
 [0 1] 

X40 :
 [0 1] 

X41 :
 [0 1] 

X42 :
 [0 1] 

X43 :
 [1 0] 

X44 :
 [0 1] 

X45 :
 [0 1] 

X46 :
 [1 0] 

X47 :
 [0 1] 

X48 :
 [0 1] 

X49 :
 [0 1] 

X50 :
 [0 1] 

X51 :
 [0 1] 

X52 :
 [0 1] 

X53 :
 [0 1] 

X54 :
 [1 0] 

X55 :
 [0 1] 

X56 :
 [0 1] 

X57 :
 [0 1] 

X58 :
 [0 1] 

X59 :
 [0 1] 

X60 :
 [0 1] 

X61 :
 [1 0] 

X62 :
 [0 1] 

X63 :
 [0 1] 

X64 :
 [0 1] 

X65 :
 [0 1] 

X66 :
 [0 1] 

X67 :
 [0 1] 

X68 :
 [0 1] 

X69 :
 [0 1] 

X70 :
 [1 0] 

X71 :
 [0 1] 

X73 :
 [0 1] 

X74 :
 [1 0] 

X75 :
 [0 1] 

X76 :
 [1 0] 

X77 :
 [0 1] 

X78 :
 [0 1] 

X79 :
 [0 

### Preprocessing

In [19]:
#Applying label encoder on the categorical variables
le=LabelEncoder()
for i in l_obj:
    df_train[i]=le.fit_transform(df_train[i])
    df_test[i]=le.fit_transform(df_test[i])

In [20]:
x_tr=df_train.drop(['y','ID'],axis=1).values
y_tr=df_train[['y']]
id_val=df_test['ID'].values
x_ts=df_test.drop(['ID'],axis=1).values
print(x_tr.shape,y_tr.shape,x_ts.shape)

(4209, 364) (4209, 1) (4209, 364)


### Model Training and Prediction

In [21]:
#Train Test Split and making matrices for xgb
xtrain,xvalid,ytrain,yvalid=train_test_split(x_tr,y_tr,test_size=0.2,random_state=42)
d_train = xgb.DMatrix(xtrain, label=ytrain)
d_valid = xgb.DMatrix(xvalid, label=yvalid)
d_test = xgb.DMatrix(x_ts)

In [22]:
#Defining Parameters for XGBoost
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

In [23]:
#Training the model
def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:98.98858	valid-rmse:98.87984	train-r2:-59.48709	valid-r2:-61.81546
Multiple eval metrics have been passed: 'valid-r2' will be used for early stopping.

Will train until valid-r2 hasn't improved in 50 rounds.
[10]	train-rmse:81.05274	valid-rmse:80.98148	train-r2:-39.55346	valid-r2:-41.13300
[20]	train-rmse:66.42594	valid-rmse:66.38055	train-r2:-26.23755	valid-r2:-27.30953
[30]	train-rmse:54.51056	valid-rmse:54.48060	train-r2:-17.34230	valid-r2:-18.06930
[40]	train-rmse:44.81946	valid-rmse:44.79547	train-r2:-11.40012	valid-r2:-11.89197
[50]	train-rmse:36.95561	valid-rmse:36.92964	train-r2:-7.43050	valid-r2:-7.76196
[60]	train-rmse:30.59481	valid-rmse:30.56276	train-r2:-4.77814	valid-r2:-5.00118
[70]	train-rmse:25.47331	valid-rmse:25.42478	train-r2:-3.00556	valid-r2:-3.15303
[80]	train-rmse:21.37585	valid-rmse:21.30364	train-r2:-1.82059	valid-r2:-1.91581
[90]	train-rmse:18.12391	valid-rmse:18.02556	train-r2:-1.02766	valid-r2:-1.08751
[100]	train-rmse:15.57007	valid-rmse:15.

In [24]:
#Predicting Values
pred_test = clf.predict(d_test)
df_pred = pd.DataFrame()
df_pred['ID'] = id_val
df_pred['y'] = pred_test
df_pred.to_csv('xgb_merc.csv', index=False)
df_pred.head()

Unnamed: 0,ID,y
0,1,85.699326
1,2,102.601212
2,3,85.202553
3,4,76.192261
4,5,110.485764


### Model Training and Prediction with Dimensionality Reduction

In [25]:
# 30 features are selected for prediction
pca_ins=PCA(n_components=30)

In [26]:
#Apply PCA to the dataset
pca_ins.fit(xtrain)
x_tr_t=pca_ins.transform(xtrain)
x_va_t=pca_ins.transform(xvalid)

In [27]:
x_ts_t=pca_ins.transform(x_ts)

In [28]:
#Define the matrices and parameters
d_train = xgb.DMatrix(x_tr_t, label=ytrain)
d_valid = xgb.DMatrix(x_va_t, label=yvalid)
d_test = xgb.DMatrix(x_ts_t)
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

In [29]:
#Train the model again
def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:98.99545	valid-rmse:98.88363	train-r2:-59.49546	valid-r2:-61.82029
Multiple eval metrics have been passed: 'valid-r2' will be used for early stopping.

Will train until valid-r2 hasn't improved in 50 rounds.
[10]	train-rmse:81.12351	valid-rmse:81.00957	train-r2:-39.62432	valid-r2:-41.16224
[20]	train-rmse:66.55009	valid-rmse:66.43793	train-r2:-26.33946	valid-r2:-27.35850
[30]	train-rmse:54.67293	valid-rmse:54.58706	train-r2:-17.45173	valid-r2:-18.14391
[40]	train-rmse:45.01286	valid-rmse:44.95891	train-r2:-11.50736	valid-r2:-11.98622
[50]	train-rmse:37.17142	valid-rmse:37.13473	train-r2:-7.52925	valid-r2:-7.85955
[60]	train-rmse:30.81703	valid-rmse:30.80856	train-r2:-4.86238	valid-r2:-5.09809
[70]	train-rmse:25.69783	valid-rmse:25.70814	train-r2:-3.07648	valid-r2:-3.24612
[80]	train-rmse:21.59465	valid-rmse:21.63029	train-r2:-1.87862	valid-r2:-2.00591
[90]	train-rmse:18.33153	valid-rmse:18.39514	train-r2:-1.07439	valid-r2:-1.17399
[100]	train-rmse:15.75644	valid-rmse:15.

In [30]:
#Predicting values based on the reduced dimensions
pred_test_pca = clf.predict(d_test)
df_pred_pca = pd.DataFrame()
df_pred_pca['ID'] = id_val
df_pred_pca['y'] = pred_test_pca
df_pred_pca.to_csv('xgb_merc_pca.csv', index=False)
df_pred_pca.head()

Unnamed: 0,ID,y
0,1,79.10067
1,2,94.513527
2,3,79.337204
3,4,77.359955
4,5,108.826019
