In [26]:
# Base python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#sklearn packages - model selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

#sklearn packages - metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Q1

In [3]:
df = pd.read_csv('data-1.csv')

df.head()

Unnamed: 0,No.,C,Si,Mn,P,S,Cr,Ni,Mo,N,...,Al,Ti,V,B,Th,I,U,Ve,Strain,TCL
0,1,0.01,0.48,1.61,0.024,0.019,17.33,10.62,2.09,0.06,...,0.02,0.0,0.0,0.0,3.18,100,12.0,4.23,4.0,1.5
1,2,0.011,0.58,1.06,0.032,0.013,16.95,10.5,2.15,0.078,...,0.02,0.0,0.0,0.0,3.18,100,12.0,4.23,4.0,1.1
2,3,0.01,0.46,1.09,0.021,0.001,17.4,11.5,2.88,0.105,...,0.02,0.0,0.0,0.0,3.18,100,12.0,4.23,4.0,0.9
3,4,0.01,0.51,1.6,0.021,0.001,17.55,12.95,2.76,0.113,...,0.02,0.0,0.0,0.0,3.18,100,12.0,4.23,4.0,3.7
4,5,0.012,0.46,1.54,0.027,0.023,16.28,10.15,2.06,0.098,...,0.02,0.0,0.0,0.0,3.18,100,12.0,4.23,4.0,1.5


In [4]:
# break df into x and y numpy arrays
df_y = df.TCL.values
df_x = df.drop(['TCL', 'No.'], axis = 1).values


# are the two arrays the correct shape?
print(df_x.shape, df_y.shape)

# Any missing values?
print('The number of missing x values is:', np.count_nonzero(np.isnan(df_x)))
print('The number of missing y values is:', np.count_nonzero(np.isnan(df_y)))

(487, 21) (487,)
The number of missing x values is: 0
The number of missing y values is: 0


In [5]:
#Normalize feature data

normX = (df_x - df_x.mean(axis = 0)) / df_x.std(axis = 0)


#check our work. Variance should be one and mean should be ~0
print(normX.mean(axis = 0), normX.var(axis = 0))

[-2.91803998e-17  5.83607997e-17  5.10656997e-17 -8.75411995e-17
 -1.60492199e-16 -8.75411995e-16 -2.91803998e-16 -6.56558996e-17
 -1.45901999e-17  7.29509996e-17  2.91803998e-17 -1.16721599e-16
  7.29509996e-17 -2.91803998e-17  5.83607997e-17  1.16721599e-16
 -8.75411995e-17  1.02131399e-16 -3.79345198e-16 -2.62623598e-16
  2.11557899e-16] [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [6]:
# Train Test split

X_train, X_test, y_train, y_test = train_test_split(normX, df_y, test_size = 0.2, random_state = 1)

# Q2

In [18]:
#get transformed x matrix
x_train_reduced = PCA(n_components= 2).fit_transform(X_train)
x_test_reduced = PCA(n_components= 2).fit_transform(X_test)

#define ridge regression and fit data
rdg = Ridge(alpha= 0.1).fit(x_train_reduced, y_train)

#get y predictions
y_train_pred = rdg.predict(x_train_reduced)
y_test_pred = rdg.predict(x_test_reduced)

 #MAE values
print('The train MAE at alpha of  is: %0.3f' % mean_absolute_error(y_train, y_train_pred))
print('The test MAE is: %0.3f' % mean_absolute_error(y_test, y_test_pred))

#RMSE values
#MAE values
print('The train RMSE is: %0.3f' % mean_squared_error(y_train, y_train_pred))
print('The test RMSE is: %0.3f' % mean_squared_error(y_test, y_test_pred))

#R^2 values
print('The train R^2 is: %0.2f' % r2_score(y_train, y_train_pred))
print('The test R^2 is: %0.2f' % r2_score(y_test, y_test_pred))

The train MAE at alpha of  is: 2.958
The test MAE is: 3.601
The train RMSE is: 14.449
The test RMSE is: 23.291
The train R^2 is: 0.09
The test R^2 is: 0.01


## Q2 Comment

For reference the error values from the previous lab using a Ridge regression without dimensional reduction are

* The alpha value used in this regression is 0.1

* The train MAE at alpha of  is: 3.806
* The test MAE is: 4.651

* The train RMSE is: 22.017
* The test RMSE is: 34.094

* The train R^2 is: -0.39
* The test R^2 is: -0.45

It can be seen that across all error metrics, adding a dimensional reduction step improves the model fit.

# Q3

In [19]:
#get transformed x matrix
x_train_reduced = PCA(n_components= 4).fit_transform(X_train)
x_test_reduced = PCA(n_components= 4).fit_transform(X_test)

#define ridge regression and fit data
rdg = Ridge(alpha= 0.1).fit(x_train_reduced, y_train)

#get y predictions
y_train_pred = rdg.predict(x_train_reduced)
y_test_pred = rdg.predict(x_test_reduced)

 #MAE values
print('The train MAE at alpha of  is: %0.3f' % mean_absolute_error(y_train, y_train_pred))
print('The test MAE is: %0.3f' % mean_absolute_error(y_test, y_test_pred))

#RMSE values
#MAE values
print('The train RMSE is: %0.3f' % mean_squared_error(y_train, y_train_pred))
print('The test RMSE is: %0.3f' % mean_squared_error(y_test, y_test_pred))

#R^2 values
print('The train R^2 is: %0.2f' % r2_score(y_train, y_train_pred))
print('The test R^2 is: %0.2f' % r2_score(y_test, y_test_pred))

The train MAE at alpha of  is: 2.908
The test MAE is: 3.399
The train RMSE is: 14.124
The test RMSE is: 21.818
The train R^2 is: 0.11
The test R^2 is: 0.07


## Q3 Comment

The error metrics are *slightly* better when 4 components are used instead of two, but only slight. This suggest that the first two components adaquately capture most of the variation in the dataset

# Q4

In [20]:
#get transformed x matrix
x_train_reduced = PCA(n_components= 8).fit_transform(X_train)
x_test_reduced = PCA(n_components= 8).fit_transform(X_test)

#define ridge regression and fit data
rdg = Ridge(alpha= 0.1).fit(x_train_reduced, y_train)

#get y predictions
y_train_pred = rdg.predict(x_train_reduced)
y_test_pred = rdg.predict(x_test_reduced)

 #MAE values
print('The train MAE at alpha of  is: %0.3f' % mean_absolute_error(y_train, y_train_pred))
print('The test MAE is: %0.3f' % mean_absolute_error(y_test, y_test_pred))

#RMSE values
#MAE values
print('The train RMSE is: %0.3f' % mean_squared_error(y_train, y_train_pred))
print('The test RMSE is: %0.3f' % mean_squared_error(y_test, y_test_pred))

#R^2 values
print('The train R^2 is: %0.2f' % r2_score(y_train, y_train_pred))
print('The test R^2 is: %0.2f' % r2_score(y_test, y_test_pred))

The train MAE at alpha of  is: 2.823
The test MAE is: 3.466
The train RMSE is: 13.604
The test RMSE is: 22.629
The train R^2 is: 0.14
The test R^2 is: 0.04


## Q4 Comment

Similiar to the last comment, the addition of the higher order components does little to improve fit. In this case, with 8 PCs, the test error increases on all metrics, suggesting that 8 out of the max 21 PCs is too many

# Q5

Pick 4 principal components for the neural network

In [25]:
#get transformed x matrix
x_train_reduced = PCA(n_components= 4).fit_transform(X_train)
x_test_reduced = PCA(n_components= 4).fit_transform(X_test)

#define model and fit data
mlp = MLPRegressor(hidden_layer_sizes= (6, 6), \
                  max_iter= 1000).fit(x_train_reduced, y_train)

#Generate Predicted data set
y_train_pred = mlp.predict(x_train_reduced)
y_test_pred = mlp.predict(x_test_reduced)

#MAE values
print('The train MAE is: %0.2f' % mean_absolute_error(y_train, y_train_pred))
print('The test MAE is: %0.2f' % mean_absolute_error(y_test, y_test_pred))

#RMSE values
print('The train RMSE is: %0.2f' % mean_squared_error(y_train, y_train_pred))
print('The test RMSE is: %0.2f' % mean_squared_error(y_test, y_test_pred))

#R^2 values
print('The train R^2 is: %0.2f' % r2_score(y_train, y_train_pred))
print('The test R^2 is: %0.2f' % r2_score(y_test, y_test_pred))

The train MAE is: 2.58
The test MAE is: 3.34
The train RMSE is: 11.35
The test RMSE is: 21.91
The train R^2 is: 0.28
The test R^2 is: 0.07




## Q5 Comment

The values of error metrics for the model trained on the non-dimensional reduced data set are:

* The train MAE is: 1.13
* The test MAE is: 1.73

* The train RMSE is: 2.77
* The test RMSE is: 7.42

* The train R^2 is: 0.82
* The test R^2 is: 0.68

In this case, combining dimension reduction with the neural network results in higher scores in all error metrics and suggests that the model is poorly fit. Given that training and test error are both higher, it is clear that PCA and NN are a poor combination. The likely cause is that the PCA still generates linear features, where as the neural network can handle non-linear features. The main effect that PCA does is discard data that the neural network could have used.

# Q6- Pick your favorite

Model of choice is Gradient Boosting tree Regression

In [28]:
#get transformed x matrix
x_train_reduced = PCA(n_components= 4).fit_transform(X_train)
x_test_reduced = PCA(n_components= 4).fit_transform(X_test)


#define model
gbt = GradientBoostingRegressor().fit(x_train_reduced, y_train)

#get prediction arrays for train and test data sets
y_train_pred = gbt.predict(x_train_reduced)
y_test_pred = gbt.predict(x_test_reduced)

#MAE values
print('The train MAE is: %0.2f' % mean_absolute_error(y_train, y_train_pred))
print('The test MAE is: %0.2f' % mean_absolute_error(y_test, y_test_pred))

#RMSE values
#MAE values
print('The train RMSE is: %0.2f' % mean_squared_error(y_train, y_train_pred))
print('The test RMSE is: %0.2f' % mean_squared_error(y_test, y_test_pred))

#R^2 values
print('The train R^2 is: %0.2f' % r2_score(y_train, y_train_pred))
print('The test R^2 is: %0.2f' % r2_score(y_test, y_test_pred))

The train MAE is: 1.40
The test MAE is: 3.52
The train RMSE is: 3.24
The test RMSE is: 24.05
The train R^2 is: 0.80
The test R^2 is: -0.02


## Q6 Comment

The original gradient boosted tree regression had the following scores for the error metrics of interest:

* The train MAE is: 0.81
* The test MAE is: 1.56

* The train RMSE is: 1.22
* The test RMSE is: 5.49

* The train R^2 is: 0.92
* The test R^2 is: 0.77

Gradient boosted tree regression is significantly worse when combined with dimensional reduction. Similar to the case of PCA then NN, the gradient boosted tree regression can handle non-linearity and the linear features coming from PCA are only discarding otherwise useful data.