In [38]:
#Importing required libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


In [3]:
df_train = pd.read_excel("nanoindent.ods", engine = "odf")

In [4]:
df_train.head()

Unnamed: 0,A,B,n,MLE
0,100,100,0.1,0.7898
1,100,100,0.5,-0.4796
2,100,100,0.99,0.3492
3,100,550,0.1,0.5564
4,100,550,0.5,-0.752


In [5]:
df_train.columns

Index(['A', 'B', 'n', 'MLE'], dtype='object')

In [6]:
y = df_train.pop("MLE")
X = df_train


In [7]:
for col in X.columns:
    print(f"{col}", X[col].corr(y))

A -0.02755004686697247
B 0.05565161993433248
n -0.26326423539256993


In [8]:
from sklearn.model_selection import train_test_split


X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                      y, 
                                                      test_size= 0.2,
                                                     random_state= 7)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(
    
    StandardScaler(),
    LinearRegression()
)
model.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [10]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_train)
MSE_linear = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the linear regression model is :", round(MSE_linear, 4))
print("The validation mean squared error for the linear regression model is :", 
      round(mean_squared_error(y_valid, model.predict(X_valid)), 4))

The training mean squared error for the linear regression model is : 0.2354
The validation mean squared error for the linear regression model is : 0.401


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeRegressor(random_state = 42)

tree_params = {"max_depth": [5]}

tree_grid = GridSearchCV(estimator= dtree, param_grid= tree_params, cv = 5)

tree_grid.fit(X_train, y_train)
y_pred = tree_grid.predict(X_train)
MSE_tree = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the decision tree model is :", round(MSE_linear, 4))
print("The validation mean squared error for the decision tree model is :", 
      round(mean_squared_error(y_valid, tree_grid.predict(X_valid)), 4))

The training mean squared error for the decision tree model is : 0.2354
The validation mean squared error for the decision tree model is : 0.3714


In [12]:
tree_grid.best_params_

{'max_depth': 5}

In [17]:
from sklearn.svm import SVR

svr = SVR()
X_scaled = StandardScaler().fit_transform(X)

svr_params = {"kernel": ["sigmoid"],
             "gamma": [0.1],
             "C": [0.2, 0.3]}

svr_grid = GridSearchCV(estimator= svr, param_grid= svr_params, cv = 5)
svr_grid.fit(X_train, y_train)
y_pred = svr_grid.predict(X_train)
MSE_svr = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the SVR model is :", round(MSE_svr, 4))
print("The validation mean squared error for the SVR model is :", 
      round(mean_squared_error(y_valid, svr_grid.predict(X_valid)), 4))

The training mean squared error for the SVR model is : 0.3235
The validation mean squared error for the SVR model is : 0.1679


In [18]:
svr_grid.best_params_

{'C': 0.2, 'gamma': 0.1, 'kernel': 'sigmoid'}

In [23]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state= 42)

rf_params = {"n_estimators": [5],
            "max_depth": [3]}

rf_grid = GridSearchCV(estimator= rf, param_grid= rf_params, cv = 5)

rf_grid.fit(X_train, y_train)
y_pred = rf_grid.predict(X_train)
MSE_rf = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the rf model is :", round(MSE_rf, 4))
print("The validation mean squared error for the rf model is :", 
      round(mean_squared_error(y_valid, rf_grid.predict(X_valid)), 4))

The training mean squared error for the rf model is : 0.1075
The validation mean squared error for the rf model is : 0.362


In [24]:
rf_grid.best_params_

{'max_depth': 3, 'n_estimators': 5}

In [45]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(random_state= 42)
gb_params = {
    "n_estimators": [50],
    "max_depth": [5],
    "learning_rate": [0.01],
    "loss": ["squared_error"]
}

gb_grid = GridSearchCV(estimator= gb, param_grid= gb_params, cv = 5)

gb_grid.fit(X_train, y_train)

y_pred = gb_grid.predict(X_train)
MSE_gb = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the gb model is :", round(MSE_gb, 4))
print("The validation mean squared error for the gb model is :", 
      round(mean_squared_error(y_valid, gb_grid.predict(X_valid)), 4))

The training mean squared error for the gb model is : 0.11
The validation mean squared error for the gb model is : 0.1897


In [32]:
gb_grid.best_params_

{'learning_rate': 0.01,
 'loss': 'squared_error',
 'max_depth': 5,
 'n_estimators': 50}

In [46]:
from xgboost import XGBRegressor

xgbr = XGBRegressor(random_state = 42)

xgbr_params = {"eta": [0.2],
              "gamma": [0.5],
              "max_depth": [5],
              "num_boost_round": [25]}

xgbr_grid = GridSearchCV(estimator= xgbr,
                        param_grid= xgbr_params, 
                        cv = 5)
xgbr_grid.fit(X_train, y_train)

y_pred = xgbr_grid.predict(X_train)
MSE_xgbr = mean_squared_error(y_train, y_pred)
print("The training mean squared error for the xgbr model is :", round(MSE_xgbr, 4))
print("The validation mean squared error for the xgbr model is :", 
      round(mean_squared_error(y_valid, xgbr_grid.predict(X_valid)), 4))


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } 

In [44]:
xgbr_grid.best_params_

{'eta': 0.2, 'gamma': 0.5, 'max_depth': 5, 'num_boost_round': 25}

### Selection

Best on the results above, it is established that though SVR performs best on the validation data, it's training error is quite different from the validation error. So, there is a big chance that the model will not genaralize well.

The XGBoost model gives a slightly larger validation error but it's training error is not too far away. So, we shall use XGBoost for the test data.


In [47]:
df_test = pd.read_excel("nanoindent_test.ods", engine = "odf")

y_test = df_test.pop("MLE")
X_test = df_test


y_pred_test = xgbr_grid.predict(X_test)

print("The test mean squared error for the xgbr model is :", 
      round(mean_squared_error(y_test, y_pred_test), 4))


The test mean squared error for the xgbr model is : 0.2463


In [49]:
y_pred_test = svr_grid.predict(X_test)

print("The test mean squared error for the svr model is :", 
      round(mean_squared_error(y_test, y_pred_test), 4))


The test mean squared error for the svr model is : 0.1671


In [50]:
y_pred_test = gb_grid.predict(X_test)

print("The test mean squared error for the svr model is :", 
      round(mean_squared_error(y_test, y_pred_test), 4))


The test mean squared error for the svr model is : 0.2698
