In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read the csv file and checking the first five observation

In [57]:
df = pd.read_csv(r'USED DATA\Volvo 11 B Vsh.csv')
df.head()

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
0,3351.6,8.682,0.061,0.277,6.865,2.314,0.001,0.055,0.869
1,3351.7,8.672,0.059,0.283,6.73,2.373,0.001,0.058,0.892
2,3351.8,8.625,0.057,0.285,6.58,2.309,0.001,0.061,0.881
3,3351.9,8.578,0.057,0.28,6.467,2.255,0.001,0.062,0.886
4,3352.0,8.601,0.056,0.267,6.4,2.309,0.001,0.064,0.876


In [58]:
df.describe()

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
count,13911.0,13911.0,13911.0,13911.0,13911.0,13911.0,13911.0,13911.0,13911.0
mean,4048.43871,8.66173,0.056124,0.191735,5.367626,2506.850652,346.473834,0.159142,0.247266
std,402.43141,0.071179,0.01634,0.055184,1.340783,12047.425389,518.542832,0.078339,0.174159
min,3351.6,8.318,-0.04,0.024,3.647,0.14,0.001,0.02,0.026
25%,3699.35,8.625,0.046,0.161,4.395,2.973,0.6375,0.085,0.105
50%,4049.2,8.656,0.054,0.184,4.919,11.597,39.965,0.185,0.202
75%,4396.95,8.672,0.065,0.213,5.979,59.3285,554.622,0.228,0.328
max,4744.7,9.175,0.201,0.541,11.229,62290.77,3224.226,0.292,1.0


In [59]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  #label

In [60]:
[X.shape, y.shape]

[(13911, 7), (13911,)]

In [61]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  #label

# Standardization
from sklearn.preprocessing import StandardScaler

# transforming X_train value to feed to the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)


In [62]:
# function to convert seconds to into hours, minutes and seconds
# this function is used to measure the running time of any model
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
      
    return "%d:%02d:%02d" % (hour, minutes, seconds)

### Loading the linear regression model from sklearn
### Fit the model, do the prediction

In [63]:
import time
from sklearn.linear_model import LinearRegression # LinearRegression, RidgeRegression
regressor = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None, positive=False)

start = time.time() # starting of the time
# fit/train the model
regressor.fit(X_train_scaled,y)

end = time.time() # ending of the time
print('run_time:', convert(end - start), 'h:m:s')



run_time: 0:00:00 h:m:s


In [64]:
import os

# Create the output/cnn directory if it doesn't exist
output_dir = "./output/mvr"
os.makedirs(output_dir, exist_ok=True)

In [65]:
pred_1 = regressor.predict(X_train_scaled) # prediction for training set

In [66]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y, pred_1], axis=1) # training
a = pd.DataFrame(a, columns=['y', 'y_pred'])



with pd.ExcelWriter("./output/mvr/predicted_mvr_model.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="training", index=True)
    

### Evaluation Metrics for testing set

In [67]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error


train_r2 = r2_score(y,pred_1) # r2_score for training set
train_mae = mean_absolute_error(y, pred_1) # mean absolute error for training set
train_mse = mean_squared_error(y, pred_1) # mean squared error for training set
train_rmse = np.sqrt(mean_squared_error(y, pred_1)) # mean squared error for training set
train_mape = mean_absolute_percentage_error(y, pred_1) # mean_absolute_percentage_error for training set
train_ev = explained_variance_score(y, pred_1) 
train_maxE = max_error(y, pred_1)
train_minE = min(abs(y - pred_1))



metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE','RMSE','MAPE', 'EV', 'maxE', 'minE'],
'whole': [train_r2, train_mae, train_mse, train_mape,train_rmse, train_ev, train_maxE, train_minE],
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/mvr/performance_mvr.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6,7
performance_metrics,R2,MAE,MSE,RMSE,MAPE,EV,maxE,minE
whole,0.622299,0.076658,0.011455,0.429438,0.10703,0.622299,0.549473,0.000001


In [68]:
regressor.coef_

array([-0.04363137,  0.0041224 ,  0.06905595, -0.01740707,  0.00585657,
       -0.05453675, -0.078955  ])

In [69]:
regressor.intercept_

0.2472659765653081

In [70]:
df = pd.read_csv(r'USED DATA\Volvo 11 A Vsh.csv')
df.head()

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
0,3575.0,8.648,0.069,0.252,6.823,1.96,0.001,0.087,0.773
1,3575.1,8.648,0.07,0.254,6.81,1.914,0.001,0.088,0.742
2,3575.2,8.672,0.067,0.253,6.825,1.775,0.001,0.09,0.74
3,3575.3,8.672,0.062,0.248,6.841,1.766,0.001,0.092,0.758
4,3575.4,8.672,0.059,0.243,6.857,1.75,0.001,0.092,0.778


In [71]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  #label

# Standardization
from sklearn.preprocessing import StandardScaler

# transforming X_train value to feed to the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)

In [72]:
pred_1 = regressor.predict(X_train_scaled) # prediction for training set

In [73]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y, pred_1], axis=1) # training
a = pd.DataFrame(a, columns=['y', 'y_pred'])



with pd.ExcelWriter("./output/mvr/predicted_mvr_model_11A.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="training", index=True)

In [74]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error


train_r2 = r2_score(y,pred_1) # r2_score for training set
train_mae = mean_absolute_error(y, pred_1) # mean absolute error for training set
train_mse = mean_squared_error(y, pred_1) # mean squared error for training set
train_rmse = np.sqrt(mean_squared_error(y, pred_1)) # mean squared error for training set
train_mape = mean_absolute_percentage_error(y, pred_1) # mean_absolute_percentage_error for training set
train_ev = explained_variance_score(y, pred_1) 
train_maxE = max_error(y, pred_1)
train_minE = min(abs(y - pred_1))



metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE','RMSE','MAPE', 'EV', 'maxE', 'minE'],
'whole': [train_r2, train_mae, train_mse, train_mape,train_rmse, train_ev, train_maxE, train_minE],
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/mvr/performance_mvr_11a.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6,7
performance_metrics,R2,MAE,MSE,RMSE,MAPE,EV,maxE,minE
whole,0.186724,0.158677,0.045052,0.92282,0.212254,0.249737,0.709172,0.000236


In [75]:
df = pd.read_csv(r'USED DATA\Volvo T2 vsh.csv')
df.head()

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
0,4335.0,8.703,0.057,0.214,6.779,2.194,0.001,0.091,0.794
1,4335.1,8.687,0.057,0.216,6.8,2.261,0.001,0.088,0.785
2,4335.2,8.672,0.052,0.214,6.869,2.249,0.001,0.089,0.783
3,4335.3,8.672,0.046,0.209,6.956,2.231,0.001,0.09,0.765
4,4335.4,8.693,0.042,0.202,7.019,2.231,0.001,0.09,0.763


In [76]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  #label

# Standardization
from sklearn.preprocessing import StandardScaler

# transforming X_train value to feed to the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)

In [79]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y, pred_1], axis=1) # training
a = pd.DataFrame(a, columns=['y', 'y_pred'])



with pd.ExcelWriter("./output/mvr/predicted_mvr_model_t2.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="training", index=True)

In [78]:
pred_1 = regressor.predict(X_train_scaled) # prediction for training set

In [80]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error


train_r2 = r2_score(y,pred_1) # r2_score for training set
train_mae = mean_absolute_error(y, pred_1) # mean absolute error for training set
train_mse = mean_squared_error(y, pred_1) # mean squared error for training set
train_rmse = np.sqrt(mean_squared_error(y, pred_1)) # mean squared error for training set
train_mape = mean_absolute_percentage_error(y, pred_1) # mean_absolute_percentage_error for training set
train_ev = explained_variance_score(y, pred_1) 
train_maxE = max_error(y, pred_1)
train_minE = min(abs(y - pred_1))



metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE','RMSE','MAPE', 'EV', 'maxE', 'minE'],
'whole': [train_r2, train_mae, train_mse, train_mape,train_rmse, train_ev, train_maxE, train_minE],
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/mvr/performance_mvr_11t2.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6,7
performance_metrics,R2,MAE,MSE,RMSE,MAPE,EV,maxE,minE
whole,0.32844,0.148432,0.044314,0.626029,0.21051,0.430895,0.628452,0.000026


### Saving trained model to a file and reading the same file using pickle module

In [81]:
#from sklearn.externals import joblib
from joblib import dump, load
dump(regressor, './output/mvr/trained_linear_regression_model.joblib') 

#clf = load('trained_linear_regression_model.joblib')

['./output/mvr/trained_linear_regression_model.joblib']

In [82]:
pred_1 = regressor.predict(X_train_scaled) # prediction for training set

In [83]:
#from sklearn.externals import joblib
from joblib import dump, load
dump(model, './output/mvr/trained_linear_regression_model_poly.joblib') 

#clf = load('trained_linear_regression_model.joblib')

['./output/mvr/trained_linear_regression_model_poly.joblib']

In [84]:
#prediction using test cases or completely new cases
pred_1 = regressor.predict(X_train_scaled) # prediction for training set
pred_2 = regressor.predict(X_test_scaled) # prediction for testing set

NameError: name 'X_test_scaled' is not defined

### Regression plot for testing set