#  Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from sklearn.model_selection import KFold
from functions.common_function import *
from functions.initialize_model import initialize_model_expanded
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

'''Enabling plotting of graphs just below the plotting commands'''
%matplotlib inline
'''Enabling the disply of all rows and columns within the dataframe'''
pd.set_option("display.max_rows", None, "display.max_columns", None)

#  Constant

In [2]:
num_feature = 8
cat_col = [4, 5]
num_ori_feature = num_feature - len(cat_col)
num_target = 3
limit = pd.DataFrame({'lower' : [303, 20, 0, 2, 0, 0, 0, 0, 122, 1236, 14], \
                     'higher' : [840, 44, 17, 5, 1, 1, 1, 1, 408, 3240, 101], \
                     'ref' : [530, 40, 14, 3.2, np.nan, np.nan,np.nan, np.nan, np.nan, np.nan, np.nan]})


'''Import dataset'''
dataset = pd.read_csv("Dataset/Choudhury_Dataset.csv")

# Implementation

In [3]:
'''Converting Categorical Data into binary representation'''
converted_dataset = convert_cat(dataset, cat_col, num_ori_feature, num_target, [dataset.iloc[:, 4].unique(), dataset.iloc[:, 5].unique()])

'''Normalising dataset according to higher and lower limit values'''
normalised_dataset = normalise(converted_dataset, limit)

'''Feature Target Splitting'''
feature, target = x_y_split(normalised_dataset, num_feature, num_target)

'''Model Fitting'''
linear_reg_model = LinearRegression()
MAE = abs(cross_val_score(linear_reg_model, feature, target, cv= 4, scoring= 'neg_mean_absolute_error'))
MSE = abs(cross_val_score(linear_reg_model, feature, target, cv= 4, scoring= 'neg_mean_squared_error' ))
RMSE = abs(cross_val_score(linear_reg_model, feature, target, cv=4, scoring= 'neg_root_mean_squared_error' ))


'''Provide average score'''
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(MAE)):
    print('------------------------------------------------------------------------')
    print(f'> Fold {i+1} - MAE: {MAE[i]} - MSE: {MSE[i]}- RMSE: {RMSE[i]}') 
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> MAE: {np.mean(MAE)} - Standard Deviation: {np.std(MAE)}')
print(f'> MSE: {np.mean(MSE)}')
print(f'> RMSE: {np.mean(RMSE)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - MAE: 0.14502579000061167 - MSE: 0.032699710389299703- RMSE: 0.16122437645741314
------------------------------------------------------------------------
> Fold 2 - MAE: 0.09555249344520078 - MSE: 0.018037283446608994- RMSE: 0.11717451740658252
------------------------------------------------------------------------
> Fold 3 - MAE: 0.1656448398494165 - MSE: 0.16943759939678726- RMSE: 0.27051634882311176
------------------------------------------------------------------------
> Fold 4 - MAE: 0.048499177296968364 - MSE: 0.00317976902958524- RMSE: 0.05198891818428977
------------------------------------------------------------------------
Average scores for all folds:
> MAE: 0.11368057514804931 - Standard Deviation: 0.04544240660482537
> MSE: 0.0558385905655703
> RMSE: 0.15022604021784927
--------------------------------

In [4]:
linear_reg_model  = LinearRegression()
X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size = 0.20, random_state = 0)
linear_reg_model.fit(X_train, Y_train)
Y_pred = linear_reg_model.predict(X_test)
prediction = pd.DataFrame(Y_pred, columns = get_col_names(target))

'''Preparation to Rescale target values'''
min_y = limit.iloc[num_feature: num_feature + num_target, 0].to_list()
max_y = limit.iloc[num_feature: num_feature + num_target, 1].to_list()

corr_list = []
'''Tabulating the differences of Expected and Predictions made by the ANN Model'''
for i in range(len(X_test)):
    '''Rescaling of normalised data'''
    expected = pd.DataFrame(inverse_transform(Y_test.iloc[i].to_list(), max_y, min_y))
    predicted = pd.DataFrame(inverse_transform(prediction.iloc[i].to_list(), max_y, min_y))
    comparison_df = pd.concat([expected, predicted], axis = 1)
    comparison_df.columns = ['Expected', 'Predicted']
    comparison_df.index = get_col_names(target)
    display(comparison_df.style.set_caption(f"Element {i + 1}"))
    corr, _ = pearsonr(expected.iloc[:, 0].tolist(), predicted.iloc[:, 0].tolist())
    corr_list.append(corr)
    
'''Provide average score'''
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(corr_list)):
    print('------------------------------------------------------------------------')
    print(f'> Iteration {i+1} - Pearson Correlation: {corr_list[i]}') 
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Average Pearsons Correlation: {np.mean(corr_list)} - Standard Deviation: {np.std(corr_list)}')
print('------------------------------------------------------------------------')  

Unnamed: 0,Expected,Predicted
Average Particle Velocity,270.0,264.921941
Average Particle Temperature,2399.0,2407.681549
Average Particle Diameter,51.0,48.605412


Unnamed: 0,Expected,Predicted
Average Particle Velocity,264.0,251.576661
Average Particle Temperature,2373.0,2251.362917
Average Particle Diameter,47.0,43.928757


Unnamed: 0,Expected,Predicted
Average Particle Velocity,179.0,566.139773
Average Particle Temperature,2456.0,2306.827996
Average Particle Diameter,49.0,36.716862


Unnamed: 0,Expected,Predicted
Average Particle Velocity,263.0,306.046591
Average Particle Temperature,2393.0,2370.942665
Average Particle Diameter,50.0,46.238954


------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Iteration 1 - Pearson Correlation: 0.9999988909008115
------------------------------------------------------------------------
> Iteration 2 - Pearson Correlation: 0.9999997314934042
------------------------------------------------------------------------
> Iteration 3 - Pearson Correlation: 0.9844235110625483
------------------------------------------------------------------------
> Iteration 4 - Pearson Correlation: 0.9998026808987948
------------------------------------------------------------------------
Average scores for all folds:
> Average Pearsons Correlation: 0.9960562035888898 - Standard Deviation: 0.006716617884383949
------------------------------------------------------------------------
