<a href="https://colab.research.google.com/github/hoonzi-s/APC/blob/main/%231%20ALK%20_%20213BTMRVP%20Inferential%20_%20Rev%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [170]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [171]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [172]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

In [173]:
data_origin = pd.read_csv('/content/drive/MyDrive/#1 ALK RVP CSV.csv', index_col = 0)
data_origin.info()
data_origin.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 2708 entries, 2014-09-16 to 2022-05-17
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RVP_LAB         2708 non-null   float64
 1   S.B5.213TI2804  2708 non-null   float64
 2   S.B5.213PI2806  2708 non-null   float64
 3   S.B5.213FC2602  2708 non-null   float64
dtypes: float64(4)
memory usage: 105.8+ KB


Unnamed: 0,RVP_LAB,S.B5.213TI2804,S.B5.213PI2806,S.B5.213FC2602
count,2708.0,2708.0,2708.0,2708.0
mean,39.308013,133.969684,335.047833,85.923927
std,3.406074,3.452283,29.851255,10.770634
min,28.5,97.606803,291.821242,35.510519
25%,37.4,131.998333,318.077753,80.966155
50%,39.0,133.436735,325.511703,88.584186
75%,40.7,136.158762,338.359004,92.780427
max,98.6,145.939417,520.512731,113.968771


In [174]:
data_origin.columns = ['RVP', 'T', 'P', 'F']
data = data_origin[(data_origin['RVP'] < 70) & (data_origin['T'] > 120)]
data

Unnamed: 0,RVP,T,P,F
2014-09-16,35.5,136.582564,335.836740,89.956760
2014-09-17,33.5,138.822292,338.164993,89.462762
2014-09-18,34.4,138.079717,339.017050,91.780995
2014-09-19,34.3,137.367731,335.839848,96.444227
2014-09-20,35.2,137.442057,349.710404,91.365322
...,...,...,...,...
2022-05-13,43.7,127.127323,326.187806,84.417961
2022-05-14,49.9,123.287827,321.014425,96.055985
2022-05-15,48.7,125.099084,332.709037,92.796671
2022-05-16,51.0,122.406470,325.956086,95.187382


In [175]:
A = 7.00961   # Temperature in Celsius
B = 1022.48
C = 248.145
P_ref = (335 / 101.325 + 1) * 760   # kPa to mmHg

data['PCT'] = data.iloc[:, 1] + B * np.log10(P_ref / (data.iloc[:, 2] / 101.325 + 1) / 760) / (A - np.log10(P_ref)) / (A - np.log10((data.iloc[:, 2] / 101.325 + 1) * 760))
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,RVP,T,P,F,PCT
2014-09-16,35.5,136.582564,335.836740,89.956760,136.512888
2014-09-17,33.5,138.822292,338.164993,89.462762,138.559265
2014-09-18,34.4,138.079717,339.017050,91.780995,137.746123
2014-09-19,34.3,137.367731,335.839848,96.444227,137.297795
2014-09-20,35.2,137.442057,349.710404,91.365322,136.231441
...,...,...,...,...,...
2022-05-13,43.7,127.127323,326.187806,84.417961,127.867296
2022-05-14,49.9,123.287827,321.014425,96.055985,124.467575
2022-05-15,48.7,125.099084,332.709037,92.796671,125.290371
2022-05-16,51.0,122.406470,325.956086,95.187382,123.166055


In [176]:
n = 10
information = pd.DataFrame(columns=['Variables', 'Solution', 'Alpha', 'Scaler', 'train_R2', 'test_R2', 'MAE', 'MSE'])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE


# T, P

## LinearRegression

In [177]:
input = data[['T', 'P']]
target = data['RVP']

train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'LinearRegression'
alpha = np.nan
scaler = np.nan

In [178]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  lr = LinearRegression()
  scores = cross_validate(lr, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466


## Ridge

### Ridge (alpha = 0.001)

In [179]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 0.001
scaler = np.nan

In [180]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316


### Ridge (alpha = 0.01)

In [181]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 0.01
scaler = np.nan

In [182]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132


### Ridge (alpha = 0.1)

In [183]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 0.1
scaler = np.nan

In [184]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146


In [185]:
Arg = Ridge(alpha = alpha)
scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
print(np.mean(scores['test_score']))
for model in scores['estimator']:
  print(model.coef_)

0.5234001644950975
[-0.71929552  0.09074151]
[-0.71028763  0.09440547]
[-0.69850004  0.08907151]
[-0.72819625  0.09480138]
[-0.71883971  0.09280782]


### Ridge (alpha = 1)

In [186]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 1
scaler = np.nan

In [187]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854


### Ridge (alpha = 10)

In [188]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 10
scaler = np.nan

In [189]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972


### Ridge (alpha = 100)

In [190]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Ridge'
alpha = 100
scaler = np.nan

In [191]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  rg = Ridge(alpha = alpha)
  scores = cross_validate(rg, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386


## Lasso

### Lasso (alpha = 0.001)

In [192]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 0.001
scaler = np.nan

In [193]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344


### Lasso (alpha = 0.01)

In [194]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 0.01
scaler = np.nan

In [195]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843


### Lasso (alpha = 0.1)

In [196]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 0.1
scaler = np.nan

In [197]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


### Lasso (alpha = 1)

In [198]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 1
scaler = np.nan

In [199]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


### Lasso (alpha = 10)

In [200]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 10
scaler = np.nan

In [201]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


### Lasso (alpha = 100)

In [202]:
input = data[['T', 'P']]
target = data['RVP']
train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'T, P'
soln = 'Lasso'
alpha = 100
scaler = np.nan

In [203]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input, target, test_size = 7/27)
  ls = Lasso(alpha = alpha)
  scores = cross_validate(ls, train_input, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_input, train_target))
  test_R2.append(best_model.score(test_input, test_target))
  inferential = np.sum(best_model.coef_ * input, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Alpha': alpha, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


# Poly T, P

## LinearRegression

### StandardScaler

In [204]:
input = data[['T', 'P']]
target = data['RVP']

pf = PolynomialFeatures()
pf.fit(input)
input_poly = pf.transform(input)

train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'Poly T, P'
soln = 'LinearRegression'
alpha = np.nan
scaler = 'Standard'

In [205]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input_poly, target, test_size = 7/27)
  ss = StandardScaler()
  ss.fit(train_input)
  input_poly = ss.transform(input_poly)
  train_poly = ss.transform(train_input)
  test_poly = ss.transform(test_input)

  lr = LinearRegression()
  scores = cross_validate(lr, train_poly, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_poly, train_target))
  test_R2.append(best_model.score(test_poly, test_target))
  inferential = np.sum(best_model.coef_ * input_poly, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


### MinMaxScaler

In [206]:
input = data[['T', 'P']]
target = data['RVP']

pf = PolynomialFeatures()
pf.fit(input)
input_poly = pf.transform(input)

train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'Poly T, P'
soln = 'LinearRegression'
alpha = np.nan
scaler = 'MinMax'

In [207]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input_poly, target, test_size = 7/27)
  mm = MinMaxScaler()
  mm.fit(train_input)
  input_poly = mm.transform(input_poly)
  train_poly = mm.transform(train_input)
  test_poly = mm.transform(test_input)

  lr = LinearRegression()
  scores = cross_validate(lr, train_poly, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_poly, train_target))
  test_R2.append(best_model.score(test_poly, test_target))
  inferential = np.sum(best_model.coef_ * input_poly, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

Unnamed: 0,Variables,Solution,Alpha,Scaler,train_R2,test_R2,MAE,MSE
0,"T, P",LinearRegression,,,0.52624,0.50566,1.620766,4.918466
0,"T, P",Ridge,0.001,,0.521987,0.510685,1.619855,4.920316
0,"T, P",Ridge,0.01,,0.521759,0.517022,1.619685,4.920132
0,"T, P",Ridge,0.1,,0.522151,0.5154,1.620021,4.918146
0,"T, P",Ridge,1.0,,0.521696,0.519134,1.619375,4.920854
0,"T, P",Ridge,10.0,,0.523793,0.512611,1.621018,4.918972
0,"T, P",Ridge,100.0,,0.521758,0.517711,1.620042,4.920386
0,"T, P",Lasso,0.001,,0.524759,0.511814,1.619704,4.916344
0,"T, P",Lasso,0.01,,0.525285,0.510928,1.619753,4.916843
0,"T, P",Lasso,0.1,,0.519701,0.522649,1.618636,4.922786


## Ridge

### Ridge (alpha = 0.001)

In [210]:
input = data[['T', 'P']]
target = data['RVP']

pf = PolynomialFeatures()
pf.fit(input)
input_poly = pf.transform(input)

train_R2 = []
test_R2 = []
mae = []
mse = []

vars = 'Poly T, P'
soln = 'Ridge'
alpha = 0.001
scaler = 'Standard'

In [213]:
for i in range(n):
  train_input, test_input, train_target, test_target = tts(input_poly, target, test_size = 7/27)
  ss = StandardScaler()
  ss.fit(train_input)
  input_poly = ss.transform(input_poly)
  train_poly = ss.transform(train_input)
  test_poly = ss.transform(test_input)

  rg = Ridge()
  scores = cross_validate(rg, train_poly, train_target, cv = KFold(), return_estimator = True)
  
  best_index = np.argmax(scores['test_score'])
  best_model = scores['estimator'][best_index]

  train_R2.append(best_model.score(train_poly, train_target))
  test_R2.append(best_model.score(test_poly, test_target))
  inferential = np.sum(best_model.coef_ * input_poly, axis = 1) + best_model.intercept_
  mae.append(MAE(target, inferential))
  mse.append(MSE(target, inferential))

new = pd.DataFrame({'Variables': vars, 
                    'Solution': soln, 
                    'Scaler': scaler, 
                    'train_R2': np.mean(train_R2), 
                    'test_R2': np.mean(test_R2), 
                    'MAE': np.mean(mae), 
                    'MSE': np.mean(mse)}, 
                   index = [0])
information = pd.concat([information, new])
information

  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: ignored