## Let's apply Linear, Lasso, Ridge, Random Forest, Xgboost algo's for prediction of viscosity

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
import xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso ,Ridge 
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
data=pd.read_csv('Data.csv')

In [3]:
data.head(2)

Unnamed: 0,log(shear rate) in s^-1,Polymer conc(wt%),NaCl concentration(wt%),Ca+2 concentration(wt%),Temperature(in celsius),log(viscosity) in cP
0,0.010415,0.3,0.1,0.0,25.0,2309.56
1,0.022561,0.3,0.1,0.0,25.0,2298.77


In [4]:
# independent variables (X):shear rate, Polymer conc, NaCl concentration, Ca+2 concentration, Temperature
# dependent variable (Y): Viscosity

In [5]:
X=data[data.columns[0:5]]
Y=data[['log(viscosity) in cP']]

In [6]:
X.head(2)

Unnamed: 0,log(shear rate) in s^-1,Polymer conc(wt%),NaCl concentration(wt%),Ca+2 concentration(wt%),Temperature(in celsius)
0,0.010415,0.3,0.1,0.0,25.0
1,0.022561,0.3,0.1,0.0,25.0


In [7]:
Y[0:2]

Unnamed: 0,log(viscosity) in cP
0,2309.56
1,2298.77


### standardize the dependent variables

In [8]:
scaler_1=StandardScaler()
X=pd.DataFrame(scaler_1.fit_transform(X),columns=X.columns)

### scaling viscosity values between 0 to 1

In [9]:
scaler_2=MinMaxScaler()
Y=pd.DataFrame(scaler_2.fit_transform(Y),columns=Y.columns)

### train test split

In [10]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=0,test_size=0.15,shuffle=True)

In [11]:
print(X_train.shape)
print(X_test.shape)

(555, 5)
(99, 5)


In [12]:
linear=LinearRegression()
lasso=Lasso()
ridge=Ridge()

In [13]:
linear.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
lasso.fit(X_train,Y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
ridge.fit(X_train,Y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [16]:
pred_linear=linear.predict(X_test)
pred_lasso=lasso.predict(X_test)
pred_ridge=ridge.predict(X_test)

In [17]:
Y_test['pred_linear']=pred_linear
Y_test['pred_lasso']=pred_lasso
Y_test['pred_ridge']=pred_ridge


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### why lasso regression is behaving in this manner

In [18]:
select_lasso=SelectFromModel(Lasso(random_state=0))
select_lasso.fit(X_train,Y_train)

SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [19]:
select_lasso.get_support()

array([False, False, False, False, False])

In [20]:
X.columns[select_lasso.get_support()]

Index([], dtype='object')

#### This is the reason for bad performance of lasso regression

In [21]:
select_ridge=SelectFromModel(Ridge(random_state=0))
select_ridge.fit(X_train,Y_train)

SelectFromModel(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                                max_iter=None, normalize=False, random_state=0,
                                solver='auto', tol=0.001),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [22]:
select_ridge.get_support()

array([False,  True,  True,  True, False])

In [23]:
X.columns[select_ridge.get_support()]

Index(['Polymer conc(wt%)', 'NaCl concentration(wt%)',
       'Ca+2 concentration(wt%)'],
      dtype='object')

### above explains why ridge is performing better

### Random forest and XGboost

In [24]:
xgb=XGBRegressor()
xgb.fit(X_train,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [25]:
pred_xgb=xgb.predict(X_test)

In [26]:
# we can get some how much importance is given to different features while prediction

In [27]:
X.columns

Index(['log(shear rate) in s^-1', 'Polymer conc(wt%)',
       'NaCl concentration(wt%)', 'Ca+2 concentration(wt%)',
       'Temperature(in celsius)'],
      dtype='object')

In [28]:
xgb.feature_importances_

array([6.6579990e-02, 5.7819480e-01, 1.4500351e-01, 2.0975587e-01,
       4.6576138e-04], dtype=float32)

In [29]:
Y_test['pred_xgb']=pred_xgb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### random forest

In [30]:
RF= RandomForestRegressor()
RF.fit(X_train,Y_train)

  


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [31]:
pred_RF=RF.predict(X_test)

In [32]:
X.columns

Index(['log(shear rate) in s^-1', 'Polymer conc(wt%)',
       'NaCl concentration(wt%)', 'Ca+2 concentration(wt%)',
       'Temperature(in celsius)'],
      dtype='object')

In [33]:
RF.feature_importances_

array([2.69214062e-01, 4.34452706e-01, 1.06046381e-01, 1.90216392e-01,
       7.04586252e-05])

In [34]:
Y_test['pred_RF']=pred_RF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [35]:
Y_test.head(10)

Unnamed: 0,log(viscosity) in cP,pred_linear,pred_lasso,pred_ridge,pred_xgb,pred_RF
535,0.011565,0.041858,0.0307,0.04181,0.010752,0.009761
492,0.479771,0.104577,0.0307,0.10435,0.988122,0.9631
14,0.302129,0.069757,0.0307,0.069658,0.31595,0.304569
247,0.016399,0.076284,0.0307,0.076108,0.019927,0.017803
85,0.003802,0.021742,0.0307,0.021794,0.003877,0.003845
127,0.006506,0.012577,0.0307,0.012582,0.006815,0.006545
586,0.439753,0.104573,0.0307,0.104347,0.636553,0.662505
529,0.015567,0.056558,0.0307,0.056483,0.012935,0.015803
330,0.006222,-0.001054,0.0307,-0.000903,0.0059,0.006489
483,0.00024,-0.040078,0.0307,-0.039882,0.001186,0.000533


### Calculating Mean absolute percentage error in prediction

In [36]:
err_linear=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['pred_linear'])/Y_test['log(viscosity) in cP'])*100)
err_lasso=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['pred_lasso'])/Y_test['log(viscosity) in cP'])*100)
err_ridge=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['pred_ridge'])/Y_test['log(viscosity) in cP'])*100)
err_xgb=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['pred_xgb'])/Y_test['log(viscosity) in cP'])*100)
err_RF=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['pred_RF'])/Y_test['log(viscosity) in cP'])*100)

In [37]:
print("MAPE in linear model predictions : {} ".format(err_linear))
print("MAPE in lasso model predictions : {} ".format(err_lasso))
print("MAPE in ridge model predictions : {} ".format(err_ridge))
print("MAPE in Xgboost model predictions : {} ".format(err_xgb))
print("MAPE in Random forest model predictions : {} ".format(err_RF))

MAPE in linear model predictions : 919.6201829041855 
MAPE in lasso model predictions : 865.1112041236879 
MAPE in ridge model predictions : 916.7047391249197 
MAPE in Xgboost model predictions : 34.02340475528004 
MAPE in Random forest model predictions : 33.82296529356474 


####  For sure linear models are giving extremely vague results as it seems none of the feature is linearly related with viscosity while on the other hand Random forest and Xgboost algorithms can be further tuned as they for promising results as they can handle non-linearities in relations

### Fine tuning of base RF and XGB models

In [38]:
RF_tuned=RandomForestRegressor()

parameters={'n_estimators':[50,80,100,120],
           }
RF_reg=GridSearchCV(RF_tuned,parameters,scoring='neg_mean_squared_error',cv=5)
Y_train=Y_train.to_numpy()
Y_train=Y_train.ravel()
RF_reg.fit(X_train,Y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [39]:
print(RF_reg.best_params_)
print(RF_reg.best_score_)

{'n_estimators': 100}
-0.001139362423841153


In [40]:
RF_final=RandomForestRegressor(n_estimators=100)
RF_final.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [41]:
RF_final_pred=RF_final.predict(X_test)

In [42]:
Y_test['RF_tuned_pred']=RF_final_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [43]:
err_RF_tuned=np.mean(np.abs((Y_test['log(viscosity) in cP']-Y_test['RF_tuned_pred'])/Y_test['log(viscosity) in cP'])*100)
print(err_RF_tuned)

31.471701848090095


In [44]:
Y_test.head(10)

Unnamed: 0,log(viscosity) in cP,pred_linear,pred_lasso,pred_ridge,pred_xgb,pred_RF,RF_tuned_pred
535,0.011565,0.041858,0.0307,0.04181,0.010752,0.009761,0.009894
492,0.479771,0.104577,0.0307,0.10435,0.988122,0.9631,0.946483
14,0.302129,0.069757,0.0307,0.069658,0.31595,0.304569,0.309732
247,0.016399,0.076284,0.0307,0.076108,0.019927,0.017803,0.017974
85,0.003802,0.021742,0.0307,0.021794,0.003877,0.003845,0.003844
127,0.006506,0.012577,0.0307,0.012582,0.006815,0.006545,0.006423
586,0.439753,0.104573,0.0307,0.104347,0.636553,0.662505,0.614891
529,0.015567,0.056558,0.0307,0.056483,0.012935,0.015803,0.015724
330,0.006222,-0.001054,0.0307,-0.000903,0.0059,0.006489,0.006479
483,0.00024,-0.040078,0.0307,-0.039882,0.001186,0.000533,0.000522
