In [214]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, StandardScaler, scale
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [53]:
data = pd.read_csv("../dataset/polymer.csv")
data.head()

Unnamed: 0,log(shear rate) in s^-1,Polymer conc(wt%),NaCl concentration(wt%),Ca+2 concentration(wt%),Temperature(in celsius),log(viscosity) in cP
0,0.010415,0.3,0.1,0.0,25,2309.56
1,0.022561,0.3,0.1,0.0,25,2298.77
2,0.04887,0.3,0.1,0.0,25,2288.02
3,0.101641,0.3,0.1,0.0,25,1981.19
4,0.220169,0.3,0.1,0.0,25,1715.09


### Feature Engineering

Viscocity = **shear stress** / **shear rate**

I engineered the features to derive **shear stress** in order to improve the quality of the data

In [54]:
data['shear stress'] = data['log(viscosity) in cP'] * data['log(shear rate) in s^-1']

In [55]:
data.head()

Unnamed: 0,log(shear rate) in s^-1,Polymer conc(wt%),NaCl concentration(wt%),Ca+2 concentration(wt%),Temperature(in celsius),log(viscosity) in cP,shear stress
0,0.010415,0.3,0.1,0.0,25,2309.56,24.054529
1,0.022561,0.3,0.1,0.0,25,2298.77,51.86232
2,0.04887,0.3,0.1,0.0,25,2288.02,111.816453
3,0.101641,0.3,0.1,0.0,25,1981.19,201.370133
4,0.220169,0.3,0.1,0.0,25,1715.09,377.60965


**I separated the features from the target variable**

In [56]:
X = data.drop('log(viscosity) in cP', axis = 1)
y = data['log(viscosity) in cP']

**I chose to use a random state of 0 to ensure that I get a reproducable result after splitting**

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [208]:
scaler = StandardScaler()
model = LGBMRegressor(learning_rate=0.9, reg_alpha=0.6)

In [209]:
pipe = Pipeline([
    
    ('scaler', scaler),
    ('model', model)   
])

In [210]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 LGBMRegressor(boosting_type='gbdt', class_weight=None,
                               colsample_bytree=1.0, importance_type='split',
                               learning_rate=0.9, max_depth=-1,
                               min_child_samples=20, min_child_weight=0.001,
                               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                               num_leaves=31, objective=None, random_state=None,
                               reg_alpha=0.6, reg_lambda=0.0, silent=True,
                               subsample=1.0, subsample_for_bin=200000,
                               subsample_freq=0))],
         verbose=False)

In [211]:
prediction = pipe.predict(X_test)

In [216]:
prediction_score = r2_score(y_test, prediction)

print(f"The root mean square score for the test set is {prediction_score}")

The root mean square score for the test set is 0.9204579561063964


### Report

I tried three models; Linear Regression, Decision Tree, Xgboost, and Lightgbm

All models except Linear Regression gave an accuracy score that is beyond 0.5. The quality of the model increased after I engineered the features.

The best score I recorded without hyperparameter tunning was Lightbgm's model, after which I tunned some of the hyperparameters mannually to improve the performance of the model.

I finalized my work and decided to work with just Lightgbm model in my workflow, removing other models from the workflow.