# Urea Concentration Prediction
Julia A King  
November 9th, 2021  
<span style="color:darkred">Center for Dialysis and Innovation</span>  
  
### Purpose
Use optical absorbance data for urea in solvents (water or fresh dialysate) to model the urea concentration

In [2]:
# Import packages

# Pandas library for the pandas dataframes
import pandas as pd    

# Import Scikit-Learn library for the regression models
import sklearn         
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Note - you will need version 0.24.1 of scikit-learn to load this library (SequentialFeatureSelector)
from sklearn.feature_selection import f_regression, SequentialFeatureSelector
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Import numpy 
import numpy as np

# Another statistic model library
import statsmodels.api as sm
import statsmodels.formula.api as smf

import scipy.stats as stats
import scipy
from scipy import interpolate
from scipy.interpolate import interp1d

# Import plotting libraries
import seaborn as sns
import matplotlib 
from matplotlib import pyplot as plt

# Set larger fontsize for all plots
matplotlib.rcParams.update({'font.size': 14})

# Command to automatically reload modules before executing cells
# not needed here but might be if you are writing your own library 
%load_ext autoreload
%autoreload 2

In [3]:
# Import data

abs_data = pd.read_csv('absorbances_dataset.csv')

# Date Codes:
# 44482 = 10/13/2021
# 44496 = 10/27/2021
# 44418 = 8/10/2021

In [4]:
abs_data

Unnamed: 0,urea_concentration (mM),wavelength (nm),absorbance (A),stdev.p,solvent,background,date,n_replicates
0,0.5,190,0.553000,0.002000,water,water,44482,2
1,0.5,191,0.457000,0.002000,water,water,44482,2
2,0.5,192,0.364500,0.002500,water,water,44482,2
3,0.5,193,0.286500,0.002500,water,water,44482,2
4,0.5,194,0.232000,0.002000,water,water,44482,2
...,...,...,...,...,...,...,...,...
3926,12.0,296,0.000667,0.000943,fresh_dialysate,fresh_dialysate,44418,3
3927,12.0,297,0.000667,0.000943,fresh_dialysate,fresh_dialysate,44418,3
3928,12.0,298,0.000667,0.000943,fresh_dialysate,fresh_dialysate,44418,3
3929,12.0,299,0.000667,0.000943,fresh_dialysate,fresh_dialysate,44418,3


### Add column to make solvent an integer

In [5]:
# Initiate solvent_id list
solvent_id = []

# Define solvent numerical id's
water_id = 0
fresh_dialysate_id = 1

# Populate solvent_id list with solvent numerical id's
for i in abs_data['solvent']:
    if i == 'water':
        solvent_id.append(water_id)
    else:
        solvent_id.append(fresh_dialysate_id)

# Add solvent_id list to Pandas DataFrame for absorption data (abs_data)
abs_data['solvent_id'] = solvent_id

### Specify features matrix as `X_columns` - wavelength and absorbance
We won't confuse the model with any other data right now.

In [6]:
X = abs_data[['wavelength (nm)', 'absorbance (A)', 'stdev.p', 'solvent_id']].values
y = abs_data[['urea_concentration (mM)']].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, shuffle=True)

### Observe linear regression for urea concentration

In [7]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Use model to fit to the training data, the X values are times and the Y values are positions of the Cheetah
regr.fit(X_train, y_train)
beta1 = regr.coef_[0][0]
beta0 = regr.intercept_[0]

# Print the slope m and intercept b
print('Scikit learn - Slope: ', beta1 , 'Intercept: ', beta0 )

# From the equation
Y_calc_test_2 = beta1*X_test + beta0

# Another way to get this is using the regr.predict function
Y_calc_test = regr.predict(X_test)

# Predict the values of  𝑦  in the test set using our fitted parameters.
Y_calc_test = regr.predict(X_test)

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, Y_calc_test))

# Print the coefficient of determination - 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, Y_calc_test))

# OLS Regression
mreg = sm.OLS(y_train, X_train).fit()
mreg.summary(alpha=0.1) # Set significance level

Scikit learn - Slope:  0.011822247589995888 Intercept:  21.748588442630805
Mean squared error: 821.22
Coefficient of determination: 0.10


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.422
Model:,OLS,Adj. R-squared (uncentered):,0.422
Method:,Least Squares,F-statistic:,573.9
Date:,"Fri, 12 Nov 2021",Prob (F-statistic):,0.0
Time:,09:46:20,Log-Likelihood:,-14946.0
No. Observations:,3144,AIC:,29900.0
Df Residuals:,3140,BIC:,29920.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.05,0.95]
x1,0.0715,0.002,41.459,0.000,0.069,0.074
x2,9.7234,1.512,6.430,0.000,7.235,12.211
x3,1006.9092,183.570,5.485,0.000,704.874,1308.944
x4,-13.7247,1.313,-10.450,0.000,-15.886,-11.564

0,1,2,3
Omnibus:,911.089,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2054.093
Skew:,1.663,Prob(JB):,0.0
Kurtosis:,5.15,Cond. No.,118000.0


### Choose just water for model
The model is confused using both water and fresh dialysate

In [14]:
# Specify DF of just the water runs
abs_data_water = abs_data.head(3265)

# Define features and target
X = abs_data_water[['wavelength (nm)', 'absorbance (A)', 'stdev.p', 'solvent_id']].values
y = abs_data_water[['urea_concentration (mM)']].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, shuffle=True)

In [15]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Use model to fit to the training data, the X values are times and the Y values are positions of the Cheetah
regr.fit(X_train, y_train)
beta1 = regr.coef_[0][0]
beta0 = regr.intercept_[0]

# Print the slope m and intercept b
print('Scikit learn - Slope: ', beta1 , 'Intercept: ', beta0 )

# From the equation
Y_calc_test_2 = beta1*X_test + beta0

# Another way to get this is using the regr.predict function
Y_calc_test = regr.predict(X_test)

# Predict the values of  𝑦  in the test set using our fitted parameters.
Y_calc_test = regr.predict(X_test)

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, Y_calc_test))

# Print the coefficient of determination - 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, Y_calc_test))

# OLS Regression
mreg = sm.OLS(y_train, X_train).fit()
mreg.summary(alpha=0.1) # Set significance level

Scikit learn - Slope:  0.013005041007122537 Intercept:  21.91031876138851
Mean squared error: 960.12
Coefficient of determination: 0.01


  return np.sqrt(eigvals[0]/eigvals[-1])


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.432
Model:,OLS,Adj. R-squared (uncentered):,0.431
Method:,Least Squares,F-statistic:,660.5
Date:,"Fri, 12 Nov 2021",Prob (F-statistic):,2.13e-319
Time:,09:47:40,Log-Likelihood:,-12661.0
No. Observations:,2612,AIC:,25330.0
Df Residuals:,2609,BIC:,25350.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.05,0.95]
x1,0.0728,0.002,38.059,0.000,0.070,0.076
x2,10.3975,1.700,6.114,0.000,7.599,13.196
x3,1125.7079,219.192,5.136,0.000,765.040,1486.375
const,0,0,,,0,0

0,1,2,3
Omnibus:,591.551,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1069.503
Skew:,1.459,Prob(JB):,5.76e-233
Kurtosis:,4.147,Cond. No.,inf
