In [None]:
# Build a classification model for biscuit type.
# Features are eta, gamma, phi, t and L.

# Apply the classifier to the microscopy data.
# Investigate the pore radius (r) for each biscuit type.
# Can we predict biscuit type from the (r) values?

# Regression models.
# How do the (r) values change with the biscuit type?
# Can we predict (L) better than the Washburn equation?
# If so, how much better?



## Making a classifier model

Random Forest and Gradient Boosting classifiers for predicting biscuit type

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read every input table used by the notebook.
dunking_data = pd.read_csv('data/dunking-data.csv')
microscopy_data = pd.read_csv('data/microscopy-data.csv')
tr1_data, tr2_data, tr3_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/tr-1.csv', 'data/tr-2.csv', 'data/tr-3.csv')
)

# Fixed properties of the tea, attached below to each time-resolved table.
tea_surface_tension = 6.78e-2  # N m^-1
contact_angle = 1.45  # rad
tea_dynamic_viscosity = 9.93e-4  # Pa s

# The tr-* tables get the same three constant columns, added in the order
# eta, gamma, phi so the resulting column layout is the same for all three.
for tr_frame in (tr1_data, tr2_data, tr3_data):
    tr_frame['eta'] = tea_dynamic_viscosity
    tr_frame['gamma'] = tea_surface_tension
    tr_frame['phi'] = contact_angle

# Quick look at one augmented table and at the labelled dunking data.
print(tr1_data)
print(dunking_data)

             t         L        dL       eta   gamma   phi
0    30.000000  0.008087  0.000392  0.000993  0.0678  1.45
1    32.727273  0.008253  0.000270  0.000993  0.0678  1.45
2    35.454545  0.008607  0.000501  0.000993  0.0678  1.45
3    38.181818  0.008920  0.000267  0.000993  0.0678  1.45
4    40.909091  0.009604  0.000274  0.000993  0.0678  1.45
..         ...       ...       ...       ...     ...   ...
95  289.090909  0.024948  0.000307  0.000993  0.0678  1.45
96  291.818182  0.025015  0.000711  0.000993  0.0678  1.45
97  294.545455  0.024558  0.000322  0.000993  0.0678  1.45
98  297.272727  0.025243  0.000489  0.000993  0.0678  1.45
99  300.000000  0.024784  0.000851  0.000993  0.0678  1.45

[100 rows x 6 columns]
         gamma       phi       eta         L          t    biscuit
0     0.073897  1.333006  0.000999  0.011196  19.362214  Digestive
1     0.080946  1.476758  0.001012  0.005894  11.852589  Digestive
2     0.087408  1.477141  0.000984  0.009249  24.793669  Digestive


In [11]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Preprocessing
RANDOM_STATE = 42  # one seed shared by the split, the models and the searches

# NOTE: this cell overwrites dunking_data['biscuit'] with integer codes, so
# re-running it without reloading the CSV re-fits the encoder on those codes
# and `le.classes_` no longer holds the biscuit names.
# `.copy()` makes the cleaned frame independent of the raw one, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
dunking_data = dunking_data.dropna().copy()  # drop missing values
le = LabelEncoder()
dunking_data['biscuit'] = le.fit_transform(dunking_data['biscuit'])  # encode target variable

# Split data
X = dunking_data.drop('biscuit', axis=1)
y = dunking_data['biscuit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Define hyperparameter grids for each model (adjust as needed)
rf_param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_param_dist = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8]
}


def _build_pipeline(clf, use_pca=False):
    """Wrap `clf` in a Pipeline, optionally preceded by scaling and PCA.

    PCA centres its input but does not scale it. The features in the printed
    sample span several orders of magnitude (eta ~1e-3, t ~1e1), so without
    standardisation the principal components are dominated by the
    large-valued columns. A StandardScaler is therefore placed before PCA.

    Parameters:
    clf: the final estimator (a classifier or a search object wrapping one)
    use_pca: when True, prepend ('scale', StandardScaler) and ('pca', PCA)

    Returns:
    An unfitted sklearn Pipeline whose last step is named 'clf'.
    """
    steps = []
    if use_pca:
        steps.append(('scale', StandardScaler()))
        steps.append(('pca', PCA(n_components=4)))
    steps.append(('clf', clf))
    return Pipeline(steps)


def _tuned(clf, param_dist):
    """Return a seeded 5-fold RandomizedSearchCV over `param_dist` for `clf`.

    `n_iter` is capped at the number of distinct combinations in
    `param_dist`: asking for more than exist makes sklearn emit a warning and
    fall back to enumerating the whole grid anyway.
    """
    n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
    return RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_dist,
        n_iter=min(100, n_candidates),
        cv=5,
        random_state=RANDOM_STATE,
    )


# Create classifier pipelines (every estimator is seeded for reproducibility)
pipe_rf = _build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE))
pipe_gb = _build_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE))
pipe_PCA_rf = _build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE), use_pca=True)
pipe_PCA_gb = _build_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE), use_pca=True)

# Create pipelines with hyperparameter tuning
pipe_rf_tuning = _build_pipeline(
    _tuned(RandomForestClassifier(random_state=RANDOM_STATE), rf_param_dist))
pipe_gb_tuning = _build_pipeline(
    _tuned(GradientBoostingClassifier(random_state=RANDOM_STATE), gb_param_dist))
pipe_PCA_rf_tuning = _build_pipeline(
    _tuned(RandomForestClassifier(random_state=RANDOM_STATE), rf_param_dist), use_pca=True)
pipe_PCA_gb_tuning = _build_pipeline(
    _tuned(GradientBoostingClassifier(random_state=RANDOM_STATE), gb_param_dist), use_pca=True)

pipelines = [pipe_rf, pipe_gb, pipe_PCA_rf, pipe_PCA_gb, pipe_rf_tuning, pipe_gb_tuning, pipe_PCA_rf_tuning, pipe_PCA_gb_tuning]
pipeline_names = ['Random Forest', 'Gradient Boosting', 'PCA-Random Forest', 'PCA-Gradient Boosting', 'Random Forest Tuned', 'Gradient Boosting Tuned', 'PCA-Random Forest Tuned', 'PCA-Gradient Boosting Tuned']

# Fit each pipeline on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Compare accuracies on the held-out test split
for name, pipe in zip(pipeline_names, pipelines):
    print('{} pipeline test accuracy: {}'.format(name, pipe.score(X_test, y_test)))




In [10]:
# Use the best model to predict biscuit types for the microscopy data


In [None]:
# Investigate the pore radius (r) of the different biscuit types

## Predicting (L) using ML and the Washburn equation

In [None]:
import numpy as np

def calculate_L(gamma, r, t, phi, eta):
    """
    Washburn estimate of how far a liquid penetrates into a porous solid.

    Evaluates sqrt(gamma * r * t * cos(phi) / (2 * eta)) with NumPy, so each
    argument may be a scalar or an element-wise compatible array/Series.

    Parameters:
    gamma: surface tension of the liquid
    r: radius of the capillary pore
    t: time for which the capillary flow has been occurring
    phi: contact angle between the solid and the liquid, passed to np.cos
    eta: dynamic viscosity of the liquid

    Returns:
    The penetration distance L of the liquid into the solid.
    """
    capillary_drive = gamma * r * t * np.cos(phi)
    viscous_drag = 2 * eta
    return np.sqrt(capillary_drive / viscous_drag)

In [None]:
# Washburn prediction of L for every microscopy sample, taken from that
# sample's own gamma, r, t, phi and eta columns.
washburn_inputs = {
    column: microscopy_data[column] for column in ("gamma", "r", "t", "phi", "eta")
}
microscopy_data["L_washburn"] = calculate_L(**washburn_inputs)

# Residual = measured L minus the Washburn prediction.
measured_L = microscopy_data["L"]
microscopy_data["L_washburn_residuals"] = measured_L - microscopy_data["L_washburn"]