#notebooks.simple_models

## Setup

In [5]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## Data

In [6]:
# Relative path to the cleaned dataset; it is resolved against the working
# directory the notebook is run from.
file_path = "smart_microfluidics/data/cleaned_data.csv"

# The CSV is decoded as latin1 instead of pandas' default (UTF-8).
data = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="latin1",
)

In [7]:
# Keep only the rows whose OUTPUT flag equals 1; the model cells further down
# are all trained on this subset.
# NOTE(review): OUTPUT == 1 presumably marks a successful formulation — confirm
# against the dataset description.
yes_data = data[data["OUTPUT"] == 1]

## Random forest regressor

In [9]:
# Targets are the two measured responses; the inputs are every remaining
# column except the identifier and the OUTPUT flag.
targets = yes_data[["SIZE", "PDI"]]
features = yes_data.drop(columns=["ID", "SIZE", "PDI", "OUTPUT"])

# Column groups by dtype. Only the categorical group is consumed below.
categorical_columns = features.select_dtypes(include=["object"]).columns
numerical_columns = features.select_dtypes(include=["float64", "int64"]).columns

# Text columns are one-hot encoded (first level dropped); every other column
# is handed to the regressor unchanged via remainder="passthrough".
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), categorical_columns),
    ],
    remainder="passthrough",
)

# 80/20 hold-out split with a fixed seed so the scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Preprocessing and the forest are bundled so the pickled object accepts the
# raw feature frame directly.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)
model.fit(X_train, y_train)

# Hold-out scores. Each metric is called with its defaults on the two-column
# target, so a single aggregate number is printed for SIZE and PDI together.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

R-squared: 0.36157896454981364
Mean Squared Error: 1958.890858993266
Mean Absolute Error: 15.086741645521377


In [None]:
# Destination of the serialised random-forest pipeline (relative path).
pickle_file_path = "smart_microfluidics/models/random_forest_model.pkl"

# Write the fitted pipeline (preprocessor + regressor) in binary mode.
with open(pickle_file_path, mode="wb") as pickle_file:
    pickle.dump(model, pickle_file)

print(f"Model saved to {pickle_file_path}")

### Example of usage

In [10]:
# Only unpickle model files produced by this project: pickle.load can execute
# arbitrary code contained in the file.
with open("smart_microfluidics/models/random_forest_model.pkl", mode="rb") as file:
    loaded_model = pickle.load(file)

# One formulation, written as a single record (= one row of the frame).
# The "TFR " key carries a trailing space, exactly as used elsewhere in this
# notebook.
# NOTE(review): OUTPUT is supplied here although the random-forest training
# cell drops OUTPUT from its features — confirm the pipeline ignores it.
input_data = pd.DataFrame(
    [
        {
            "ML": "HSPC",
            "CHIP": "Micromixer",
            "TLP": 5.0,
            "ESM": 0.0,
            "HSPC": 3.75,
            "CHOL": 0.0,
            "PEG": 1.25,
            "TFR ": 1.0,
            "FRR": 3.0,
            "BUFFER": "PBS",
            "OUTPUT": 1,
        }
    ]
)

# The model returns one row per input row, with columns [SIZE, PDI].
predictions = loaded_model.predict(input_data)
print("Predicted SIZE and PDI:", predictions)


Predicted SIZE and PDI: [[115.64875      0.34697875]]


## XGBoost

In [11]:
# Inputs: every column except the identifier and the two targets.
# NOTE(review): OUTPUT stays in the feature set here, while the random-forest
# section drops it; yes_data only contains rows with OUTPUT == 1, so the column
# is constant — confirm whether it should be dropped here too.
features = yes_data.drop(columns=["ID", "SIZE", "PDI"])
targets = yes_data[["SIZE", "PDI"]]
categorical_columns = features.select_dtypes(include=["object"]).columns

# Text columns are one-hot encoded (first level dropped); every other column
# is passed to the regressor unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), categorical_columns),
    ],
    remainder="passthrough",
)

# 80/20 hold-out split with a fixed seed so the scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

# MultiOutputRegressor fits one XGBRegressor per target column (SIZE, PDI).
xgboost_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            MultiOutputRegressor(XGBRegressor(n_estimators=100, random_state=42)),
        ),
    ]
)

xgboost_model.fit(X_train, y_train)

# Hold-out scores. Each metric is called with its defaults on the two-column
# target, so a single aggregate number is printed for SIZE and PDI together.
y_pred = xgboost_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

# Persist the fitted pipeline. The usage example for this section loads
# "xgboost_model.pkl", and no other cell in the notebook writes that file, so
# without this step a fresh Restart & Run All fails at the load.
xgboost_pickle_path = "smart_microfluidics/models/xgboost_model.pkl"
with open(xgboost_pickle_path, "wb") as file:
    pickle.dump(xgboost_model, file)

R-squared: 0.32854801416397095
Mean Squared Error: 1967.81201171875
Mean Absolute Error: 14.646432876586914


### Example of usage

In [12]:
# Only unpickle model files produced by this project: pickle.load can execute
# arbitrary code contained in the file.
with open("smart_microfluidics/models/xgboost_model.pkl", mode="rb") as file:
    loaded_model = pickle.load(file)

# One formulation, written as a single record (= one row of the frame).
# The "TFR " key carries a trailing space, exactly as used elsewhere in this
# notebook. OUTPUT is part of the XGBoost feature set, so it is supplied too.
input_data = pd.DataFrame(
    [
        {
            "ML": "HSPC",
            "CHIP": "Micromixer",
            "TLP": 5.0,
            "ESM": 0.0,
            "HSPC": 3.75,
            "CHOL": 0.0,
            "PEG": 1.25,
            "TFR ": 1.0,
            "FRR": 3.0,
            "BUFFER": "PBS",
            "OUTPUT": 1,
        }
    ]
)

# The model returns one row per input row, with columns [SIZE, PDI].
predictions = loaded_model.predict(input_data)
print("Predicted SIZE and PDI:", predictions)


Predicted SIZE and PDI: [[118.00421      0.33404157]]


## Inverse problem

In [13]:
# Inverse problem: learn the mapping from the measured responses (SIZE, PDI)
# back to the numeric settings, i.e. inputs and outputs swap roles compared
# with the forward models.
targets = yes_data[["SIZE", "PDI"]]
# "TFR " carries a trailing space, as in the other cells of this notebook.
numerical_columns = ["ESM", "HSPC", "CHOL", "PEG", "TFR ", "FRR"]
features = yes_data[numerical_columns]

# Note the argument order: X_* hold SIZE/PDI and y_* hold the six settings.
X_train, X_test, y_train, y_test = train_test_split(
    targets, features, test_size=0.2, random_state=42
)

# One XGBRegressor per predicted setting. The model is fitted exactly once;
# an immediately repeated fit on the same data only doubles the training time.
inverse_model = MultiOutputRegressor(XGBRegressor(n_estimators=100, random_state=42))
inverse_model.fit(X_train, y_train)

# Hold-out scores. Each metric is called with its defaults on the six-column
# target, so a single aggregate number is printed for all settings together.
y_pred = inverse_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

R-squared: 0.11767681688070297
Mean Squared Error: 89.26007080078125
Mean Absolute Error: 3.7459394931793213


In [None]:
# Destination of the serialised inverse model (relative path).
pickle_file_path = "smart_microfluidics/models/inverse_xgboost_model.pkl"

# Write the fitted multi-output regressor in binary mode.
with open(pickle_file_path, mode="wb") as pickle_file:
    pickle.dump(inverse_model, pickle_file)

print(f"Inverse model saved to {pickle_file_path}")

### Example of usage

In [14]:
# Only unpickle model files produced by this project: pickle.load can execute
# arbitrary code contained in the file.
with open("smart_microfluidics/models/inverse_xgboost_model.pkl", "rb") as file:
    inverse_model = pickle.load(file)

# Desired responses for which settings are requested (one row).
input_data = pd.DataFrame({"SIZE": [118.0], "PDI": [0.33]})

predictions = inverse_model.predict(input_data)
# Every prediction below 0.5 is reported as 0. This also floors negative raw
# outputs to 0; taking the absolute value first would instead mirror e.g. -2
# into a spurious +2 setting.
predictions = np.where(predictions < 0.5, 0, predictions)
print("Predicted values for ESM, HSPC, CHOL, PEG, TFR, and FRR:", predictions)

Predicted values for ESM, HSPC, CHOL, PEG, TFR, and FRR: [[0.        3.7620564 0.        1.2572409 0.9628239 3.0061107]]
