# notebooks.smart_microfluidics_models

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

In [None]:
# Load the raw dataset into `data`; the training cells below read from it.
# The file is decoded as latin1 instead of pandas' default UTF-8.
file_path = "data.csv"
data = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="latin1",
)

## Random forest regressor

In [3]:
# Forward model: predict SIZE and PDI from every remaining column except ID.
features = data.drop(columns=["ID", "SIZE", "PDI"])
targets = data[["SIZE", "PDI"]]
categorical_columns = features.select_dtypes(include=["object"]).columns
# Not consumed by the transformer below: remainder="passthrough" forwards every
# column that is not one-hot encoded, unchanged.
numerical_columns = features.select_dtypes(include=["float64", "int64"]).columns

# handle_unknown="ignore" keeps the fitted pipeline usable when predict() sees a
# categorical level that was absent from the training split (a rare level that
# only landed in X_test, or a new value supplied at inference time). With the
# default ("error") the encoder raises ValueError in that situation.
# Combined with drop="first", an unknown level is encoded as all zeros (the same
# code as the dropped first level) and scikit-learn emits a warning. The encoding
# of levels seen during fit is unchanged. Requires scikit-learn >= 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",
)
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)
model.fit(X_train, y_train)

# Persist the whole fitted pipeline (encoder + forest) so that inference can be
# run on raw, un-encoded rows.
pickle_file_path = "random_forest_model.pkl"
with open(pickle_file_path, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {pickle_file_path}")

Model saved to random_forest_model.pkl


In [7]:
# Reload the pickled random-forest pipeline from disk.
with open("random_forest_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# One example row, keyed by the column names the pipeline is called with.
# Note that the "TFR " key carries a trailing space.
sample_record = {
    "ML": "HSPC",
    "CHIP": "Micromixer",
    "TLP": 5.0,
    "ESM": 0.0,
    "HSPC": 3.75,
    "CHOL": 0.0,
    "PEG": 1.25,
    "TFR ": 1.0,
    "FRR": 3.0,
    "FR-O": 0.25,
    "FR-W": 0.75,
    "BUFFER": "PBS",
    "OUTPUT": "YES",
}
# Wrap each scalar in a one-element list to get a single-row frame.
input_data = pd.DataFrame({column: [value] for column, value in sample_record.items()})

predictions = loaded_model.predict(input_data)
print("Predicted SIZE and PDI:", predictions)


Predicted SIZE and PDI: [[121.99666667   0.3471169 ]]


## XGBoost

In [9]:
# Forward model (XGBoost variant): predict SIZE and PDI from every remaining
# column except ID.
features = data.drop(columns=["ID", "SIZE", "PDI"])
targets = data[["SIZE", "PDI"]]
categorical_columns = features.select_dtypes(include=["object"]).columns
# Not consumed by the transformer below: remainder="passthrough" forwards every
# column that is not one-hot encoded, unchanged.
numerical_columns = features.select_dtypes(include=["float64", "int64"]).columns

# handle_unknown="ignore" keeps the fitted pipeline usable when predict() sees a
# categorical level that was absent from the training split; the default
# ("error") raises ValueError in that case. Combined with drop="first", an unknown
# level is encoded as all zeros (the same code as the dropped first level) and
# scikit-learn emits a warning. Levels seen during fit are encoded exactly as
# before. Requires scikit-learn >= 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",
)
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)
# MultiOutputRegressor fits one independent XGBRegressor per target column.
xgboost_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", MultiOutputRegressor(XGBRegressor(n_estimators=100, random_state=42))),
    ]
)

xgboost_model.fit(X_train, y_train)

# Persist the whole fitted pipeline (encoder + boosted trees).
pickle_file_path = "xgboost_model.pkl"
with open(pickle_file_path, "wb") as file:
    pickle.dump(xgboost_model, file)
print(f"Model saved to {pickle_file_path}")

Model saved to xgboost_model.pkl


## Inverse problem

In [11]:
# Inverse model: the roles are swapped relative to the forward models above.
# The measured outcomes (SIZE, PDI) become the model inputs and the numeric
# columns listed below become the quantities being predicted.
numerical_columns = [
    "TLP",
    "ESM",
    "HSPC",
    "CHOL",
    "PEG",
    "TFR ",  # trailing space is part of the column name as used in this notebook
    "FRR",
    "FR-O",
    "FR-W",
]
features = data[numerical_columns]
targets = data[["SIZE", "PDI"]]

# `targets` is passed first on purpose: X_* holds SIZE/PDI, y_* holds the
# numeric columns.
X_train, X_test, y_train, y_test = train_test_split(
    targets,
    features,
    test_size=0.2,
    random_state=42,
)

# One XGBRegressor is fitted per predicted column.
inverse_model = MultiOutputRegressor(
    XGBRegressor(n_estimators=100, random_state=42)
)
inverse_model.fit(X_train, y_train)

pickle_file_path = "inverse_xgboost_model.pkl"
with open(pickle_file_path, "wb") as model_file:
    pickle.dump(inverse_model, model_file)

print(f"Inverse model saved to {pickle_file_path}")


Inverse model saved to inverse_xgboost_model.pkl
