#notebooks.models_extended_2

## Setup

In [1]:
import os

# Name of the directory this notebook switches into before using relative
# paths (later cells read 'data/cleaned_data.csv' relative to the working dir).
PROJECT_DIR = "smart_microfluidics"

# Only descend when we are not already inside the project directory. Without
# this guard, re-running the cell in a live kernel would try to enter
# smart_microfluidics/smart_microfluidics and either raise FileNotFoundError
# or silently move to the wrong directory.
# NOTE(review): assumes no parent directory on the first run is itself named
# "smart_microfluidics" — confirm against the repository layout.
if os.path.basename(os.getcwd()) != PROJECT_DIR:
    os.chdir(PROJECT_DIR)

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

## Data

In [3]:
# Load the cleaned dataset, then narrow it down one criterion at a time.
# Each (column, value) pair keeps only the rows where the column equals the
# value and then removes that column from the frame.
filtered = pd.read_csv('data/cleaned_data.csv')
for column, keep_value in (("OUTPUT", 1), ("CHIP", "Micromixer"), ("ML", "ESM")):
    filtered = filtered[filtered[column] == keep_value]
    filtered = filtered.drop(columns=[column])

# Discard every remaining object/category column.
non_numeric_columns = filtered.select_dtypes(include=['object', 'category']).columns
df = filtered.drop(columns=non_numeric_columns)

In [4]:
# The two regression targets are pulled out as separate series; every other
# column that survived filtering is used as a model input.
y_size, y_pdi = df["SIZE"], df["PDI"]
X = df.drop(["SIZE", "PDI"], axis=1)

## XGBoost for size predictions

In [5]:
# --- Split: hold out 20% of the rows for the final SIZE evaluation ----------
X_train, X_test, y_train_size, y_test_size = train_test_split(
    X, y_size, test_size=0.2, random_state=42
)

# --- Scale: statistics are learned from the training split only -------------
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model and hyper-parameter candidates ------------------------------------
xgb_model = XGBRegressor(random_state=42)

param_grid_xgb = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 9],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2],
    "lambda": [0, 0.1, 1],
}

# --- Randomized search: 50 sampled candidates, 5-fold CV, scored by -MSE -----
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)
random_search_xgb.fit(X_train_scaled, y_train_size)

# --- Evaluate the best candidate on the held-out split ----------------------
best_xgb_model = random_search_xgb.best_estimator_
y_pred_xgb_size = best_xgb_model.predict(X_test_scaled)
r2_xgb, mse_xgb, mae_xgb = (
    metric(y_test_size, y_pred_xgb_size)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

# --- Report -------------------------------------------------------------------
print(
    "Optimized XGBoost Model for SIZE Evaluation:",
    f"Best Parameters: {random_search_xgb.best_params_}",
    f"R-squared: {r2_xgb}",
    f"Mean Squared Error: {mse_xgb}",
    f"Mean Absolute Error: {mae_xgb}",
    sep="\n",
)

Optimized XGBoost Model for SIZE Evaluation:
Best Parameters: {'subsample': 0.7, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.2, 'lambda': 1, 'gamma': 0, 'colsample_bytree': 0.8}
R-squared: 0.8289789823207528
Mean Squared Error: 463.42585081807505
Mean Absolute Error: 13.469367980957031


## XGBoost for PDI predictions

In [6]:
# --- Split: hold out 20% of the rows for the final PDI evaluation -----------
X_train, X_test, y_train_pdi, y_test_pdi = train_test_split(
    X, y_pdi, test_size=0.2, random_state=42
)

# --- Scale: statistics are learned from the training split only -------------
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model and hyper-parameter candidates ------------------------------------
xgb_model = XGBRegressor(random_state=42)

param_grid_xgb = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 9],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2],
    "lambda": [0, 0.1, 1],
}

# --- Randomized search: 50 sampled candidates, 5-fold CV, scored by -MSE -----
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)
random_search_xgb.fit(X_train_scaled, y_train_pdi)

# --- Evaluate the best candidate on the held-out split ----------------------
best_xgb_model = random_search_xgb.best_estimator_
y_pred_xgb_pdi = best_xgb_model.predict(X_test_scaled)
r2_xgb, mse_xgb, mae_xgb = (
    metric(y_test_pdi, y_pred_xgb_pdi)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

# --- Report -------------------------------------------------------------------
print(
    "Optimized XGBoost Model for PDI Evaluation:",
    f"Best Parameters: {random_search_xgb.best_params_}",
    f"R-squared: {r2_xgb}",
    f"Mean Squared Error: {mse_xgb}",
    f"Mean Absolute Error: {mae_xgb}",
    sep="\n",
)

Optimized XGBoost Model for PDI Evaluation:
Best Parameters: {'subsample': 1.0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.01, 'lambda': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
R-squared: 0.34588894116461155
Mean Squared Error: 0.003431941253060487
Mean Absolute Error: 0.03838176091086297


## Random forest regressor for PDI predictions

In [7]:
# Random-forest regression of PDI: split, scale, tune with a randomized
# search, then score the best candidate on the held-out rows.

# Hold out 20% of the rows for the final PDI evaluation.
X_train, X_test, y_train_pdi, y_test_pdi = train_test_split(X, y_pdi, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf_model = RandomForestRegressor(random_state=42)

# Candidate values the randomized search samples from.
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# 50 sampled candidates, 5-fold cross-validation, scored by negative MSE;
# n_jobs=-1 runs the fits in parallel on all available cores.
random_search_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid_rf,
                                      n_iter=50, scoring='neg_mean_squared_error',
                                      cv=5, random_state=42, n_jobs=-1)
random_search_rf.fit(X_train_scaled, y_train_pdi)


# Predict PDI for the held-out rows with the best estimator found by the search.
best_rf_model = random_search_rf.best_estimator_
y_pred_rf_pdi = best_rf_model.predict(X_test_scaled)

# Evaluate the model
r2_rf = r2_score(y_test_pdi, y_pred_rf_pdi)
mse_rf = mean_squared_error(y_test_pdi, y_pred_rf_pdi)
mae_rf = mean_absolute_error(y_test_pdi, y_pred_rf_pdi)

# Print evaluation metrics.
# The header previously said "SIZE" although this cell trains and scores on
# PDI; the label now names the target that is actually evaluated.
print("Optimized RandomForestRegressor Model for PDI Evaluation:")
print(f"Best Parameters: {random_search_rf.best_params_}")
print(f"R-squared: {r2_rf}")
print(f"Mean Squared Error: {mse_rf}")
print(f"Mean Absolute Error: {mae_rf}")

Optimized RandomForestRegressor Model for SIZE Evaluation:
Best Parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
R-squared: 0.5383643771732456
Mean Squared Error: 0.0024220754510437216
Mean Absolute Error: 0.03355048611111106
