In [19]:
import r3.schema as schema
import r3.adapters as adapters
import r3.schema as schema
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C
import xgboost
from sklearn.ensemble import RandomForestRegressor
import numpy as np


# Columns requested from the data pipeline: five reaction-condition inputs and
# one measured response, all addressed through their schema column names.
X_columns = [
    schema.IVTReactionSchema.NTP_M.value,
    schema.IVTReactionSchema.T7RNAP_u_uL.value,
    schema.IVTReactionSchema.DNA_ug_mL.value,
    schema.IVTReactionSchema.Mg2_M.value,
    schema.IVTReactionSchema.TIME_min.value,
]
y_columns = [schema.IVTReactionSchema.RNA_g_L.value]

# NOTE(review): "csp_lhs" is passed to the adapter as-is; presumably it names
# the dataset to load -- confirm against DataPipelineAdapter.
experimental_conditions, response = adapters.DataPipelineAdapter(
    "csp_lhs", verbose=False
).get(X_columns=X_columns, y_columns=y_columns)

# Remove sample 11 (the reason for the exclusion is not recorded in this cell).
experimental_conditions = experimental_conditions.drop(index=[11])
response = response.drop(index=[11])

# Build the combined table on a COPY. Assigning the response column through a
# plain alias (`experimental_data = experimental_conditions`) would also add
# the response column to `experimental_conditions`, so the "inputs only" frame
# would silently carry the target.
experimental_data = experimental_conditions.copy()
experimental_data[y_columns[0]] = response[y_columns[0]]
experimental_data

Unnamed: 0,NTPs [M],T7RNAP [units/uL],DNA [µg/mL],Mg2+ [M],Reaction Time [min],RNA [g/L]
0,0.012,238,99,0.043,105.0,3.2185
1,0.023,161,89,0.053,67.2,5.956
2,0.011,123,40,0.06,220.8,3.0215
3,0.013,180,56,0.039,112.2,3.4645
4,0.033,338,75,0.029,73.8,6.694
5,0.045,392,83,0.034,50.4,1.57
6,0.034,316,66,0.058,30.6,7.178
7,0.051,172,58,0.019,234.0,0.055
8,0.019,379,80,0.011,26.4,0.088296
9,0.009,109,49,0.013,16.2,0.582245


In [None]:
# Evaluate one hard-coded quadratic per factor (a*x**2 + b*x + c) at the
# experimental conditions. The coefficients are literal constants here.
# NOTE(review): presumably they come from earlier single-factor fits --
# confirm their provenance.
# (This block used to appear twice verbatim; a single evaluation yields the
# same values.)
ntp = experimental_conditions[schema.IVTReactionSchema.NTP_M.value].values.ravel()
ntp_equation = -8240.881 * ntp**2 + 549.265 * ntp**1 + -2.635 * ntp**0

t7_rnap = experimental_conditions[
    schema.IVTReactionSchema.T7RNAP_u_uL.value
].values.ravel()
# The quadratic coefficient is written as 0.000, so this curve is effectively
# linear in T7RNAP; the term is kept so the expression stays unchanged.
t7_rnap_equation = 0.000 * t7_rnap**2 - 0.003 * t7_rnap + 2.444 * t7_rnap**0

mg2 = experimental_conditions[schema.IVTReactionSchema.Mg2_M.value].values.ravel()
mg2_equation = -2137.944 * mg2**2 + 248.926 * mg2 + -1.388 * mg2**0

# Select the statistically significant features. Only the three raw factors
# are used; the per-factor polynomial terms are not part of the matrix.
features = [
    schema.IVTReactionSchema.NTP_M.value,
    schema.IVTReactionSchema.T7RNAP_u_uL.value,
    schema.IVTReactionSchema.Mg2_M.value,
]

# Prepare the feature matrix from a copy so `experimental_data` is untouched.
df = experimental_data.copy()

# Index with the schema-derived names instead of repeating the column labels
# as string literals, so the `features` list is the single source of truth
# for which inputs the models see (same columns, same order as before).
X = df[features].values

# Target: the measured response column (displayed as "RNA [g/L]").
y = df[schema.IVTReactionSchema.RNA_g_L.value].values


# Train-test split (80% train, 20% test); the fixed seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize using training set statistics: both scalers are fitted on the
# training portion only and then applied to train and test alike.
scaler_X = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# StandardScaler expects 2-D input, so the 1-D target is reshaped to a single
# column for scaling and flattened back to 1-D afterwards.
scaler_y = StandardScaler().fit(y_train.reshape(-1, 1))
y_train_scaled = scaler_y.transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()



# Gaussian Process
# Kernel: (constant * RBF) + white noise, every hyperparameter bounded to
# [1e-2, 1e2]. The optimizer is restarted 50 times with a fixed seed.
gp_kernel = C(constant_value=1.0, constant_value_bounds=(1e-2, 1e2)) * RBF(
    length_scale=1.0, length_scale_bounds=(1e-2, 1e2)
) + WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-2, 1e2))

rbf_gp = GaussianProcessRegressor(
    kernel=gp_kernel, n_restarts_optimizer=50, random_state=42
)
rbf_gp.fit(X_train_scaled, y_train_scaled)

# The model is trained on the standardized target, so predictions are mapped
# back to the original response units before they are stored.
gp_train_scaled = rbf_gp.predict(X_train_scaled).reshape(-1, 1)
y_pred_gp_train = scaler_y.inverse_transform(gp_train_scaled).flatten()

gp_test_scaled = rbf_gp.predict(X_test_scaled).reshape(-1, 1)
y_pred_gp_test = scaler_y.inverse_transform(gp_test_scaled).flatten()



# XGBoost
# Hyperparameters are collected in one dict so they can be read at a glance.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.1,
    "objective": "reg:squarederror",
    "random_state": 42,
}
xgb = xgboost.XGBRegressor(**xgb_params)
xgb.fit(X_train_scaled, y_train_scaled)

# Predictions come out in standardized target units; undo the target scaling
# so they are comparable with y_train / y_test.
xgb_train_scaled = xgb.predict(X_train_scaled).reshape(-1, 1)
y_pred_xgb_train = scaler_y.inverse_transform(xgb_train_scaled).flatten()

xgb_test_scaled = xgb.predict(X_test_scaled).reshape(-1, 1)
y_pred_xgb_test = scaler_y.inverse_transform(xgb_test_scaled).flatten()



# Random Forest
# 100 trees with a fixed seed, fitted on the standardized inputs and target.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train_scaled, y_train_scaled)

# Map the standardized predictions back to the original response units.
rf_train_scaled = rf.predict(X_train_scaled).reshape(-1, 1)
y_pred_rf_train = scaler_y.inverse_transform(rf_train_scaled).flatten()

rf_test_scaled = rf.predict(X_test_scaled).reshape(-1, 1)
y_pred_rf_test = scaler_y.inverse_transform(rf_test_scaled).flatten()


def _print_split_metrics(split_label, y_true, y_pred):
    """Print RMSE, R2 and MAE for one model on one data split.

    Parameters
    ----------
    split_label : str
        Name of the split ("Train" or "Test"). It is left-padded to five
        characters so the train and test lines align.
    y_true : array-like
        Observed response values, in original (unscaled) units.
    y_pred : array-like
        Predicted response values, in the same units as ``y_true``.
    """
    print(
        f"  {split_label:<5} RMSE =",
        np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2 =",
        r2_score(y_true, y_pred),
        "MAE =",
        mean_absolute_error(y_true, y_pred),
    )


# One (name, train predictions, test predictions) entry per fitted model; the
# loop replaces six hand-copied print calls and emits the same text.
for model_name, train_pred, test_pred in [
    ("Gaussian Process (RBF)", y_pred_gp_train, y_pred_gp_test),
    ("XGBoost", y_pred_xgb_train, y_pred_xgb_test),
    ("Random Forest", y_pred_rf_train, y_pred_rf_test),
]:
    print(f"{model_name}:")
    _print_split_metrics("Train", y_train, train_pred)
    _print_split_metrics("Test", y_test, test_pred)

Gaussian Process (RBF):
  Train RMSE = 1.6186704609013016 R2 = 0.732064962806998 MAE = 1.2400828327995417
  Test  RMSE = 2.298875676031707 R2 = 0.4224915957445259 MAE = 1.779806895590033
XGBoost:
  Train RMSE = 0.3517783158390011 R2 = 0.9873453319658378 MAE = 0.24424245739009529
  Test  RMSE = 1.9004603375099558 R2 = 0.6053201457306783 MAE = 1.4453367581367498
Random Forest:
  Train RMSE = 0.8422351211384034 R2 = 0.9274597043596534 MAE = 0.6401762457446802
  Test  RMSE = 2.045427482672889 R2 = 0.5428112741204241 MAE = 1.6699657511458328
