<a href="https://colab.research.google.com/github/kunalsanga/BIO_reactor/blob/main/BIO_reactor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the dataset from a relative path so its structure can be inspected.
file_path = 'biodata.csv'
data = pd.read_csv(file_path)

# Print the column summary as its own statement: DataFrame.info() writes to
# stdout and returns None, so bundling it into a tuple with head() made the
# cell display "(<plain-text head>, None)" instead of a rendered table.
data.info()

# Keep head() as the cell's final expression so it gets the rich display.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 60 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   paper_number                             1209 non-null   int64  
 1   gen_info                                 1203 non-null   float64
 2   env_info                                 1203 non-null   float64
 3   good                                     1209 non-null   int64  
 4   no_of_designs                            1209 non-null   object 
 5   cs1                                      1191 non-null   float64
 6   cs1_mw                                   1105 non-null   float64
 7   cs_conc1                                 1175 non-null   float64
 8   CS_C1                                    1105 non-null   float64
 9   CS_H1                                    1105 non-null   float64
 10  CS_O1                                    1105 no

(   paper_number  gen_info  env_info  good no_of_designs  cs1  cs1_mw  \
 0             1       1.0       1.0     1             1  9.0   92.09   
 1             2       0.0       1.0     1             1  1.0  180.16   
 2             2       1.0       1.0     1             2  1.0  180.16   
 3             2       1.0       1.0     1             3  1.0  180.16   
 4             2       1.0       1.0     1             4  1.0  180.16   
 
       cs_conc1  CS_C1  CS_H1  ...  atp_cost  nadh_nadph_cost  yield  yield_o  \
 0  2714.735585    3.0    8.0  ...       7.0             14.0  0.060      NaN   
 1    83.259325    6.0   12.0  ...       5.0             10.0  0.012      NaN   
 2    83.259325    6.0   12.0  ...       5.0             10.0  0.016      NaN   
 3    83.259325    6.0   12.0  ...       5.0             10.0  0.017      NaN   
 4    83.259325    6.0   12.0  ...       7.0             14.0  0.020      NaN   
 
    yield2  titer   rate  fermentation_time  bio_titre  bio_growth_rate 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
# Narrow the frame to the modelling columns: the target ('oxygen') plus the
# three predictors used in the cells below.
columns = ['temp', 'oxygen', 'rxt_volume', 'media']
filtered_data = data.loc[:, columns]

In [None]:
# Missing-value handling, done as one chain:
#   1. rows without a target value ('oxygen') are discarded;
#   2. numeric predictors are filled with their column mean, computed on the
#      rows that survive step 1;
#   3. missing 'media' entries get the placeholder category 'Unknown'.
# NOTE(review): the means are taken over all remaining rows, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the imputed values — consider imputing after the split.
filtered_data = (
    filtered_data
    .dropna(subset=['oxygen'])
    .assign(
        temp=lambda frame: frame['temp'].fillna(frame['temp'].mean()),
        rxt_volume=lambda frame: frame['rxt_volume'].fillna(frame['rxt_volume'].mean()),
        media=lambda frame: frame['media'].fillna('Unknown'),
    )
)

In [None]:
# One-hot encode 'media'. drop_first=True omits the indicator column of the
# first category, so k categories become k-1 columns.
# NOTE(review): this overwrites filtered_data, so re-running the cell by
# itself fails because the 'media' column no longer exists.
filtered_data = pd.get_dummies(
    data=filtered_data,
    columns=['media'],
    drop_first=True,
)

In [None]:
# Separate the target from the predictors: y is the 'oxygen' column, X is
# every other column of the encoded frame.
y = filtered_data['oxygen']
X = filtered_data.drop('oxygen', axis=1)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest regressor with default hyperparameters on the training
# split; random_state is fixed so the fitted forest is reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Generate predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
# Score the held-out predictions with three regression metrics, in order:
# mean absolute error, mean squared error, and R^2.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Show the three scores as the cell output.
mae, mse, r2

(0.20344565643422352, 0.17898199415735008, 0.695777751809807)