In [1]:
# Cell 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Set random seed for reproducibility
np.random.seed(42)


In [2]:
# Cell 2: Load the dataset
# Load the previously created dataset
df = pd.read_csv("improved_gut_health_dataset.csv")

# Display the first few rows of the dataset
print(df.head())


   Abdominal Pain  Bloating  Diarrhea  Constipation  Dietary Habits  \
0               7         8         2             7               3   
1               7         7         4             5               3   
2               6         6         2             5               2   
3               7         7         4             6               3   
4               8         8         2             7               1   

   Stress Levels  Physical Activity  Age  Medication History       Diet Type  \
0              4                  2   40                   0           Vegan   
1              4                  2   21                   0  Non-Vegetarian   
2              4                  2   31                   0           Vegan   
3              4                  3   34                   0      Vegetarian   
4              3                  2   40                   0      Vegetarian   

  Gut Health Disease  
0                IBS  
1                IBS  
2                IBS  


In [3]:
# Cell 3: Preprocess the dataset
# Encode categorical features
label_encoders = {}
categorical_columns = ["Diet Type", "Gut Health Disease"]

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Separate features and target
X = df.drop(columns=["Gut Health Disease"])
y = df["Gut Health Disease"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Cell 4: Train a Random Forest Classifier with hyperparameter tuning
# Define the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_rf = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s


In [5]:
# Cell 5: Evaluate the model
# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoders["Gut Health Disease"].classes_))


Accuracy: 96.50%
Classification Report:
                    precision    recall  f1-score   support

    Celiac Disease       1.00      1.00      1.00       113
   Crohn's Disease       0.94      0.92      0.93        95
               IBS       1.00      1.00      1.00       104
Ulcerative Colitis       0.91      0.93      0.92        88

          accuracy                           0.96       400
         macro avg       0.96      0.96      0.96       400
      weighted avg       0.97      0.96      0.97       400



In [6]:
# Cell 6: Save the trained model (Optional)
import joblib

# Save the trained model for future use
joblib.dump(best_rf, "gut_health_model.pkl")
print("Trained model saved as gut_health_model.pkl")


Trained model saved as gut_health_model.pkl
