In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import joblib




















In [6]:
pip install numpy pandas scikit-learn imbalanced-learn joblib


Collecting numpyNote: you may need to restart the kernel to use updated packages.

  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting joblib
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting sklearn-compat<1,>

In [8]:
# Dataset location on disk, plus the list of column names kept alongside it.
# NOTE(review): `columns` is not referenced by any other visible cell.
csv_path = r"d:\Downloads\DiseasePredictions-Web-application-with-Flask-main\DataSet\diabetes.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]


In [9]:
# Load dataset safely: read the CSV at `csv_path` into the DataFrame `data`.
# A missing file is re-raised as FileNotFoundError with a message naming the path;
# `from err` attaches the original error as the explicit cause, so the traceback
# reads as one failure rather than as an error raised while handling another.
try:
    data = pd.read_csv(csv_path)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Could not find file at {csv_path}") from err

In [13]:
# Step 2: separate the target column from the features.
# y is the 'Outcome' column; X is every remaining column of `data`.
# `data` itself is left unmodified (drop returns a new frame).
y = data['Outcome']
X = data.drop(columns=['Outcome'])


In [15]:
# Train-test split -- performed BEFORE oversampling.
# Oversampling the full dataset first leaks information: SMOTE builds synthetic
# rows by interpolating between existing rows, so rows that later land in the
# test set would shape the training data, and the test set itself would contain
# synthetic rows. Splitting first keeps X_test / y_test made of real samples only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the classes in the training portion only.
# X_res / y_res are kept as names for the resampled data; X_train / y_train are
# rebound to them because the following cells train on X_train / y_train.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

In [17]:
# Standardize features: learn the scaling statistics from the training data only,
# then apply the same fitted scaler to both the training and the test features.
# NOTE: X_train / X_test are overwritten with their scaled versions, so re-running
# this cell without re-running the split would fit the scaler on scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
import os

import joblib

# Ensure the output folder exists (no error if it is already there).
os.makedirs('saved_models', exist_ok=True)

# Persist the fitted scaler; joblib.dump is left as the cell's last expression
# so the list of written file names is displayed as the cell output.
scaler_path = 'saved_models/diabetes_scaler.pkl'
joblib.dump(scaler, scaler_path)



['saved_models/diabetes_scaler.pkl']

In [22]:
# Hyperparameter grid searched for the random forest:
# 2 * 3 * 2 * 2 * 2 = 48 parameter combinations in total.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

In [23]:
# Model and tuning: exhaustive search over `param_grid_rf` for a random forest,
# scored with 5-fold stratified cross-validation. n_jobs=-1 runs the fits in
# parallel and verbose=2 logs progress for each fit.
cv_strategy = StratifiedKFold(n_splits=5)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=cv_strategy, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [24]:
# Best model: the estimator exposed by the fitted grid search as `best_estimator_`,
# i.e. the random forest refit with the best-scoring parameter combination.
best_rf = grid_search_rf.best_estimator_

In [25]:
# Persist the tuned model with joblib into the saved_models/ folder (the same
# folder the scaler was written to). The file uses a .sav extension.
joblib.dump(best_rf, 'saved_models/diabetes_model.sav')

['saved_models/diabetes_model.sav']

In [26]:
import pickle

In [27]:
# Also write a pickle copy of the tuned model into the current working directory.
# The `with` block guarantees the file handle is closed (and the bytes flushed to
# disk) as soon as the dump finishes, even if pickling raises.
filename = 'diabetes_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_rf, model_file)