In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFE


In [2]:
pip install xgboost


Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Step 1: Load the data
data = pd.read_csv(r'd:\Downloads\DiseasePredictions-Web-application-with-Flask-main\DataSet\parkinsons.csv')

# Step 2: Separate features and target
X = data.drop(['name', 'status'], axis=1)
y = data['status']

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Feature Selection using RFE
rfe_selector = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)
X_train_rfe = rfe_selector.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe_selector.transform(X_test_scaled)

# Step 6: Define hyperparameter grid for XGBClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Step 7: Initialize XGBoost classifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Step 8: Grid Search for best parameters
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_rfe, y_train)

# Step 9: Train the best model
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train_rfe, y_train)

# Step 10: Make predictions
y_pred = best_xgb.predict(X_test_rfe)

# Step 11: Evaluate
print(f"\n✅ Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred))
print("\n🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n🔧 Best Parameters:", grid_search.best_params_)

# Step 12: Save the model and scaler
os.makedirs('saved_models', exist_ok=True)
joblib.dump(best_xgb, 'saved_models/parkinsons_model.sav')
joblib.dump(scaler, 'saved_models/parkinsons_scaler.pkl')
joblib.dump(rfe_selector, 'saved_models/parkinsons_rfe.pkl')

print("\n💾 Model, scaler, and RFE selector saved successfully!")


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Accuracy: 0.90

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.91      0.97      0.94        32

    accuracy                           0.90        39
   macro avg       0.86      0.77      0.80        39
weighted avg       0.89      0.90      0.89        39


🧮 Confusion Matrix:
[[ 4  3]
 [ 1 31]]

🔧 Best Parameters: {'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 200}

💾 Model, scaler, and RFE selector saved successfully!


In [5]:
import pickle

In [6]:
filename = 'parkinsons_model.sav'
pickle.dump(xgb, open(filename, 'wb'))