In [2]:
# Import essential Python libraries for data handling, model training, and evaluation
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load the Parkinson’s dataset and drop the 'name' column which is not useful for prediction
df = pd.read_csv("../data/parkinsons.csv")
df.drop(columns=['name'], inplace=True)


In [4]:
# Define the 11 most important features (selected from previous model's feature importance)
selected_features = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
    'MDVP:Jitter(%)', 'MDVP:Shimmer', 'NHR',
    'HNR', 'RPDE', 'DFA', 'spread1', 'PPE'
]
X = df[selected_features]
y = df['status']


In [5]:
# Split the dataset into 80% training and 20% testing data, while maintaining class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=43
)


In [6]:
# Normalize the features so they all contribute equally to the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
# Train an XGBoost model using the scaled training data
model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=43
)
model.fit(X_train_scaled, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [15]:
# Predict and evaluate the model performance using accuracy and classification report
y_pred = model.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


✅ Accuracy: 0.8717948717948718
              precision    recall  f1-score   support

           0     0.7778    0.7000    0.7368        10
           1     0.9000    0.9310    0.9153        29

    accuracy                         0.8718        39
   macro avg     0.8389    0.8155    0.8260        39
weighted avg     0.8687    0.8718    0.8695        39



In [16]:
# Save the model and scaler to .pkl files for use in Streamlit or other apps
pickle.dump(model, open("../models/parkinsons_model.pkl", "wb"))
pickle.dump(scaler, open("../models/parkinsons_scaler.pkl", "wb"))
