In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso, ElasticNet
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

In [2]:
%cd Z:\PPMI_Data\Excels\CollaborativeFiltering
NonMot = pd.read_csv('Feats45_unCategAge_APPRDX.csv')
NonMot1 = NonMot.drop(['PATNO', 'Patient_ID', 'Age'], axis = 1)
NonMot1['APPRDX'] = NonMot1['APPRDX'].replace([1], 'Patient')
NonMot1['APPRDX'] = NonMot1['APPRDX'].replace([2], 'Healthy')
NonMot1.head(2)

Z:\PPMI_Data\Excels\CollaborativeFiltering


Unnamed: 0,APPRDX,Anxiety,Apathy,Benton,Clock,Cognition,COGSTATE,Constipate,Depress,DopaDefic,...,Semantic,SleepDay,SleepNight,STAIA,STAIS,Symbol_Digit,Trail_Making_A,Trail_Making_B,UPSIT,Urine
0,Patient,1,0,12.16,7,1,1,1,1,0,...,57,1,0,45,59,47.5,110,204,17,1
1,Patient,1,1,7.76,6,1,1,0,0,0,...,36,1,3,40,39,52.0,27,52,9,0


In [6]:

# Hold-out evaluation of a voting ensemble and a stacking ensemble.
#
# Separate the features from the APPRDX label, then keep 20% of the rows aside for testing.
# NOTE(review): the split is not stratified; if the two classes are imbalanced,
# consider passing stratify=y so both partitions keep the same class ratio.
X = NonMot1.drop('APPRDX', axis=1)
y = NonMot1['APPRDX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Base learners shared by both ensembles, as (name, estimator) pairs.
# 'lasso' and 'enet' are L1- and elastic-net-penalised logistic regressions.
# NOTE(review): features are fed to the linear SVC and the saga solver unscaled —
# confirm that this is intended.
#
# The AdaBoost weak learner is passed as the first positional argument on purpose:
# scikit-learn renamed the `base_estimator` keyword to `estimator` in 1.2 and removed the
# old spelling in 1.4, so either keyword fails on some release, whereas the positional
# form is accepted by all of them.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2,
                                  min_samples_leaf=1, max_features='sqrt', random_state=1)),
    ('svc', SVC(kernel='linear', probability=True, random_state=1)),
    ('lasso', LogisticRegression(penalty='l1', solver='liblinear', max_iter=500, random_state=1)),
    ('enet', LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000,
                                l1_ratio=0.5, random_state=1)),
    ('ada', AdaBoostClassifier(RandomForestClassifier(max_depth=2), n_estimators=50,
                               learning_rate=0.1, random_state=1)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=1)),
    ('xgb', XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, random_state=1)),
    ('bagging', BaggingClassifier(n_estimators=100, max_samples=0.5, max_features=0.5,
                                  random_state=1)),
]

# Voting ensemble over the base learners (VotingClassifier left on its default settings).
voting_model = VotingClassifier(estimators=base_models)

# Stacking ensemble: a default LogisticRegression combines the base learners' outputs.
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())

# Fit the voting ensemble on the training split and report accuracy on the held-out rows.
voting_model.fit(X_train, y_train)
voting_predictions = voting_model.predict(X_test)
voting_accuracy = accuracy_score(y_test, voting_predictions)
print(f'Voting ensemble accuracy: {voting_accuracy:.4f}')

# Same procedure for the stacking ensemble.
stacking_model.fit(X_train, y_train)
stacking_predictions = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_predictions)
print(f'Stacking ensemble accuracy: {stacking_accuracy:.4f}')

Voting ensemble accuracy: 0.8400
Stacking ensemble accuracy: 0.8600


In [7]:

# 10-fold cross-validated accuracy of the voting and stacking ensembles on the full table.
#
# Features are every column except the APPRDX label.
X = NonMot1.drop('APPRDX', axis=1)
y = NonMot1['APPRDX']

# Base learners shared by both ensembles, as (name, estimator) pairs.
#
# The AdaBoost weak learner is passed as the first positional argument on purpose:
# scikit-learn renamed the `base_estimator` keyword to `estimator` in 1.2 and removed the
# old spelling in 1.4, so either keyword fails on some release, whereas the positional
# form is accepted by all of them.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2,
                                  min_samples_leaf=1, max_features='sqrt', random_state=1)),
    ('svc', SVC(kernel='linear', probability=True, random_state=1)),
    ('lasso', LogisticRegression(penalty='l1', solver='liblinear', max_iter=500, random_state=1)),
    ('enet', LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000,
                                l1_ratio=0.5, random_state=1)),
    ('ada', AdaBoostClassifier(RandomForestClassifier(max_depth=2), n_estimators=50,
                               learning_rate=0.1, random_state=1)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=1)),
    ('xgb', XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, random_state=1)),
    ('bagging', BaggingClassifier(n_estimators=100, max_samples=0.5, max_features=0.5,
                                  random_state=1)),
]

# Voting ensemble over the base learners (VotingClassifier left on its default settings).
voting_model = VotingClassifier(estimators=base_models)

# Stacking ensemble: a default LogisticRegression combines the base learners' outputs.
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())

# Shuffled K-fold splitter with a fixed seed, so both ensembles are scored on the same folds.
# NOTE(review): plain KFold does not preserve the class ratio within each fold; if the
# classes are imbalanced, StratifiedKFold may be the better choice — confirm.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=1)

# Per-fold accuracy of the voting ensemble, summarised as mean and standard deviation.
voting_scores = cross_val_score(voting_model, X, y, cv=kfold, scoring='accuracy')
print(f'Voting ensemble accuracy (mean): {voting_scores.mean():.4f}')
print(f'Voting ensemble accuracy (std): {voting_scores.std():.4f}')

# Same summary for the stacking ensemble.
stacking_scores = cross_val_score(stacking_model, X, y, cv=kfold, scoring='accuracy')
print(f'Stacking ensemble accuracy (mean): {stacking_scores.mean():.4f}')
print(f'Stacking ensemble accuracy (std): {stacking_scores.std():.4f}')


Voting ensemble accuracy (mean): 0.8360
Voting ensemble accuracy (std): 0.0656
Stacking ensemble accuracy (mean): 0.8440
Stacking ensemble accuracy (std): 0.0550


# Compare three choices of final estimator for the stacking ensemble, all built on the
# same base learners and scored on the same hold-out split.

def _fit_and_score(model, train_X, train_y, test_X, test_y):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Returns a ``(predictions, accuracy)`` tuple, where ``predictions`` is the result of
    ``model.predict(test_X)`` and ``accuracy`` is ``accuracy_score(test_y, predictions)``.
    """
    model.fit(train_X, train_y)
    predicted = model.predict(test_X)
    return predicted, accuracy_score(test_y, predicted)


# Final estimator: default logistic regression.
stacking_model_lr = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
)
stacking_predictions_lr, stacking_accuracy_lr = _fit_and_score(
    stacking_model_lr, X_train, y_train, X_test, y_test
)
print(f'Stacking ensemble accuracy with Logistic Regression final estimator: {stacking_accuracy_lr:.4f}')

# Final estimator: random forest (100 trees, depth limited to 5, fixed seed).
stacking_model_rf = StackingClassifier(
    estimators=base_models,
    final_estimator=RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
)
stacking_predictions_rf, stacking_accuracy_rf = _fit_and_score(
    stacking_model_rf, X_train, y_train, X_test, y_test
)
print(f'Stacking ensemble accuracy with Random Forest final estimator: {stacking_accuracy_rf:.4f}')

# Final estimator: SVC with probability estimates enabled and a fixed seed.
stacking_model_svm = StackingClassifier(
    estimators=base_models,
    final_estimator=SVC(probability=True, random_state=1),
)
stacking_predictions_svm, stacking_accuracy_svm = _fit_and_score(
    stacking_model_svm, X_train, y_train, X_test, y_test
)
print(f'Stacking ensemble accuracy with SVM final estimator: {stacking_accuracy_svm:.4f}')