In [2]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, RobustScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [3]:
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='alzheimers_prediction_dataset.csv')

In [4]:
# Name of the label column, spelled once so the feature/label split cannot drift apart.
# The header uses a typographic apostrophe (’), not a plain ASCII one.
TARGET_COL = 'Alzheimer’s Diagnosis'

# Features are every column except the label; the label is encoded as 0 (No) / 1 (Yes)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].map({'No': 0, 'Yes': 1})  # convert to binary

# .map() silently turns any label other than 'No'/'Yes' into NaN, which would then
# leak into the stratified split and the model fits. Fail here with a clear message.
if y.isna().any():
    unexpected_labels = df.loc[y.isna(), TARGET_COL].unique().tolist()
    raise ValueError(
        f"Unexpected values in target column {TARGET_COL!r}: {unexpected_labels}"
    )

# Early train-test split (80% train, 20% test), stratified so both parts keep
# the same class balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 59426 samples
Test set size: 14857 samples


In [5]:
# Report the dimensions of each part of the split, one line per part
for part_label, part in (
    ('X_Train', X_train),
    ('X_Test', X_test),
    ('Y_Train', y_train),
    ('Y_Test', y_test),
):
    print(f'Shape of {part_label} {part.shape}')

Shape of X_Train (59426, 24)
Shape of X_Test (14857, 24)
Shape of Y_Train (59426,)
Shape of Y_Test (14857,)


In [6]:
# Columns treated as numerical; every other column of X_train is treated as categorical
num_features = ['Age', 'Education Level', 'BMI', 'Cognitive Test Score']

# Be careful with apostrophes and exact spellings: a misspelt name here would not match
# any column, and the real column would silently end up in cat_features below.
missing_num_features = [col for col in num_features if col not in X_train.columns]
if missing_num_features:
    raise KeyError(f"Numerical features not found in X_train: {missing_num_features}")

cat_features = [col for col in X_train.columns if col not in num_features]

# Features that may need scaling based on EDA (update these based on your actual EDA findings)
features_to_scale = ['Age', 'BMI']

# The rest of the numerical features. Built with a comprehension rather than a set
# difference: set iteration order for strings can change between kernel restarts,
# which would make the column order (and anything depending on it) non-reproducible.
num_features_no_scale = [col for col in num_features if col not in features_to_scale]


In [7]:
def _mean_imputer():
    """Return a new SimpleImputer that fills missing values with the column mean."""
    return SimpleImputer(strategy='mean')


# Numerical branch 1: impute only, values keep their original scale
num_pipeline1 = Pipeline(steps=[('imputer', _mean_imputer())])

# Numerical branch 2: impute, then scale with RobustScaler
num_pipeline2 = Pipeline(steps=[
    ('imputer', _mean_imputer()),
    ('scaler', RobustScaler()),
])

# Numerical branch 3: impute, then reduce skew with a Yeo-Johnson power transform
num_pipeline3 = Pipeline(steps=[
    ('imputer', _mean_imputer()),
    ('unskewer', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time; output is a dense array.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])



In [8]:
def _build_preprocessor(scaled_num_pipeline):
    """Build the ColumnTransformer shared by both feature-set variants.

    Parameters
    ----------
    scaled_num_pipeline : Pipeline
        Pipeline applied to the columns listed in ``features_to_scale``.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with three branches: ``num1`` (impute only),
        ``num2`` (``scaled_num_pipeline``) and ``cat`` (categorical pipeline).
        Columns not named in any branch are passed through unchanged.
    """
    # Take the column order from num_features instead of iterating a set:
    # set order for strings can differ between kernel restarts, which would
    # shuffle the transformed columns and make seeded models non-reproducible.
    impute_only_features = [col for col in num_features if col in num_features_no_scale]

    return ColumnTransformer(
        transformers=[
            ('num1', num_pipeline1, impute_only_features),
            ('num2', scaled_num_pipeline, features_to_scale),
            ('cat', cat_pipeline, cat_features),
        ],
        remainder='passthrough',
    )


# Preprocessing pipeline for featureset v1 (impute + scale)
preprocessor1 = _build_preprocessor(num_pipeline2)

# Preprocessing pipeline for featureset v2 (impute + unskew)
preprocessor2 = _build_preprocessor(num_pipeline3)


In [9]:
def _assemble(preprocessor, classifier):
    """Chain a preprocessor and a classifier as the 'preprocessor' and 'classifier' steps."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])


# Two classifiers crossed with two preprocessing variants gives four candidate pipelines.
# NOTE(review): probability=True makes every SVC fit run an additional internal
# cross-validation; nothing in the cells shown here calls predict_proba - confirm it is needed.
pipeline1 = _assemble(preprocessor1, SVC(probability=True))
pipeline2 = _assemble(preprocessor1, RandomForestClassifier(random_state=42))
pipeline3 = _assemble(preprocessor2, SVC(probability=True))
pipeline4 = _assemble(preprocessor2, RandomForestClassifier(random_state=42))

# Lookup of candidate pipelines by name (insertion order is the order they get evaluated in)
pipelines = dict(
    svc_v1_pipeline=pipeline1,
    rf_v1_pipeline=pipeline2,
    svc_v2_pipeline=pipeline3,
    rf_v2_pipeline=pipeline4,
)


In [10]:
# Parameter grids for GridSearchCV, keyed by pipeline name further down.
# Keys use the '<step>__<param>' form and target the 'classifier' pipeline step.

# The SVC grid is split per kernel: gamma is a kernel coefficient that the linear
# kernel does not use, so crossing it with 'linear' only refits identical models.
# Splitting gives 4 (linear) + 8 (rbf) = 12 candidates instead of 16.
param_grid_svc = [
    {'classifier__C': [0.1, 1, 10, 100],
     'classifier__kernel': ['linear']},
    {'classifier__C': [0.1, 1, 10, 100],
     'classifier__kernel': ['rbf'],
     'classifier__gamma': ['scale', 'auto']}
]

# Random forest: number of trees, maximum tree depth (None = unlimited) and the
# minimum number of samples required to split a node.
param_grid_rf = [
    {'classifier__n_estimators': [100, 200, 300],
     'classifier__max_depth': [None, 10, 20],
     'classifier__min_samples_split': [2, 5, 10]}
]

# Both preprocessing variants of a model share the same grid
param_grids = {
    'svc_v1_pipeline': param_grid_svc,
    'rf_v1_pipeline': param_grid_rf,
    'svc_v2_pipeline': param_grid_svc,
    'rf_v2_pipeline': param_grid_rf
}


In [None]:
def f1_score_weighted(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` using ``average='weighted'``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Weighted-average F1 score as computed by ``sklearn.metrics.f1_score``.
    """
    return f1_score(y_true, y_pred, average='weighted')

# Loop through each pipeline and perform GridSearchCV
best_estimators = {}
for pipeline_name, pipeline in pipelines.items():
    print(f"Running GridSearchCV for {pipeline_name}...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[pipeline_name],
        cv=5,                    # 5-fold cross-validation on the training split only
        scoring='f1_weighted',   # same metric as f1_score_weighted, used for model selection
        n_jobs=-1                # use all available CPU cores
    )
    grid_search.fit(X_train, y_train)

    # Store the best estimator and results for each pipeline
    best_estimators[pipeline_name] = grid_search.best_estimator_
    print(f"Best Parameters for {pipeline_name}: {grid_search.best_params_}")
    print(f"Best Cross-Validated Score for {pipeline_name}: {grid_search.best_score_:.4f}")

    # Evaluate the refitted best estimator on the held-out test split,
    # reusing the helper so the metric is spelled in exactly one place.
    y_pred = grid_search.best_estimator_.predict(X_test)
    print(f"F1 Score on test set: {f1_score_weighted(y_test, y_pred):.4f}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    print("-" * 50)


Running GridSearchCV for svc_v1_pipeline...
