In [1]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, RobustScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [2]:
# Read the raw Alzheimer's prediction table from the CSV next to the notebook
df = pd.read_csv(filepath_or_buffer='alzheimers_prediction_dataset.csv')

In [3]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the diagnosis label.
# NOTE: the column name contains a typographic apostrophe (’), not an ASCII one.
X = df.drop(columns=['Alzheimer’s Diagnosis'])
y = df['Alzheimer’s Diagnosis'].map({'No': 0, 'Yes': 1})  # convert to binary

# `Series.map` turns every label that is missing from the mapping into NaN.
# Fail fast here instead of letting NaN targets reach the stratified split
# or the classifiers further down.
if y.isna().any():
    unexpected_labels = df.loc[y.isna(), 'Alzheimer’s Diagnosis'].unique().tolist()
    raise ValueError(
        f"Unexpected values in the diagnosis column (expected 'No'/'Yes'): {unexpected_labels}"
    )

# Early train-test split (80% train, 20% test), stratified on the label so
# both splits keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 59426 samples
Test set size: 14857 samples


In [4]:
# Report the dimensions of every split produced by train_test_split
for split_label, split_data in [
    ('X_Train', X_train),
    ('X_Test', X_test),
    ('Y_Train', y_train),
    ('Y_Test', y_test),
]:
    print(f'Shape of {split_label} {split_data.shape}')

Shape of X_Train (59426, 24)
Shape of X_Test (14857, 24)
Shape of Y_Train (59426,)
Shape of Y_Test (14857,)


In [5]:
# Column groups consumed by the preprocessing pipelines.

# Every object-dtype column of the feature frame is treated as categorical.
cat_features = list(X.select_dtypes(include='object').columns)

# Numeric columns handled explicitly by the preprocessors.
num_features = [
    'Age',
    'Education Level',
    'BMI',
    'Cognitive Test Score',
]

# Subset of the numeric columns that goes through the scaler / power transform;
# the remaining numeric columns are passed through unchanged.
features_to_scale = [
    'Age',
    'Cognitive Test Score',
]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, PowerTransformer, OrdinalEncoder

# Numeric columns that are forwarded unscaled.
# Built with an order-preserving comprehension instead of a set difference:
# iteration order of a set of strings changes between interpreter sessions
# (hash randomization), which would shuffle the column order handed to the
# classifiers and make seeded runs non-reproducible across kernel restarts.
num_passthrough_features = [
    col for col in num_features if col not in features_to_scale
]

# V1: RobustScaler
num_pipeline_v1 = Pipeline([
    ('scaler', RobustScaler())
])

# V2: PowerTransformer (Yeo-Johnson)
num_pipeline_v2 = Pipeline([
    ('unskewer', PowerTransformer(method='yeo-johnson'))
])

# Categorical encoding (Ordinal); categories not seen during fit are encoded as -1
cat_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Preprocessor for V1: robust-scaled numerics + untouched numerics + ordinal categoricals
preprocessor1 = ColumnTransformer([
    ('num_scaled', num_pipeline_v1, features_to_scale),
    ('num_passthrough', 'passthrough', num_passthrough_features),
    ('cat', cat_pipeline, cat_features)
])

# Preprocessor for V2: power-transformed numerics + untouched numerics + ordinal categoricals
preprocessor2 = ColumnTransformer([
    ('num_unskewed', num_pipeline_v2, features_to_scale),
    ('num_passthrough', 'passthrough', num_passthrough_features),
    ('cat', cat_pipeline, cat_features)
])



In [7]:
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier

# Boosting pipelines: each classifier is paired with both preprocessing variants
# (v1 = preprocessor1, v2 = preprocessor2). All classifiers are seeded with 42.

# --- AdaBoost ---
pipeline_ada_v1 = Pipeline(steps=[
    ('preprocessor', preprocessor1),
    ('classifier', AdaBoostClassifier(random_state=42)),
])
pipeline_ada_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor2),
    ('classifier', AdaBoostClassifier(random_state=42)),
])

# --- CatBoost (silent=True suppresses its per-iteration training log) ---
pipeline_cat_v1 = Pipeline(steps=[
    ('preprocessor', preprocessor1),
    ('classifier', CatBoostClassifier(silent=True, random_state=42)),
])
pipeline_cat_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor2),
    ('classifier', CatBoostClassifier(silent=True, random_state=42)),
])

In [8]:
# SVC / RandomForest pipelines, each paired with both preprocessing variants.
# The SVCs are seeded like every other classifier in this notebook: with
# probability=True, SVC shuffles the data internally to fit its probability
# estimates, so leaving random_state unset makes those estimates vary run to run.
pipeline1 = Pipeline([
    ('preprocessor', preprocessor1),
    ('classifier', SVC(probability=True, random_state=42))
])

pipeline2 = Pipeline([
    ('preprocessor', preprocessor1),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline3 = Pipeline([
    ('preprocessor', preprocessor2),
    ('classifier', SVC(probability=True, random_state=42))
])

pipeline4 = Pipeline([
    ('preprocessor', preprocessor2),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Registry of the SVC / RandomForest pipelines keyed by a descriptive name.
# NOTE(review): the GridSearchCV cell further down rebinds `pipelines` to the
# AdaBoost/CatBoost registry, so this dict is not what that search iterates over.
pipelines = {
    'svc_v1_pipeline': pipeline1,
    'rf_v1_pipeline': pipeline2,
    'svc_v2_pipeline': pipeline3,
    'rf_v2_pipeline': pipeline4
}


In [9]:
# Hyper-parameter grids for GridSearchCV. The `classifier__` prefix routes each
# parameter to the pipeline step named 'classifier'.

# AdaBoost: 3 x 3 x 1 = 9 candidates.
# NOTE(review): `algorithm` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version.
param_grid_ada = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__learning_rate=[0.01, 0.1, 1.0],
    classifier__algorithm=['SAMME'],
)

# CatBoost: 2 x 2 x 2 = 8 candidates.
param_grid_cat = dict(
    classifier__depth=[4, 6],
    classifier__iterations=[100, 200],
    classifier__learning_rate=[0.01, 0.1],
)



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# Map each model name to its (pipeline, hyper-parameter grid) pair.
pipelines = dict(
    ada_v1=(pipeline_ada_v1, param_grid_ada),
    cat_v1=(pipeline_cat_v1, param_grid_cat),
    ada_v2=(pipeline_ada_v2, param_grid_ada),
    cat_v2=(pipeline_cat_v2, param_grid_cat),
)

# Weighted F1 is used both for model selection and for the test-set report.
f1_weighted = make_scorer(f1_score, average='weighted')

# Refitted best pipeline per model name, filled in by the loop below.
best_estimators = {}

for name in pipelines:
    pipeline, grid = pipelines[name]
    print(f"Running GridSearchCV for {name}...")

    # 5-fold exhaustive search on the training split, using all CPU cores.
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring=f1_weighted,
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    best_estimators[name] = search.best_estimator_
    print(f"Best parameters for {name}: {search.best_params_}")
    print(f"Best CV F1 score: {search.best_score_:.4f}")

    # `search.predict` delegates to the refitted best estimator.
    y_pred = search.predict(X_test)
    test_f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Test F1 score: {test_f1:.4f}")
    print()


🔍 Running GridSearchCV for: svc_v1_pipeline
Fitting 5 folds for each of 16 candidates, totalling 80 fits
