In [1]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load dataset: the CSV path is relative, so it is resolved against the
# notebook's current working directory.
dataset_path = 'alzheimers_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

In [3]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the diagnosis label.
target_column = 'Alzheimer’s Diagnosis'
X = df.drop(columns=[target_column])
y = df[target_column].map({'No': 0, 'Yes': 1})  # convert to binary

# Series.map turns every label that is missing from the mapping into NaN.
# Fail loudly here instead of letting NaN targets flow into the stratified
# split and, later, into model fitting.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, target_column].unique().tolist()
    raise ValueError(
        f"Unexpected values in '{target_column}': {unexpected_labels}; "
        "expected only 'No' or 'Yes'"
    )

# Early train-test split (80% train, 20% test). Splitting before any
# preprocessing is fitted keeps the test rows out of imputation/scaling
# statistics; stratify=y keeps the class ratio the same in both parts and
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 59426 samples
Test set size: 14857 samples


In [4]:
# Report the dimensions of every split produced above, one line per object.
split_shape_report = [
    f'Shape of X_Train {X_train.shape}',
    f'Shape of X_Test {X_test.shape}',
    f'Shape of Y_Train {y_train.shape}',
    f'Shape of Y_Test {y_test.shape}',
]
for report_line in split_shape_report:
    print(report_line)

Shape of X_Train (59426, 24)
Shape of X_Test (14857, 24)
Shape of Y_Train (59426,)
Shape of Y_Test (14857,)


In [5]:
# Columns treated as numeric by the preprocessing steps.
num_features = [
    'Age',
    'BMI',
    'Education Level',
    'Cognitive Test Score',
]

# Numeric columns that additionally get scaled or unskewed (typically those
# with wider ranges or skewed distributions). Adjust based on your EDA if needed.
features_to_scale = [
    'BMI',
    'Cognitive Test Score',
]

# Columns treated as categorical by the preprocessing steps.
cat_features = [
    'Country',
    'Gender',
    'Physical Activity Level',
    'Smoking Status',
    'Alcohol Consumption',
    'Diabetes',
    'Hypertension',
    'Cholesterol Level',
    'Family History of Alzheimer’s',
    'Depression Level',
    'Sleep Quality',
    'Dietary Habits',
    'Air Pollution Exposure',
    'Employment Status',
    'Marital Status',
    'Genetic Risk Factor (APOE-ε4 allele)',
    'Social Engagement Level',
    'Income Level',
    'Stress Levels',
    'Urban vs Rural Living',
]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, PowerTransformer, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# Numeric columns that only need missing values filled (no scaling, no
# unskewing): mean imputation is the single step.
num_pipeline1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Numeric columns that are mean-imputed and then scaled with RobustScaler.
num_pipeline2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Numeric columns that are mean-imputed and then unskewed with a Yeo-Johnson
# power transform.
num_pipeline3 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('unskewer', PowerTransformer(method='yeo-johnson')),
])

# Categorical columns: fill gaps with the most frequent value, then map each
# category to an integer code. Categories that were not seen during fitting
# are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', ordinal_encoder),
])


In [7]:
# Preprocessing pipeline for featureset v1 (impute + scale)
preprocessor1 = ColumnTransformer(
    transformers=[
        ('num1', num_pipeline1, list(set(num_features) - set(features_to_scale))),
        ('num2', num_pipeline2, features_to_scale),
        ('cat', cat_pipeline, cat_features)
    ],
    remainder='passthrough'
)

# Preprocessing pipeline for featureset v2 (impute + unskew)
preprocessor2 = ColumnTransformer(
    transformers=[
        ('num1', num_pipeline1, list(set(num_features) - set(features_to_scale))),
        ('num2', num_pipeline3, features_to_scale),
        ('cat', cat_pipeline, cat_features)
    ],
    remainder='passthrough'
)


In [8]:
# Candidate models: each classifier (SVC, AdaBoost) is paired with each
# feature-set preprocessor (v1 = impute + scale, v2 = impute + unskew).
# The step names 'preprocessor' and 'classifier' are what the
# 'classifier__...' keys of the hyper-parameter grids refer to.
#
# AdaBoost is given a fixed random_state (the same seed as the train/test
# split) so that repeated runs of the notebook produce the same ensemble.
pipeline1 = Pipeline([
    ('preprocessor', preprocessor1),
    ('classifier', SVC())
])
pipeline2 = Pipeline([
    ('preprocessor', preprocessor1),
    ('classifier', AdaBoostClassifier(random_state=42))
])
pipeline3 = Pipeline([
    ('preprocessor', preprocessor2),
    ('classifier', SVC())
])
pipeline4 = Pipeline([
    ('preprocessor', preprocessor2),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# Registry used to look up each pipeline (and its parameter grid) by name.
pipelines = {
    'svc_v1_pipeline': pipeline1,
    'ada_v1_pipeline': pipeline2,
    'svc_v2_pipeline': pipeline3,
    'ada_v2_pipeline': pipeline4
}


In [9]:
# The four candidate pipelines and the `pipelines` registry are already built
# in the previous cell; re-creating identical objects here only shadowed them.
# This cell now just sanity-checks that registry before the grid search.
expected_pipeline_names = {
    'svc_v1_pipeline',
    'ada_v1_pipeline',
    'svc_v2_pipeline',
    'ada_v2_pipeline',
}
missing_pipeline_names = expected_pipeline_names - set(pipelines)
assert not missing_pipeline_names, f"Missing pipelines: {sorted(missing_pipeline_names)}"

# The hyper-parameter grids address the final step as 'classifier__...', so
# every pipeline must expose exactly these step names.
for registered_name, registered_pipeline in pipelines.items():
    step_names = [step_name for step_name, _ in registered_pipeline.steps]
    assert step_names == ['preprocessor', 'classifier'], (
        f"{registered_name} has unexpected steps: {step_names}"
    )


In [10]:
# Hyper-parameter grid for the SVC pipelines.
# It is split into two sub-grids because `gamma` is only used by the 'rbf' and
# 'poly' kernels here: crossing it with the linear kernel would fit every
# linear candidate twice with identical results. 5 + 5*2*2 = 25 candidates
# instead of 30.
svc_c_values = [0.01, 0.1, 1, 10, 100]
param_grid1 = [
    {
        'classifier__C': svc_c_values,
        'classifier__kernel': ['linear']
    },
    {
        'classifier__C': svc_c_values,
        'classifier__kernel': ['rbf', 'poly'],
        'classifier__gamma': ['scale', 'auto']
    }
]

# Hyper-parameter grid for the AdaBoost pipelines (4 * 4 = 16 candidates).
param_grid2 = [
    {
        'classifier__n_estimators': [50, 100, 200, 500],
        'classifier__learning_rate': [0.01, 0.1, 1.0, 10],
        'classifier__algorithm': ['SAMME']
    }
]

# Grid to use for each entry of `pipelines`; keys must match the pipeline names.
param_grids = {
    'svc_v1_pipeline': param_grid1,
    'ada_v1_pipeline': param_grid2,
    'svc_v2_pipeline': param_grid1,
    'ada_v2_pipeline': param_grid2
}


In [None]:
# Scorer used for model selection: F1 computed with average='weighted'.
f1_score_weighted = make_scorer(f1_score, average='weighted')

# Best refitted pipeline per candidate, keyed by pipeline name.
best_estimators = {}
for pipeline_name in pipelines:
    pipeline = pipelines[pipeline_name]
    print(f"Running GridSearchCV for {pipeline_name}...")

    # 5-fold cross-validated search over this pipeline's own grid, with
    # n_jobs=-1 so the fits are spread over all available processors.
    grid_search = GridSearchCV(
        pipeline,
        param_grids[pipeline_name],
        scoring=f1_score_weighted,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Keep the winning configuration and report how it did in cross-validation.
    best_estimators[pipeline_name] = grid_search.best_estimator_
    print(f"Best Parameters for {pipeline_name}: {grid_search.best_params_}")
    print(f"Best Cross-Validated Score: {grid_search.best_score_}")

    # Score the refitted best estimator on the held-out test split.
    y_pred = grid_search.predict(X_test)
    print(f"Test F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
    print("")


Running GridSearchCV for svc_v1_pipeline...
