In [6]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Read the raw data from the CSV file. The path is relative, so it is resolved
# against the notebook's current working directory.
DATASET_PATH = 'alzheimers_prediction_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [15]:
from sklearn.model_selection import train_test_split

# Label column. NOTE: the apostrophe is the typographic ’ (U+2019), not an ASCII ';
# the lookup below only works if it matches the column name in `df` exactly.
TARGET_COL = 'Alzheimer’s Diagnosis'
LABEL_TO_INT = {'No': 0, 'Yes': 1}  # binary encoding of the label
TEST_SIZE = 0.2  # fraction of rows held out for testing
SEED = 42  # fixed seed so the split is reproducible

# Separate the features from the label column
X = df.drop(columns=[TARGET_COL])

# Encode the label as 0/1. Series.map() silently turns every value that is not a
# key of the mapping (including missing values) into NaN, so check for that
# explicitly instead of letting NaN labels leak into the stratified split.
y = df[TARGET_COL].map(LABEL_TO_INT)
unmapped = df.loc[y.isna(), TARGET_COL]
if not unmapped.empty:
    raise ValueError(
        f"Unexpected or missing values in {TARGET_COL!r}: "
        f"{unmapped.unique().tolist()[:10]} "
        f"({len(unmapped)} rows); expected only {list(LABEL_TO_INT)}"
    )

# Early train-test split (80% train, 20% test), stratified on the label so both
# parts keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


In [17]:
# Report the dimensions of every part of the split (features and labels).
# NOTE(review): the output saved with this cell shows 55712 training rows and
# 18571 test rows (roughly 75/25), which does not match the 20% test split
# requested in the split cell — the saved output looks stale; re-run to refresh.
for split_label, split_data in (
    ('X_Train', X_train),
    ('X_Test', X_test),
    ('Y_Train', y_train),
    ('Y_Test', y_test),
):
    print(f'Shape of {split_label} {split_data.shape}')

Shape of X_Train (55712, 24)
Shape of X_Test (18571, 24)
Shape of Y_Train (55712,)
Shape of Y_Test (18571,)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer


def _numeric_pipeline(*extra_steps):
    """Return a Pipeline that mean-imputes and then applies ``extra_steps``.

    Each extra step is a ``(name, transformer)`` tuple appended after the
    imputer. A new SimpleImputer is created on every call, so the returned
    pipelines do not share estimator instances.
    """
    return Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), *extra_steps])


# Numeric variant 1: fill missing values with the column mean, nothing else
num_pipeline1 = _numeric_pipeline()

# Numeric variant 2: mean imputation followed by RobustScaler
num_pipeline2 = _numeric_pipeline(('scaler', RobustScaler()))

# Numeric variant 3: mean imputation followed by a Yeo-Johnson power transform
num_pipeline3 = _numeric_pipeline(
    ('unskewer', PowerTransformer(method='yeo-johnson')),
)

# Categorical features: fill gaps with the most frequent value, then encode each
# category as an ordinal. Categories not seen during fit are encoded as -1.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])


In [None]:
from sklearn.compose import ColumnTransformer

# Feature lists.
# TODO: the four lists below are still placeholders (each literally contains
# `...`); fill them with real column names before fitting either preprocessor,
# otherwise column selection will fail at fit time.
num_features = [...]  # replace with actual numerical features
features_to_scale = [...]  # replace with numerical features that need scaling
features_to_unskew = [...]  # replace with numerical features that need unskewing
cat_features = [...]  # replace with categorical features


def _without(features, excluded):
    """Return ``features`` minus ``excluded``, de-duplicated, in original order.

    This replaces ``list(set(features) - set(excluded))``: iterating a set of
    strings gives an order that can change between interpreter sessions (hash
    randomisation), which would make the column order produced by the
    ColumnTransformer differ from one kernel restart to the next.
    """
    excluded = set(excluded)
    return [name for name in dict.fromkeys(features) if name not in excluded]


def _make_preprocessor(special_pipeline, special_features):
    """Build a ColumnTransformer with the three branches 'num1', 'num2', 'cat'.

    Parameters
    ----------
    special_pipeline : Pipeline
        Pipeline applied to ``special_features`` (branch 'num2').
    special_features : list
        Numerical columns that get ``special_pipeline``; every other entry of
        ``num_features`` is only imputed via ``num_pipeline1`` (branch 'num1').

    Returns
    -------
    ColumnTransformer
        Unfitted transformer; ``cat_features`` go through ``cat_pipeline``.
    """
    return ColumnTransformer([
        ('num1', num_pipeline1, _without(num_features, special_features)),  # impute only
        ('num2', special_pipeline, special_features),
        ('cat', cat_pipeline, cat_features),
    ])


# Feature set 1: Imputation + Scaling
preprocessor1 = _make_preprocessor(num_pipeline2, features_to_scale)

# Feature set 2: Imputation + Unskewing
preprocessor2 = _make_preprocessor(num_pipeline3, features_to_unskew)
