In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='oasis_longitudinal.csv')
# Last expression of the cell: render the loaded frame.
data

In [None]:
# Count the missing values in each column of the freshly loaded frame.
data.isna().sum()


In [None]:
from sklearn.impute import SimpleImputer

# Replace missing 'SES' entries with the column median: learn the median
# first, then apply it to the same column.
# NOTE(review): the median is learned from every row of `data`, i.e. before
# the train/test split performed in a later cell, so held-out rows contribute
# to the fill value — confirm this is acceptable.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit(data[['SES']])
data[['SES']] = imputer.transform(data[['SES']])

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing 'MMSE' entries with the column median: learn the median
# first, then apply it to the same column.
# NOTE(review): the median is learned from every row of `data`, i.e. before
# the train/test split performed in a later cell, so held-out rows contribute
# to the fill value — confirm this is acceptable.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit(data[['MMSE']])
data[['MMSE']] = imputer.transform(data[['MMSE']])

In [None]:
# Discard every row that still contains at least one missing value.
data = data.dropna(axis='index', how='any')
# Last expression of the cell: render the remaining rows.
data

In [None]:
# Re-count missing values per column after the imputation and row drop.
data.isna().sum()


In [None]:
# Remove the 'Hand' column; it is not among the features selected later.
data = data.drop(columns=['Hand'])
# Preview the first rows of the reduced frame.
data.head()

In [None]:
# Fold the 'Converted' label into 'Demented' so only two group labels remain
# for the binary encoding done in the next cell.
data['Group'] = data['Group'].replace(to_replace='Converted', value='Demented')
# Show how many rows fall into each remaining label.
data['Group'].value_counts()


In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Gender ---------------------------------------------------------------
# LabelEncoder assigns codes 0..n-1 to the sorted unique labels, so each
# label's code is simply its position in `classes_`.
gender_encoder = LabelEncoder()
data['M/F'] = gender_encoder.fit_transform(data['M/F'])
# Use plain Python ints for the printed codes: NumPy scalars would be shown
# as e.g. `np.int64(0)` inside the dict under NumPy 2.
gender_coding = dict(zip(gender_encoder.classes_, range(len(gender_encoder.classes_))))
print(f"Gender coding: {gender_coding}")


# --- Target ---------------------------------------------------------------
group_mapping = {'Nondemented': 0, 'Demented': 1}

# Series.map() leaves NaN for every label that is missing from the mapping
# (for example when this cell is re-run on an already-encoded column), which
# would silently corrupt the target. Fail loudly instead.
unmapped_groups = sorted(
    set(data['Group'].dropna().unique()) - set(group_mapping),
    key=str,
)
if unmapped_groups:
    raise ValueError(
        f"Unexpected 'Group' labels {unmapped_groups}; expected only "
        f"{list(group_mapping)}. If the column was already encoded, re-run "
        "the notebook from the data-loading cell."
    )
data['Group'] = data['Group'].map(group_mapping)

print(f"Group coding: {group_mapping}")
print("Class 0 = Nondemented (normal)")
print("Class 1 = Demented (dementia)")

# .tolist() converts the NumPy scalars to Python numbers for a clean printout.
print(f"Unique values after encoding: {sorted(data['Group'].unique().tolist())}")

data.head()

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split

# Install a blanket "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

# Feature matrix (ten predictor columns) and binary target.
# NOTE(review): 'CDR' may be closely tied to how 'Group' is assigned — confirm
# it is a legitimate predictor and not a proxy for the label.
X_full = data[[
    'MR Delay', 'CDR', 'M/F', 'Age', 'EDUC',
    'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF',
]]
y_full = data['Group']

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): the file name suggests repeated visits per subject; if so, a
# row-level split can place the same subject in both sets — verify.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_full,
    y_full,
    stratify=y_full,
    test_size=0.2,
    random_state=42,
)

def eval_metrics(y_true, y_pred, y_proba=None):
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional scores for the positive class. When omitted, only
            the label-based metrics are returned.

    Returns:
        dict: Always holds 'accuracy', 'balanced_accuracy' and 'mcc'. When
        ``y_proba`` is given, 'roc_auc' and 'pr_auc' are added as well; a
        score whose computation raises is left out of the dict.
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'mcc': matthews_corrcoef(y_true, y_pred)
    }

    # Without scores there is nothing more to compute.
    if y_proba is None:
        return scores

    # Each score-based metric is best effort: a failure skips that key only.
    score_based = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
    )
    for key, scorer in score_based:
        try:
            scores[key] = scorer(y_true, y_proba)
        except Exception:
            continue
    return scores

def print_eval(name, y_true, y_pred, y_proba=None):
    """Print a metric summary, confusion matrix and classification report.

    Args:
        name: Label shown in the heading of the printed summary.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional positive-class scores, forwarded to ``eval_metrics``.

    Returns:
        None. Everything is written to standard output.
    """
    scores = eval_metrics(y_true, y_pred, y_proba)

    print(f"\n[{name}] Metrics:")
    for metric in scores:
        value = scores[metric]
        print(f"- {metric}: {value:.4f}")

    print("Confusion matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    print("Report:")
    report = classification_report(y_true, y_pred)
    print(report)

In [None]:
# Probe for the optional imblearn package and record the outcome in
# USE_IMBLEARN; any failure while importing counts as "not available".
try:
    import imblearn
except Exception as import_error:
    USE_IMBLEARN = False
    print("imblearn not available — proceeding WITHOUT it (using class_weight=balanced).")
    print("Reason:", repr(import_error))
else:
    USE_IMBLEARN = True
    print("imblearn available — using undersampling inside CV")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline

# Hyper-parameter grid; identical for both pipeline variants below.
log_param_grid = {
    "clf__penalty": ["l1", "l2"],
    "clf__C": [0.1, 1, 10],
    "clf__solver": ["liblinear", "saga"],
}

if not USE_IMBLEARN:
    # Without imblearn: plain sklearn pipeline, imbalance handled through
    # class weights on the classifier.
    log_pipe = SkPipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000, random_state=42, class_weight="balanced")),
    ])
else:
    # With imblearn: the undersampler is a pipeline step placed after
    # scaling and before the classifier.
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.under_sampling import RandomUnderSampler
    log_pipe = ImbPipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("undersample", RandomUnderSampler(random_state=42)),
        ("clf", LogisticRegression(max_iter=2000, random_state=42)),
    ])

# Exhaustive search over the grid, scored by balanced accuracy.
log_grid = GridSearchCV(
    log_pipe,
    log_param_grid,
    scoring="balanced_accuracy",
    cv=5,
    verbose=1,
    n_jobs=1,
)
log_grid.fit(X_train2, y_train2)

# Evaluate the refitted best pipeline on the held-out split.
log_best = log_grid.best_estimator_
log_pred = log_best.predict(X_test2)
try:
    # Column 1 holds the score for class 1.
    log_proba = log_best.predict_proba(X_test2)[:, 1]
except Exception:
    # No probabilities available: score-based metrics are skipped downstream.
    log_proba = None

print("Logistic best params:", log_grid.best_params_)
print_eval("Logistic", y_test2, log_pred, log_proba)

In [None]:
import joblib
import os
from datetime import datetime
import json

# Persist the tuned classifier, its scaler (when present) and a JSON metadata
# file into the "models" directory, all sharing one timestamp.

model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# Read the clock once so the file names and the recorded time always agree.
saved_at = datetime.now()
timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
model_name = f"logistic_model_{timestamp}"

model_path = os.path.join(model_dir, f"{model_name}.pkl")
scaler_path = os.path.join(model_dir, f"scaler_{timestamp}.pkl")
info_path = os.path.join(model_dir, f"model_info_{timestamp}.json")

# Fail with an explicit error when the training cell has not been run.
try:
    log_best
except NameError as exc:
    raise RuntimeError("Model not found (clf)") from exc

# Pull the fitted steps out of the pipeline. An estimator that exposes no
# `named_steps` (or has no "clf" step) is saved whole, without a scaler file.
pipeline_steps = getattr(log_best, "named_steps", None) or {}
scaler = pipeline_steps.get("scaler")
clf = pipeline_steps.get("clf")
if clf is None:
    clf = log_best
if clf is None:
    raise RuntimeError("Model not found (clf)")

# NOTE(review): only the "scaler" and "clf" steps are written; any other step
# of the pipeline (e.g. its "imputer") is not persisted, so whoever loads
# these files must prepare inputs accordingly.
joblib.dump(clf, model_path)
if scaler is not None:
    joblib.dump(scaler, scaler_path)

print("Model saved successfully!")
print(f"Model path: {model_path}")
if scaler is not None:
    print(f"Scaler path: {scaler_path}")
else:
    print("Scaler not available in the pipeline.")

model_info = {
    'model_type': 'LogisticRegression',
    'model_description': 'Optimized LogisticRegression (GridSearchCV, anti-leakage pipeline)',
    'train_datetime': saved_at.isoformat(),
    'model_version': '1.0',
    'n_features': int(X_full.shape[1]),
    # Column order used for training, so the separately saved scaler and
    # classifier can be fed correctly later.
    'feature_names': [str(column) for column in X_full.columns],
    # File names of the artifacts this metadata describes.
    'model_file': os.path.basename(model_path),
    'scaler_file': os.path.basename(scaler_path) if scaler is not None else None,
}

# Search results are optional: skip them when the grid search object is
# missing or was never fitted.
try:
    model_info.update({
        'best_params': log_grid.best_params_,
        'cv_best_balanced_accuracy': float(log_grid.best_score_),
    })
except (NameError, AttributeError):
    model_info['note'] = 'GridSearchCV details not available.'

# Serialize before opening the file so a serialization error cannot leave a
# truncated JSON file behind.
info_text = json.dumps(model_info, indent=2, ensure_ascii=False)
with open(info_path, 'w', encoding='utf-8') as f:
    f.write(info_text)

print(f"Meta information saved: {info_path}")
print("Directory contents:", os.listdir(model_dir))