In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 1. Load data
train = pd.read_csv("C:/Users/User/Downloads/Train_Data.csv")
test = pd.read_csv("C:/Users/User/Downloads/Test_Data.csv")

# 2. Clean and map target
train['age_group'] = train['age_group'].astype(str).str.strip().str.title()
train = train[train['age_group'].isin(['Adult', 'Senior'])]
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1}).astype(int)

# 3. Separate target and drop ID
test_ids = test['SEQN']
train.drop(columns=['SEQN'], inplace=True)
test.drop(columns=['SEQN'], inplace=True)

X = train.drop(columns='age_group')
y = train['age_group']
X_test = test.copy()

# 4. Handle missing values
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 5. Feature engineering
def add_features(df):
    df['IS_OBESE'] = (df['BMXBMI'] >= 30).astype(int)
    df['IS_DIABETIC'] = (df['DIQ010'] == 1).astype(int)
    df['GLU_BMI'] = df['LBXGLU'] / (df['BMXBMI'] + 1)
    df['GLU_GT_DIFF'] = df['LBXGLU'] - df['LBXGLT']
    df['RISK_FLAG'] = ((df['PAQ605'] == 2) & (df['LBXGLU'] > 130) & (df['BMXBMI'] > 28)).astype(int)
    return df

X = add_features(X)
X_test = add_features(X_test)

# 6. Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds_proba = np.zeros(len(X_test))
f1_scores = []
precisions = []
recalls = []
accuracies = []
optimal_thresholds = []

# 7. Train with optimal threshold search
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)

    val_proba = model.predict_proba(X_val)[:, 1]

    # 🔍 Find best threshold for F1
    best_thresh = 0.5
    best_f1 = 0
    for t in np.arange(0.1, 0.9, 0.01):
        preds = (val_proba >= t).astype(int)
        f1 = f1_score(y_val, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

    optimal_thresholds.append(best_thresh)
    val_preds = (val_proba >= best_thresh).astype(int)

    f1 = f1_score(y_val, val_preds)
    precision = precision_score(y_val, val_preds)
    recall = recall_score(y_val, val_preds)
    accuracy = accuracy_score(y_val, val_preds)

    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)

    # Predict on test
    test_preds_proba += model.predict_proba(X_test)[:, 1]

    print(f"Fold {fold} - F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, Best Thresh: {best_thresh:.2f}")

# 8. Final prediction with avg threshold
avg_thresh = np.mean(optimal_thresholds)
base_preds = (test_preds_proba / skf.n_splits >= avg_thresh).astype(int)

# 9. Rule-based override
rule_override = ((X_test['RISK_FLAG'] == 1) & (X_test['LBXIN'] > 80)).astype(int)
final_preds = np.where(rule_override == 1, 1, base_preds)

# 10. Create submission
submission = pd.DataFrame({'age_group': final_preds})
submission.to_csv("submission_logistic_boosted.csv", index=False)

# 11. Print cross-validation results
print("\n📊 Cross-validation Summary:")
print(f"Avg F1 Score:      {np.mean(f1_scores):.4f}")
print(f"Avg Precision:     {np.mean(precisions):.4f}")
print(f"Avg Recall:        {np.mean(recalls):.4f}")
print(f"Avg Accuracy:      {np.mean(accuracies):.4f}")
print(f"Optimal threshold used: {avg_thresh:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 - F1: 0.4348, Precision: 0.4000, Recall: 0.4762, Accuracy: 0.8005, Best Thresh: 0.60
Fold 2 - F1: 0.4906, Precision: 0.4062, Recall: 0.6190, Accuracy: 0.7928, Best Thresh: 0.56
Fold 3 - F1: 0.4352, Precision: 0.3206, Recall: 0.6774, Accuracy: 0.7205, Best Thresh: 0.50
Fold 4 - F1: 0.3902, Precision: 0.2817, Recall: 0.6349, Accuracy: 0.6795, Best Thresh: 0.47
Fold 5 - F1: 0.5426, Precision: 0.5303, Recall: 0.5556, Accuracy: 0.8487, Best Thresh: 0.61

📊 Cross-validation Summary:
Avg F1 Score:      0.4587
Avg Precision:     0.3878
Avg Recall:        0.5926
Avg Accuracy:      0.7684
Optimal threshold used: 0.5480
