In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt

import warnings
# Install a blanket "ignore" filter: every warning raised after this point
# is hidden, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training data from the working directory.
df = pd.read_csv(filepath_or_buffer='Train_Data.csv')

In [None]:
# Show the first rows of the training frame as a quick sanity check.
df.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [None]:
# Print column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIAGENDR   1948 non-null   float64
 2   PAQ605     1953 non-null   float64
 3   BMXBMI     1948 non-null   float64
 4   LBXGLU     1953 non-null   float64
 5   DIQ010     1948 non-null   float64
 6   LBXGLT     1955 non-null   float64
 7   LBXIN      1957 non-null   float64
 8   age_group  1952 non-null   object 
dtypes: float64(8), object(1)
memory usage: 138.4+ KB


In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
SEQN,12
RIAGENDR,18
PAQ605,13
BMXBMI,18
LBXGLU,13
DIQ010,18
LBXGLT,11
LBXIN,9
age_group,14


In [None]:
# Encode the target labels as integers: 'Adult' -> 0, 'Senior' -> 1.
# The mapping is applied only while the column still holds the raw labels.
# Re-running the cell on an already-encoded (numeric) column would map every
# value to NaN, and the later dropna on 'age_group' would then remove all rows.
if df['age_group'].dtype.kind not in 'biuf':
    df['age_group'] = df['age_group'].map({'Adult': 0, 'Senior': 1})


In [None]:
# Fill gaps in the continuous measurements with each column's median.
# The filled column is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series and does
# not update `df` when pandas Copy-on-Write is active.
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())


In [None]:
# Fill gaps in the coded (categorical) columns with each column's most
# frequent value. The filled column is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series and does
# not update `df` when pandas Copy-on-Write is active.
cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# Remove, in place, every row whose target label is missing.
unlabeled_rows = df.index[df['age_group'].isna()]
df.drop(index=unlabeled_rows, inplace=True)


In [None]:
def bmi_category(bmi):
    """Bucket a BMI value into an integer category.

    Returns 0 (underweight) below 18.5, 1 (normal) below 25,
    2 (overweight) below 30 and 3 (obese) otherwise. A NaN input fails
    every comparison and therefore also lands in category 3.
    """
    if bmi < 18.5:
        return 0
    if bmi < 25:
        return 1
    if bmi < 30:
        return 2
    return 3

# Categorise every BMI reading element by element.
df['BMI_cat'] = df['BMXBMI'].map(bmi_category)


In [None]:
# Ratio of LBXIN to LBXGLU; the +1 in the denominator guards against a zero
# LBXGLU value.
glucose_plus_one = df['LBXGLU'].add(1)
df['INS_GLUC_RATIO'] = df['LBXIN'].div(glucose_plus_one)


In [None]:
# Difference between the LBXGLT and LBXGLU readings.
df['GLU_DIFF'] = df['LBXGLT'].sub(df['LBXGLU'])


In [None]:
# Interaction term: product of the PAQ605 and DIQ010 codes.
# NOTE(review): both columns look like survey answer codes, so the product is
# a product of codes rather than of quantities -- confirm this is intended.
df['ACTIVE_DIABETIC'] = df['PAQ605'].mul(df['DIQ010'])


In [None]:
# Hand-weighted linear blend of five raw columns. The terms are accumulated
# left to right in the listed order so the floating-point result is unchanged.
risk_weights = {
    'BMXBMI': 0.3,
    'LBXGLU': 0.2,
    'LBXGLT': 0.2,
    'LBXIN': 0.2,
    'PAQ605': 0.1,
}
weighted_terms = [weight * df[name] for name, weight in risk_weights.items()]
risk_score = weighted_terms[0]
for term in weighted_terms[1:]:
    risk_score = risk_score + term
df['RISK_SCORE'] = risk_score


In [None]:
# Binary flags derived from fixed cut-offs:
#   is_diabetic          -> LBXGLU at or above 126 (fasting glucose threshold)
#   is_hyperinsulinemia  -> LBXIN strictly above 25 (example threshold)
df['is_diabetic'] = df['LBXGLU'].ge(126).astype(int)
df['is_hyperinsulinemia'] = df['LBXIN'].gt(25).astype(int)


In [None]:
# Interaction terms: LBXGLU and BMXBMI each multiplied by the RIAGENDR code.
for target_name, source_name in (('GLU_GENDER', 'LBXGLU'), ('BMI_GENDER', 'BMXBMI')):
    df[target_name] = df[source_name].mul(df['RIAGENDR'])


In [None]:
def preprocess(df, is_train=True, fill_values=None):
    """Impute missing values and add the engineered feature columns.

    The frame passed in is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns BMXBMI, LBXGLU, LBXGLT, LBXIN, RIAGENDR,
        PAQ605 and DIQ010. An ``age_group`` column is used only when
        ``is_train`` is true.
    is_train : bool, default True
        When true and ``age_group`` is present, the labels 'Adult'/'Senior'
        are encoded as 0/1 and rows without a label are dropped.
    fill_values : dict, optional
        Mapping of column name to the value used to fill that column's gaps.
        Columns not listed fall back to the frame's own median (numeric
        columns) or mode (coded columns). Passing the training statistics
        here lets a test frame be imputed consistently with training.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, imputed and extended with BMI_cat, INS_GLUC_RATIO,
        GLU_DIFF, ACTIVE_DIABETIC, RISK_SCORE, is_diabetic,
        is_hyperinsulinemia, GLU_GENDER and BMI_GENDER.
    """
    fill_values = dict(fill_values or {})
    has_target = is_train and 'age_group' in df.columns

    # Encode the target only while it still holds the raw labels; mapping an
    # already-encoded numeric column would turn every label into NaN and the
    # dropna below would then discard all rows.
    if has_target and df['age_group'].dtype.kind not in 'biuf':
        df['age_group'] = df['age_group'].map({'Adult': 0, 'Senior': 1})

    # Impute continuous columns with the median. The result is assigned back
    # because `df[col].fillna(..., inplace=True)` acts on a temporary Series
    # and leaves `df` untouched under pandas Copy-on-Write.
    num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
    for col in num_cols:
        fill = fill_values[col] if col in fill_values else df[col].median()
        df[col] = df[col].fillna(fill)

    # Impute coded columns with the most frequent value.
    cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
    for col in cat_cols:
        if col in fill_values:
            fill = fill_values[col]
        else:
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            fill = modes.iloc[0]
        df[col] = df[col].fillna(fill)

    # Rows without a label cannot be used for training.
    if has_target:
        df.dropna(subset=['age_group'], inplace=True)

    # Feature engineering. Columns are created in the same order as the
    # notebook's training cells so train and test frames line up.
    df['BMI_cat'] = df['BMXBMI'].apply(
        lambda x: 0 if x < 18.5 else 1 if x < 25 else 2 if x < 30 else 3
    )
    df['INS_GLUC_RATIO'] = df['LBXIN'] / (df['LBXGLU'] + 1)
    df['GLU_DIFF'] = df['LBXGLT'] - df['LBXGLU']
    df['ACTIVE_DIABETIC'] = df['PAQ605'] * df['DIQ010']
    df['RISK_SCORE'] = (
        0.3 * df['BMXBMI'] +
        0.2 * df['LBXGLU'] +
        0.2 * df['LBXGLT'] +
        0.2 * df['LBXIN'] +
        0.1 * df['PAQ605']
    )
    df['is_diabetic'] = (df['LBXGLU'] >= 126).astype(int)
    df['is_hyperinsulinemia'] = (df['LBXIN'] > 25).astype(int)
    df['GLU_GENDER'] = df['LBXGLU'] * df['RIAGENDR']
    df['BMI_GENDER'] = df['BMXBMI'] * df['RIAGENDR']

    return df


In [None]:
# Read the unlabelled test data from the working directory.
test_df = pd.read_csv(filepath_or_buffer='Test_Data.csv')

In [None]:

# Impute and feature-engineer the test frame; is_train=False skips the
# target encoding and the row drop.
test_df = preprocess(df=test_df, is_train=False)


In [None]:
# `train_df` is a second name for the cleaned training frame (no copy is made).
train_df = df

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MinMaxScaler
# Features are every column except the SEQN identifier and the target.
X = train_df.drop(['SEQN', 'age_group'], axis=1)
y = train_df['age_group']

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [None]:
# Baseline classifier: a depth-limited random forest with a fixed seed.
rf_params = dict(n_estimators=100, max_depth=10, random_state=42)
model = RandomForestClassifier(**rf_params)

# Mean F1 over 5 cross-validation folds of the full feature matrix.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='f1', cv=5)
mean_f1 = scores.mean()
print(f"Mean F1 Score (5-fold): {mean_f1:.4f}")

# Fit on the training split, then report on the held-out validation split.
val_preds = model.fit(X_train, y_train).predict(X_val)
print("Validation Report:\n", classification_report(y_val, val_preds))


Mean F1 Score (5-fold): 0.1999
Validation Report:
               precision    recall  f1-score   support

         0.0       0.85      0.97      0.91       328
         1.0       0.41      0.11      0.17        63

    accuracy                           0.83       391
   macro avg       0.63      0.54      0.54       391
weighted avg       0.78      0.83      0.79       391



In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, classification_report
import xgboost as xgb
import lightgbm as lgb


In [None]:
# XGBoost binary classifier with hand-picked hyper-parameters.
xgb_params = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Mean F1 over 5 cross-validation folds of the full feature matrix.
xgb_scores = cross_val_score(estimator=xgb_model, X=X, y=y, scoring='f1', cv=5)
print(f"XGBoost 5-Fold F1 Score: {xgb_scores.mean():.4f}")


XGBoost 5-Fold F1 Score: 0.2564


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

# Re-create the XGBoost classifier, fit it on the training split and
# evaluate it on the held-out validation split.
xgb_params = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_val_preds = xgb_model.predict(X_val)

# Print each report under its heading.
xgb_reports = (
    ("🔹 XGBoost Classification Report:", classification_report),
    ("🔹 XGBoost Confusion Matrix:", confusion_matrix),
)
for heading, report_fn in xgb_reports:
    print(heading)
    print(report_fn(y_val, xgb_val_preds))


🔹 XGBoost Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89       328
         1.0       0.31      0.16      0.21        63

    accuracy                           0.81       391
   macro avg       0.58      0.55      0.55       391
weighted avg       0.77      0.81      0.78       391

🔹 XGBoost Confusion Matrix:
[[306  22]
 [ 53  10]]


In [None]:
# LightGBM binary classifier with hand-picked hyper-parameters.
# verbose=-1 silences LightGBM's per-fit info logging, which otherwise
# repeats for every CV fold and buries the score printed below.
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    random_state=42,
    verbose=-1
)

# 5-Fold Cross Validation
lgb_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='f1')
print(f"LightGBM 5-Fold F1 Score: {lgb_scores.mean():.4f}")


[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1775
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.160794 -> initscore=-1.652329
[LightGBM] [Info] Start training from score -1.652329
[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1778
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 16
[LightGBM] [Info] [binary:

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split data first
# Split first so the validation rows never influence the resampling.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

# Oversample the minority class of the training split only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
counts_before = y_train.value_counts().to_dict()
counts_after = pd.Series(y_train_resampled).value_counts().to_dict()
print("Original y_train distribution:", counts_before)
print("After SMOTE:", counts_after)


Original y_train distribution: {0.0: 1310, 1.0: 251}
After SMOTE: {0.0: 1310, 1.0: 1310}


In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

# Same XGBoost configuration as before, now fitted on the SMOTE-resampled
# training data and evaluated on the untouched validation split.
xgb_params = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_resampled, y_train_resampled)
xgb_val_preds = xgb_model.predict(X_val)

# Print each report under its heading.
smote_reports = (
    ("🔹 XGBoost Classification Report (SMOTE):", classification_report),
    ("🔹 XGBoost Confusion Matrix (SMOTE):", confusion_matrix),
)
for heading, report_fn in smote_reports:
    print(heading)
    print(report_fn(y_val, xgb_val_preds))


🔹 XGBoost Classification Report (SMOTE):
              precision    recall  f1-score   support

         0.0       0.85      0.81      0.83       328
         1.0       0.21      0.27      0.24        63

    accuracy                           0.72       391
   macro avg       0.53      0.54      0.53       391
weighted avg       0.75      0.72      0.73       391

🔹 XGBoost Confusion Matrix (SMOTE):
[[265  63]
 [ 46  17]]


In [None]:
# Ratio of majority / minority class
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
print(f"scale_pos_weight = {scale_pos_weight:.2f}")


scale_pos_weight = 5.22


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Hyper-parameter grid. The class imbalance is handled by scale_pos_weight,
# which is fixed to the majority/minority ratio of the training split.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [scale_pos_weight]
}

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1
)

# Search on the ORIGINAL training split, not the SMOTE-resampled one.
# Cross-validating on resampled data puts synthetic points, interpolated from
# rows of the held-out fold, into the training folds. That leak inflates the
# CV score far above what the validation split shows. Resampled data is also
# already balanced, so adding scale_pos_weight would correct the imbalance twice.
grid_search.fit(X_train, y_train)

print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Best F1 Score:", grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
✅ Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'scale_pos_weight': np.float64(5.219123505976095), 'subsample': 0.8}
✅ Best F1 Score: 0.882146277610334


In [None]:
# Take the refitted best estimator from the grid search and score it on the
# held-out validation split.
best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)

val_report = classification_report(y_val, val_preds)
val_matrix = confusion_matrix(y_val, val_preds)

print("🔹 Classification Report (Validation):")
print(val_report)

print("🔹 Confusion Matrix (Validation):")
print(val_matrix)


🔹 Classification Report (Validation):
              precision    recall  f1-score   support

         0.0       0.86      0.76      0.81       328
         1.0       0.23      0.37      0.28        63

    accuracy                           0.70       391
   macro avg       0.54      0.56      0.54       391
weighted avg       0.76      0.70      0.72       391

🔹 Confusion Matrix (Validation):
[[249  79]
 [ 40  23]]


In [None]:
# List the test frame's columns, for comparison with the training columns.
test_df.columns

Index(['SEQN', 'RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT',
       'LBXIN', 'BMI_cat', 'INS_GLUC_RATIO', 'GLU_DIFF', 'ACTIVE_DIABETIC',
       'RISK_SCORE', 'GLU_GENDER', 'BMI_GENDER', 'is_diabetic',
       'is_hyperinsulinemia'],
      dtype='object')

In [None]:
# List the training frame's columns, for comparison with the test columns.
train_df.columns

Index(['SEQN', 'RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT',
       'LBXIN', 'age_group', 'BMI_cat', 'INS_GLUC_RATIO', 'GLU_DIFF',
       'ACTIVE_DIABETIC', 'RISK_SCORE', 'is_diabetic', 'is_hyperinsulinemia',
       'GLU_GENDER', 'BMI_GENDER'],
      dtype='object')

In [None]:
# Read the sample submission to see the expected output layout.
sub = pd.read_csv(filepath_or_buffer='Sample_Submission.csv')

In [None]:
# Show which columns the submission file is expected to contain.
sub.columns

Index(['age_group'], dtype='object')

In [None]:
# 1. Define exactly the features you trained on, in the same order:
# 1. Take the feature list straight from the training matrix, so the names and
#    their order always match what the model was fitted on (a hand-copied list
#    can silently drift when features are added or reordered).
feature_cols = X.columns.tolist()

# 2. Fail early with a clear message if the test frame lacks a training feature.
missing_cols = [name for name in feature_cols if name not in test_df.columns]
if missing_cols:
    raise KeyError(f"Test data is missing training features: {missing_cols}")

# 3. Select those columns in training order (this also leaves out 'SEQN'):
X_test = test_df[feature_cols]

# 4. Quick sanity check that train/test columns match:
print("Train cols:", X.columns.tolist())
print("Test  cols:", X_test.columns.tolist())

# 5. Now make predictions:
test_preds = best_model.predict(X_test)

# 6. Build submission with only the age_group column:
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv('submission.csv', index=False)


Train cols: ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN', 'BMI_cat', 'INS_GLUC_RATIO', 'GLU_DIFF', 'ACTIVE_DIABETIC', 'RISK_SCORE', 'is_diabetic', 'is_hyperinsulinemia', 'GLU_GENDER', 'BMI_GENDER']
Test  cols: ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN', 'BMI_cat', 'INS_GLUC_RATIO', 'GLU_DIFF', 'ACTIVE_DIABETIC', 'RISK_SCORE', 'is_diabetic', 'is_hyperinsulinemia', 'GLU_GENDER', 'BMI_GENDER']
