In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train_csv = '/kaggle/input/nutrition-health-survey/Train_Data.csv'
test_csv = '/kaggle/input/nutrition-health-survey/Test_Data.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Number of missing values in each training column.
df_train.isna().sum()

In [None]:
# Show the dtype of every training column.
df_train.dtypes

In [None]:
# Display the raw training frame.
df_train

In [None]:
# Partition the training columns by dtype: object-typed columns go to
# cat_cols, float64-typed columns go to num_cols.
cat_cols = list(df_train.select_dtypes(include=['object']).columns)
num_cols = list(df_train.select_dtypes(include=['float64']).columns)

In [None]:
# Build the modelling frame: one-hot encode the columns listed in cat_cols and
# expose the target as a 0/1 integer column named 'age_group_Senior'.
#
# Rows whose 'age_group' label is missing are dropped rather than filled with a
# "Missing" category. Filling and then discarding the "Missing" dummy column
# leaves those rows with age_group_Senior == 0, i.e. silently labelled as the
# non-senior class. Filtering into a new frame also leaves df_train untouched,
# so this cell can be re-run safely. The literal "Missing" is excluded as well
# in case df_train was already filled earlier in the same kernel session.
has_label = df_train['age_group'].notna() & (df_train['age_group'] != "Missing")
df_labeled = df_train.loc[has_label]

df_encoded1 = pd.get_dummies(df_labeled, columns=cat_cols, drop_first=True)
# NOTE(review): drop_first removes the first category of each encoded column;
# this relies on 'Senior' not being the first 'age_group' category
# (presumably the labels are Adult/Senior) - confirm against the data.
df_encoded1['age_group_Senior'] = df_encoded1['age_group_Senior'].astype(int)

# With unlabeled rows removed there is normally no "age_group_Missing" dummy;
# errors="ignore" makes the drop a no-op in that case instead of a KeyError.
df_encoded = df_encoded1.drop(columns="age_group_Missing", errors="ignore")
df_encoded

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Impute missing feature values with an IterativeImputer driven by a random
# forest, then rebuild df_imputed with the same columns as df_encoded.
#
# The target column is held out of the imputer: fitting on the full frame lets
# the label influence the imputed feature values (target leakage), which
# cannot be reproduced on unlabeled test data.
target_col = 'age_group_Senior'
feature_cols = [col for col in df_encoded.columns if col != target_col]

imputer = IterativeImputer(
    # Seed the forest itself so repeated runs give the same imputations; the
    # imputer's own random_state is documented as covering feature selection,
    # imputation order and posterior sampling.
    estimator=RandomForestRegressor(random_state=42),
    max_iter=10,
    random_state=42,
)
features_imputed = imputer.fit_transform(df_encoded[feature_cols])

df_imputed = pd.DataFrame(features_imputed, columns=feature_cols)
# Re-attach the untouched target positionally (the new frame has a fresh
# index) as float, matching the dtype the all-float imputer output had before.
df_imputed[target_col] = df_encoded[target_col].to_numpy(dtype=float)
# Restore the original column order of df_encoded.
df_imputed = df_imputed[df_encoded.columns.tolist()]
df_imputed

In [None]:
# Check how many missing values remain per column after imputation.
df_imputed.isna().sum()

In [None]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Separate the feature matrix from the 'age_group_Senior' target column.
X = df_imputed.drop('age_group_Senior', axis=1)
y = df_imputed.loc[:, 'age_group_Senior']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Candidate regressors, each stored as (estimator, hyper-parameter grid).
# Entries are added in the order the tuning loop will visit them.
models = {}

models["RandomForest"] = (
    RandomForestRegressor(random_state=42),
    dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5, 10],
    ),
)

models["GradientBoosting"] = (
    GradientBoostingRegressor(random_state=42),
    dict(
        n_estimators=[100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
)

models["XGBoost"] = (
    XGBRegressor(random_state=42, verbosity=0),
    dict(
        n_estimators=[100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
)

models["CatBoost"] = (
    CatBoostRegressor(random_state=42, verbose=0),
    dict(
        iterations=[100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        depth=[4, 6, 8],
    ),
)

models["AdaBoost"] = (
    AdaBoostRegressor(random_state=42),
    dict(
        n_estimators=[50, 100, 150],
        learning_rate=[0.01, 0.1, 1],
    ),
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Grid-search every candidate in `models` with 3-fold CV on the training split,
# keep the refitted best estimator per model name, and score it on the
# held-out split.
best_models = {}   # model name -> best estimator found by the grid search
results = []       # (model name, best params, held-out RMSE) tuples

for name, (model, param_grid) in models.items():
    print(f"\n Training & Tuning: {name}")

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1
    )

    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    best_models[name] = best_model

    y_pred = best_model.predict(X_test)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in newer scikit-learn
    # releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    results.append((name, search.best_params_, rmse))

    print(f" Best Params: {search.best_params_}")
    print(f" Test RMSE: {rmse:.4f}")

In [None]:
# Number of missing values in each test column.
df_test.isna().sum()

In [None]:
# Display the raw test frame.
df_test

In [None]:
# Show the dtype of every test column.
df_test.dtypes

In [None]:
# Fit a fresh imputer on the training num_cols columns only (no target), then
# apply it to the same columns of the test set.
imputer = IterativeImputer(
    # Seed the forest itself so the imputed test values, and therefore the
    # submission, are reproducible across runs.
    estimator=RandomForestRegressor(random_state=42),
    max_iter=10,
    random_state=42,
)
# The transformed training array is not used below; the name is kept in case
# other cells refer to it.
df_imputed_train = imputer.fit_transform(df_encoded[num_cols])
df_test_num = df_test[num_cols]
df_test_imputed = imputer.transform(df_test_num)
# Keep df_test's index so imputed rows stay aligned with the original rows.
df_test_imputed = pd.DataFrame(df_test_imputed, columns=num_cols, index=df_test.index)

In [None]:
# Check how many missing values remain per column in the imputed test frame.
df_test_imputed.isna().sum()

In [None]:
# Use the tuned XGBoost estimator to score the imputed test features.
chosen_model_name = "XGBoost"
best_model = best_models[chosen_model_name]
y_pred = best_model.predict(df_test_imputed)

In [None]:
# Sweep cut-offs in steps of 0.01 starting at 0 and print, for each one, how
# many test predictions land at or above it (1) versus below it (0).
thresholds = np.arange(0.0, 1.01, 0.01)

for t in thresholds:
    preds = (y_pred >= t).astype(int)
    senior_count = preds.sum()
    adult_count = len(preds) - senior_count
    print(f"Threshold {t:.2f}: Senior(1) = {senior_count}, Adult(0) = {adult_count}")

In [None]:
# Frequency of each label in the training 'age_group' column.
df_train.loc[:, 'age_group'].value_counts()

In [None]:
# Number of missing values in each column of the encoded training frame.
df_encoded.isna().sum()

In [None]:
# Finer sweep (step 0.001): for each cut-off print the predicted class counts
# and the ratio of 0-class to 1-class predictions. Cut-offs that produce no
# 1-class predictions are skipped so the ratio never divides by zero.
thresholds = np.arange(0, 1.01, 0.001)

for t in thresholds:
    y_pred_binary = (y_pred >= t).astype(int)
    num_adult = (y_pred_binary == 0).sum()
    num_senior = (y_pred_binary == 1).sum()

    if num_senior != 0:
        ratio = num_adult / num_senior
        print(f"Threshold {t:.4f} → Adult: {num_adult}, Senior: {num_senior}, Ratio: {ratio:.4f}")

In [None]:
# Binarise the test predictions at a fixed cut-off of 0.23.
# NOTE(review): presumably chosen by hand from the threshold sweeps - confirm.
y_pred_binary = np.where(y_pred >= 0.23, 1, 0)

In [None]:
# Write the binary predictions as a single 'age_group' column, without the index.
submission = pd.Series(y_pred_binary, name='age_group').to_frame()
submission.to_csv('submission.csv', index=False)