# In [None]:
import pandas as pd
# Importing XGBClassifier for the XGBoost model
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Accuracy metric, plus RandomizedSearchCV (used instead of GridSearchCV)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV 
# We need scipy.stats for defining parameter distributions
from scipy.stats import uniform, randint
import numpy as np

# --- Configuration ---
# How many randomly sampled hyperparameter combinations the search
# further down evaluates (it is passed as `n_iter`).
N_ITER_SEARCH = 50
# ---------------------

# --- 1. Load Data ---
# All three CSV files are read from the current working directory. If any of
# them is missing, the problem is reported and the run is aborted.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    sample_submission_df = pd.read_csv('sample_submission.csv')
except FileNotFoundError as e:
    print(f"Error: One or more files not found. Ensure 'train.csv', 'test.csv', and 'sample_submission.csv' are available. {e}")
    # `exit()` is a helper injected by the `site` module for interactive
    # sessions; it is not guaranteed to exist and exits with status 0.
    # Raising SystemExit directly always works and signals failure (status 1).
    raise SystemExit(1) from e

print("Data loaded successfully.")

# Keep the test-set identifiers; they are needed again when the submission
# file is written.
test_ids = test_df['id']

# 'id' identifies a row and is not a predictor, so drop it from both frames.
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns='id')

# --- 2. Separate Features (X) and Target (y) ---
# The label column is 'WeightCategory'; everything else is a feature.
y = train_df['WeightCategory']
X = train_df.drop(columns='WeightCategory')

# --- 3. Data Cleaning, Imputation, and Feature Engineering (NEW STEP) ---
print("3. Feature Engineering and Imputation...")

# Record the feature names by dtype as they are at this point in the script:
# object-typed columns are treated as categorical, numeric ones as numerical.
categorical_cols_raw = X.select_dtypes(include='object').columns
numerical_cols_raw = X.select_dtypes(include='number').columns

# A. Calculate BMI: Weight / (Height^2).
def calculate_bmi(df):
    """Add a 'BMI' column (``Weight / Height**2``) to *df* and return *df*.

    The frame is modified in place (the 'BMI' column is added or overwritten)
    and the same object is returned.

    Rows whose 'Height' is exactly 0 get a NaN BMI. Substituting a tiny height
    instead would produce BMI values around ``Weight * 1e12``, and such
    outliers dominate the mean and variance of any later standardisation.
    The 'Height' column itself is left untouched.

    Args:
        df: DataFrame with numeric 'Height' and 'Weight' columns.

    Returns:
        The same DataFrame, now carrying a 'BMI' column.
    """
    # Mask zero heights for the division only; NaN heights stay NaN.
    safe_height = df['Height'].where(df['Height'] != 0)
    df['BMI'] = df['Weight'] / (safe_height ** 2)
    return df

# B. Imputation: Fill missing values
# Imputation runs BEFORE the BMI step. 'BMI' is derived from Height and Weight
# and is not listed in numerical_cols_raw (unless the raw data already had such
# a column), so a BMI computed from a missing Height/Weight would stay NaN and
# never be filled. Imputing first lets BMI be built from the filled values.

# Numerical features: fill with the training-set median (robust to outliers).
# The same training median is reused for the test set.
for col in numerical_cols_raw:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)

# Categorical features: missing values become their own 'Missing' level.
for col in categorical_cols_raw:
    X[col] = X[col].fillna('Missing')
    test_df[col] = test_df[col].fillna('Missing')
print("   -> Imputed missing values with median/Missing.")

# Apply BMI calculation to both datasets (now based on imputed values)
X = calculate_bmi(X)
test_df = calculate_bmi(test_df)
print("   -> Added highly predictive 'BMI' feature.")


# --- 4. Pre-processing: Handling Categorical Data (OHE) ---
categorical_cols = list(X.select_dtypes(include='object').columns)

# Each frame is encoded on its own; the first level of every categorical
# column is dropped.
X_processed = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_test_processed = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Align columns: give the test frame exactly the training columns, in the
# training order. Columns absent from the test frame are filled with 0;
# columns that exist only in the test frame are dropped.
X_test_processed = X_test_processed.reindex(columns=X_processed.columns, fill_value=0)


# --- 5. Pre-processing: Feature Scaling ---
# Every numeric column of the encoded training frame is standardised
# (this includes 'BMI' when it is numeric).
scaler = StandardScaler()
numerical_cols = list(X_processed.select_dtypes(include='number').columns)

# Learn the scaling statistics from the training frame only, then apply the
# same transformation to both frames.
scaler.fit(X_processed[numerical_cols])
X_processed[numerical_cols] = scaler.transform(X_processed[numerical_cols])
X_test_processed[numerical_cols] = scaler.transform(X_test_processed[numerical_cols])


# --- 6. Target Encoding ---
# Turn the class labels into integer codes; the fitted encoder is kept so the
# predictions can be mapped back to label names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the label -> code table for reference.
class_names = label_encoder.classes_
target_mapping = dict(zip(class_names, label_encoder.transform(class_names)))
print("\n--- Target Class Mapping (for reference) ---")
for category in target_mapping:
    print(f"Code {target_mapping[category]}: {category}")
print("-" * 40)


# --- 7. Hyperparameter Tuning using RandomizedSearchCV ---

print(f"Starting Randomized Hyperparameter Tuning with {N_ITER_SEARCH} total trials...")

# Search space. scipy's randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_distributions = dict(
    # Number of trees: integers 100..699
    n_estimators=randint(low=100, high=700),
    # Learning rate: continuous in [0.005, 0.2]
    learning_rate=uniform(loc=0.005, scale=0.195),
    # Tree depth: integers 3..10
    max_depth=randint(low=3, high=11),
    # Row subsampling: continuous in [0.5, 1.0]
    subsample=uniform(loc=0.5, scale=0.5),
    # Per-tree feature subsampling: continuous in [0.5, 1.0]
    colsample_bytree=uniform(loc=0.5, scale=0.5),
    # L2 regularization: one of three discrete values
    reg_lambda=[0.1, 1, 10],
)

# Base XGBoost classifier. The search clones it and overrides the sampled
# hyperparameters; the settings below stay fixed across all trials.
xgb_settings = {
    'objective': 'multi:softmax',
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
base_model = XGBClassifier(**xgb_settings)

# Configure the randomized search:
#   n_iter       -> number of sampled parameter combinations
#   scoring      -> metric to optimize (accuracy)
#   cv=3         -> 3-fold cross-validation
#   verbose=2    -> progress output while fitting
#   random_state -> reproducible parameter sampling
#   n_jobs=-1    -> use all available cores
random_search = RandomizedSearchCV(
    base_model,
    param_distributions,
    n_iter=N_ITER_SEARCH,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search (fit returns the search object itself) and keep the best
# estimator it found.
model = random_search.fit(X_processed, y_encoded).best_estimator_

print("\nTuning complete.")
print(f"Total Randomized Trials: {N_ITER_SEARCH}")
print(f"Best parameters found: {random_search.best_params_}")


# --- 8. Evaluate on Training Data and Generate Predictions for the Test Set ---

# Report the mean cross-validated accuracy of the best parameter set. This is
# the score the search optimised, measured on held-out folds, so it is the
# more trustworthy estimate of generalisation.
print(f"Best Cross-Validated Accuracy: {random_search.best_score_:.4f}")

# Training accuracy of the tuned model. It is measured on the same rows the
# model was fitted on, so treat it as an optimistic upper bound and compare
# it against the cross-validated score above to spot overfitting.
y_train_pred_encoded = model.predict(X_processed)
train_accuracy = accuracy_score(y_encoded, y_train_pred_encoded)
print(f"Tuned Model Training Accuracy: {train_accuracy:.4f}")

print("Generating predictions on the test set...")
y_pred_encoded = model.predict(X_test_processed)

# Convert the numerical predictions back into the original category names
y_pred_categories = label_encoder.inverse_transform(y_pred_encoded)


# --- 9. Create Submission File ---

# Start from the saved test ids and attach the predicted label for each row.
submission_df = test_ids.to_frame(name='id')
submission_df['WeightCategory'] = y_pred_categories

# Write the submission without the DataFrame index.
submission_filepath = 'submission_xgboost_random_search_bmi.csv'
submission_df.to_csv(submission_filepath, index=False)

print(f"\nSuccessfully generated predictions and saved to: {submission_filepath}")
print("\n--- Submission Head (First 5 Rows) ---")
preview = submission_df.head()
print(preview.to_string(index=False))
