# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Load the competition data; the 'id' column becomes the index of each frame.
df_train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv', index_col='id')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv', index_col='id')

# Data Summary Statistics
print("Train Data Description:")
print(df_train.describe())

# Split the training frame: every column but the last is used as a feature,
# the last column is used as the target.
# .copy() gives X its own data, so the column assignment below modifies a
# real frame instead of a slice of df_train (avoids SettingWithCopyWarning
# and leaves df_train untouched).
X = df_train.iloc[:, :-1].copy()
y = df_train.iloc[:, -1]

# Convert the 'Sex' column to the pandas category dtype in both frames.
# NOTE(review): categories are inferred independently for train and test;
# this presumably relies on both files containing the same set of values.
X['Sex'] = X['Sex'].astype('category')
df_test['Sex'] = df_test['Sex'].astype('category')

# Feature engineering

# Fit a baseline LightGBM regressor on every feature; its importances are
# used to rank the columns.
model = LGBMRegressor(verbose=0, random_state=1).fit(X, y)

# Pair each column name with its importance and order the table from the
# least to the most important feature.
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values(by='importance', ascending=True)
)

# Show the ranked table.
print("\nFeature Importances:", feature_importances, sep="\n")

# 7-fold shuffled cross-validation with a fixed seed so runs are repeatable.
kf = KFold(n_splits=7, shuffle=True, random_state=1)

# Backward elimination: try dropping the 0, 1 and 2 least important features
# (feature_importances is sorted ascending, so the first rows are the weakest)
# and keep the variant with the lowest cross-validated RMSLE.
best_i = 0
# Start from +inf so the first evaluated score always becomes the baseline;
# a finite sentinel would be reported as the "best" score if never beaten.
best_score = np.inf
# Never drop every feature: cap the number of trials at the feature count.
for i in range(min(3, len(feature_importances))):
    # Column names that remain after removing the i least important features
    candidate_columns = feature_importances['feature'].iloc[i:].tolist()

    # scikit-learn returns the negated MSLE, so flip the sign before the root.
    scores = cross_val_score(
        model,
        X[candidate_columns],
        y,
        scoring='neg_mean_squared_log_error',
        cv=kf,
        n_jobs=-1,
    )
    rmsle = np.sqrt(-np.mean(scores))
    print(f"RMSLE for removing {i} least important features: {rmsle}")

    if rmsle < best_score:
        best_score = rmsle
        best_i = i

print(f"\nBest RMSLE is {best_score} and {best_i} least important features were removed\n")

# Keep only the winning feature subset in both the training and test frames.
selected_features = feature_importances.iloc[best_i:]
kept_columns = selected_features['feature'].tolist()
X = X[kept_columns]
df_test = df_test[kept_columns]


# Modeling and Evaluation

# Names of the category-dtype columns, passed to CatBoost as cat_features.
cat_columns = X.select_dtypes(include='category').columns.tolist()

# The three gradient-boosting regressors whose predictions are combined
# later, all seeded identically and configured to run quietly.
catboost_model = CatBoostRegressor(random_state=1, cat_features=cat_columns, verbose=False)
lgbm_model = LGBMRegressor(verbose=-1, random_state=1)
# XGBoost's logging parameter is `verbosity`; `verbose` is not a recognised
# XGBRegressor parameter and only produces an "unused parameter" warning.
xgb_model = XGBRegressor(verbosity=0, random_state=1, enable_categorical=True)

# Fit the models on the training data
catboost_model.fit(X, y)
lgbm_model.fit(X, y)
xgb_model.fit(X, y)

# Make predictions on the test set
catboost_preds = catboost_model.predict(df_test)
lgbm_preds = lgbm_model.predict(df_test)
xgb_preds = xgb_model.predict(df_test)

# Blend the three models with an unweighted mean, then round to the nearest
# whole number and store the result as integers.
ensemble_mean = (catboost_preds + lgbm_preds + xgb_preds) / 3
final_preds = np.round(ensemble_mean).astype('int')

# Build the submission table (test ids alongside predicted 'Rings') and
# write it to disk without the DataFrame index.
submission = pd.DataFrame({'id': df_test.index, 'Rings': final_preds})
submission.to_csv("submission.csv", header=True, index=False)

# Preview the first rows of the file that was just written.
print("\nSubmission Head:", submission.head(), sep="\n")
