In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [2]:
pip install optuna-integration[lightgbm]

Collecting optuna-integration[lightgbm]
  Downloading optuna_integration-4.1.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.1.0-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

import lightgbm as lgb
import optuna
from optuna.integration import LightGBMTunerCV

import warnings
warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')
sample_sub = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')

# Inspect datasets
print("Train Data Shape:", df_train.shape)
print("Test Data Shape:", df_test.shape)


Train Data Shape: (1200000, 21)
Test Data Shape: (800000, 20)


In [4]:
# Save test IDs for submission
test_ids = df_test['id']

# Drop 'id' columns from train and test datasets
df_train.drop(columns=['id'], inplace=True)
df_test.drop(columns=['id'], inplace=True)

# Extract date features
def create_date_features(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Month'] = df['Policy Start Date'].dt.month
    df['Day'] = df['Policy Start Date'].dt.day
    df['Day_of_Week'] = df['Policy Start Date'].dt.dayofweek

    # Cyclical features
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)

    df.drop(columns=['Policy Start Date'], inplace=True)
    return df

df_train = create_date_features(df_train)
df_test = create_date_features(df_test)

# Add custom features
def add_custom_features(df):
    df['contract_length'] = pd.cut(
        df['Insurance Duration'].fillna(99),
        bins=[-float('inf'), 1, 3, float('inf')],
        labels=[0, 1, 2]
    ).astype(int)
    return df

df_train = add_custom_features(df_train)
df_test = add_custom_features(df_test)

# Separate numerical and categorical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns.drop('Premium Amount')
categorical_cols = df_train.select_dtypes(include=['object']).columns

# Impute missing values
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

df_train[numerical_cols] = imputer_num.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = imputer_num.transform(df_test[numerical_cols])

df_train[categorical_cols] = imputer_cat.fit_transform(df_train[categorical_cols])
df_test[categorical_cols] = imputer_cat.transform(df_test[categorical_cols])

# Frequency encode categorical features
def frequency_encoding(train, test, columns):
    for col in columns:
        freq_map = train[col].value_counts().to_dict()
        train[col] = train[col].map(freq_map)
        test[col] = test[col].map(freq_map)
    return train, test

df_train, df_test = frequency_encoding(df_train, df_test, categorical_cols)

# Log-transform features
df_train['Annual Income'] = np.log1p(df_train['Annual Income'])
df_test['Annual Income'] = np.log1p(df_test['Annual Income'])

# Log-transform the target variable
y = np.log1p(df_train['Premium Amount'])
X = df_train.drop(columns=['Premium Amount'])


In [5]:
# Define Optuna objective function
def objective(trial):
    params = {
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', -1, 15),
        'num_leaves': trial.suggest_int('num_leaves', 31, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10),
        'device': 'gpu'  # Change to 'cpu' if GPU causes issues
    }

    model = lgb.LGBMRegressor(**params)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_log_error')
    rmsle = np.mean(np.sqrt(-cv_scores))
    return rmsle

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)

# Best parameters
best_params = study.best_params
print("Best Parameters:", best_params)


[I 2024-12-16 22:42:44,781] A new study created in memory with name: no-name-d617360f-3668-44c4-92ba-f77f10842afe


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1031
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 27 dense feature groups (25.63 MB) transferred to GPU in 0.027899 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 6.593848
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 27 dense feature groups (25.63 MB) transferred to GPU in 0.043402 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 6.594073
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1031
[LightGBM] [Info] Number of data points in the train set: 960000, number of used

[I 2024-12-16 22:43:55,725] Trial 0 finished with value: 0.15920257552804845 and parameters: {'n_estimators': 579, 'learning_rate': 0.065201181842362, 'max_depth': 4, 'num_leaves': 295, 'min_child_samples': 27, 'subsample': 0.7138252090325714, 'colsample_bytree': 0.9283248461518304, 'reg_alpha': 0.022386805215316607, 'reg_lambda': 0.06010828839566318}. Best is trial 0 with value: 0.15920257552804845.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1031
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 27 dense feature groups (25.63 MB) transferred to GPU in 0.026241 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 6.593848
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM

[I 2024-12-16 22:47:01,919] Trial 1 finished with value: 0.15914181279070372 and parameters: {'n_estimators': 1410, 'learning_rate': 0.06067502507670206, 'max_depth': 8, 'num_leaves': 70, 'min_child_samples': 26, 'subsample': 0.8607210279367381, 'colsample_bytree': 0.7897998437008988, 'reg_alpha': 0.0060563352933326545, 'reg_lambda': 0.005718586915585909}. Best is trial 1 with value: 0.15914181279070372.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1031
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 27 dense feature groups (25.63 MB) transferred to GPU in 0.027113 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 6.593848
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 27
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM

[I 2024-12-16 22:49:42,203] Trial 2 finished with value: 0.16036028441823338 and parameters: {'n_estimators': 634, 'learning_rate': 0.1593420797578512, 'max_depth': 9, 'num_leaves': 254, 'min_child_samples': 15, 'subsample': 0.6446974614249471, 'colsample_bytree': 0.8916462961358764, 'reg_alpha': 1.6684663707493057, 'reg_lambda': 0.01769003152775274}. Best is trial 1 with value: 0.15914181279070372.


Best Parameters: {'n_estimators': 1410, 'learning_rate': 0.06067502507670206, 'max_depth': 8, 'num_leaves': 70, 'min_child_samples': 26, 'subsample': 0.8607210279367381, 'colsample_bytree': 0.7897998437008988, 'reg_alpha': 0.0060563352933326545, 'reg_lambda': 0.005718586915585909}


In [6]:
# Train final LightGBM model with best parameters
final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X, y)

# Predict on test data
test_preds = np.expm1(final_model.predict(df_test))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1029
[LightGBM] [Info] Number of data points in the train set: 1200000, number of used features: 27
[LightGBM] [Info] Start training from score 6.593889


In [7]:
# Prepare submission file
submission = pd.DataFrame({'id': test_ids, 'Premium Amount': test_preds})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
