<a href="https://www.kaggle.com/code/sabra15/insurance-premium-estimation-eda-lgbm?scriptVersionId=214875332" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e12/train.csv',
        '/kaggle/input/playground-series-s4e12/test.csv',
    )
)

In [None]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Count the missing values in each column of the training data.
train.isnull().sum()

In [None]:
# Number of distinct Health Score values after dropping missing entries and
# truncating to integers, for the training set and then the test set.
# Both counts are printed explicitly: as bare expressions only the last one in
# a notebook cell is displayed, so the training-set count was silently lost.
print(train['Health Score'].dropna().astype(int).nunique())
print(test['Health Score'].dropna().astype(int).nunique())

In [None]:
# Raw target column, captured before it is replaced by its log1p transform below.
target = train['Premium Amount']

# Numeric feature columns: every non-object column except the row id and the target.
numeric_columns = train.select_dtypes(exclude=['object']).columns.tolist()
numeric_columns.remove('id')
numeric_columns.remove('Premium Amount')

# Fit the scaler on the training data only and reuse those statistics for the
# test set. Re-fitting on the test frame would standardise each frame with its
# own mean/variance, so the same raw value would map to different scaled values
# in train and test and the model's learned thresholds would not line up at
# prediction time.
scaler = StandardScaler()
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])

# Model log1p(premium); predictions are mapped back with expm1 downstream.
train['Premium Amount'] = np.log1p(train['Premium Amount'])

In [None]:
# Replace missing values. Numeric gaps get the training-set mean of the column
# (computed once, so train and test are filled with the same value); missing
# answers in the listed categorical columns get an explicit 'Unknown' level.
mean_imputed_columns = [
    'Age',
    'Annual Income',
    'Number of Dependents',
    'Health Score',
    'Previous Claims',
    'Vehicle Age',
    'Credit Score',
    'Insurance Duration',
]
fill_values = {col: train[col].mean() for col in mean_imputed_columns}
fill_values.update({
    'Marital Status': 'Unknown',
    'Occupation': 'Unknown',
    'Customer Feedback': 'Unknown',
})
for df in [train, test]:
    df.fillna(fill_values, inplace=True)

# Split Policy Start Date into year, month and day, plus a cyclical feature.
for df in [train, test]:
    policy_start_date = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = policy_start_date.dt.year
    df['Month'] = policy_start_date.dt.month
    df['Day'] = policy_start_date.dt.day
    # Position within the annual cycle. The previous sin(2*pi*Year) took the
    # sine of an integer multiple of 2*pi, which is zero up to floating-point
    # noise and therefore carried no seasonal information.
    df['Year_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df.drop('Policy Start Date', axis=1, inplace=True)

# Frequency-encode the remaining text columns: each category is replaced by the
# number of rows (train + test combined) that carry it. The date column was
# dropped above, so it is not picked up here.
# The earlier category cast and pd.get_dummies call were removed: the dummies
# result was bound only to the loop variable and never used, and the text
# columns are dropped right after encoding.
categorical_columns = train.select_dtypes(include="object").columns

combined = pd.concat([train, test], axis=0, ignore_index=True)

for col in categorical_columns:
    # One lookup table per column, shared by both frames.
    freq_encoding = combined[col].value_counts().to_dict()
    for df in [train, test]:
        df[f"{col}_freq"] = df[col].map(freq_encoding).astype('float')
        df.drop(col, axis=1, inplace=True)



In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of y_pred against y_true.

    Predictions below zero are raised to zero before scoring.
    """
    non_negative_pred = np.maximum(y_pred, 0)
    msle = mean_squared_log_error(y_true, non_negative_pred)
    return np.sqrt(msle)

In [None]:
# Feature matrix (all columns except the row id and the target) and the
# target column the models are fitted against.
X = train.drop(['id', 'Premium Amount'], axis=1)
y_log = train['Premium Amount']

# Hyper-parameters passed unchanged to every LGBMRegressor built in train_model.
# NOTE(review): no 'bagging_freq' is set; the LightGBM parameter docs state that
# bagging is only performed when bagging_freq is non-zero, so 'bagging_fraction'
# may currently have no effect -- confirm whether bagging was intended.
lgb_params = {
    # Tree shape
    'num_leaves': 71,
    # Boosting schedule
    'learning_rate': 0.05412467152424433,
    'n_estimators': 595,
    # Regularisation / sampling
    'max_depth': 12,
    'min_data_in_leaf': 97,
    'bagging_fraction': 0.5200288825838669,
    'feature_fraction': 0.9881738491942492,
    # Runtime
    'n_jobs': -1,
    'verbose': -1,
}

def train_model():
    """Fit one LGBMRegressor per fold of a shuffled 10-fold split of X / y_log.

    Reads the module-level X, y_log and lgb_params. For each fold the model is
    fitted on the training rows, its predictions for the held-out rows are
    floored at zero and stored, and the fold's RMSLE (computed after mapping
    both target and prediction through expm1) is printed.

    Returns:
        A tuple (models, oof): the list of fitted models in fold order and an
        array, aligned with X's rows, of the out-of-fold predictions.
    """
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fitted_models = []
    oof_pred = np.zeros(len(X))

    for fold_number, (fit_rows, holdout_rows) in enumerate(splitter.split(X), start=1):
        print(f"Fold {fold_number}")

        regressor = LGBMRegressor(**lgb_params)
        regressor.fit(X.iloc[fit_rows], y_log.iloc[fit_rows])

        # Floor the held-out predictions at zero before storing and scoring them.
        holdout_pred = np.maximum(0, regressor.predict(X.iloc[holdout_rows]))
        oof_pred[holdout_rows] = holdout_pred

        holdout_target = y_log.iloc[holdout_rows]
        fold_score = rmsle(np.expm1(holdout_target), np.expm1(oof_pred[holdout_rows]))
        print(f"Fold {fold_number} RMSLE: {fold_score}")

        fitted_models.append(regressor)

    return fitted_models, oof_pred

In [None]:
# Run the cross-validated training; keep the per-fold models and the
# out-of-fold predictions it returns.
models,oof = train_model()

In [None]:
# Overall out-of-fold RMSLE: both the stored target column and the
# out-of-fold predictions are passed through expm1 before scoring.
actual_premium = np.expm1(train['Premium Amount'])
oof_premium = np.expm1(oof)
print(rmsle(actual_premium, oof_premium))
# 1.0457401258689165

In [None]:
# Test features: every column of the test frame except the row id.
X_test = test.drop(columns=['id'])

# Average the fold models' predictions: each model's output is mapped through
# expm1, floored at zero, and contributes an equal share of the final value.
ensemble_size = len(models)
test_predictions = np.zeros(len(test))
for fitted in models:
    fold_premium = np.expm1(fitted.predict(X_test))
    test_predictions += np.maximum(0, fold_premium) / ensemble_size

# Two-column submission (id, Premium Amount) written without the index.
submission = test[['id']].assign(**{'Premium Amount': test_predictions})
submission.to_csv('submission.csv', index=False)
submission.head()