In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [107]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [108]:
train.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [109]:
train.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [110]:
def date(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    
    df['Year'] = df['Policy Start Date'].dt.year
    df['Month'] = df['Policy Start Date'].dt.month
    df['Day'] = df['Policy Start Date'].dt.day
    df['Quarter'] = df['Policy Start Date'].dt.quarter
    df['Day of Week'] = df['Policy Start Date'].dt.dayofweek
    
    df.drop('Policy Start Date', axis=1, inplace=True)
    
    return df

train = date(train)
test = date(test)

In [111]:
import numpy as np

def add_cyclic_features(df):
    # Add sine and cosine for Month
    # df['Year_sin'] = np.sin(2 * np.pi * (df['Year'] - df['Year'].min()) / (df['Year'].max() - df['Year'].min()))
    # df['Year_cos'] = np.cos(2 * np.pi * (df['Year'] - df['Year'].min()) / (df['Year'].max() - df['Year'].min()))
    
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)  # Assume maximum days in a month
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    
    # df['Quarter_sin'] = np.sin(2 * np.pi * df['Quarter'] / 4)
    # df['Quarter_cos'] = np.cos(2 * np.pi * df['Quarter'] / 4)
    
    # df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['Day of Week'] / 7)
    # df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['Day of Week'] / 7)
    
    # Drop the original numeric columns if they are no longer needed
    df.drop([ 'Month', 'Day'], axis=1, inplace=True)

    return df

# Apply to your datasets
train = add_cyclic_features(train)
test = add_cyclic_features(test)


In [112]:
train["Annual_Income_Health_Score_Ratio"] = train["Health Score"] / train["Annual Income"]
test["Annual_Income_Health_Score_Ratio"] = test["Health Score"] / test["Annual Income"]

train["Annual_Income_Age_Ratio"] = train["Annual Income"] / train["Age"]
test["Annual_Income_Age_Ratio"] = test["Annual Income"] / test["Age"]

train["Credit_Age"] = train["Credit Score"] / train["Age"]
test["Credit_Age"] = test["Credit Score"] / test["Age"]

train["Vehicle_Age_Insurance_Duration"] = train["Vehicle Age"] / train["Insurance Duration"]
test["Vehicle_Age_Insurance_Duration"] = test["Vehicle Age"] / test["Insurance Duration"]

average_income = train['Annual Income'].mean()
train['Is High Income'] = (train['Annual Income'] > average_income).astype(int)
test['Is High Income'] = (test['Annual Income'] > average_income).astype(int)

train['Property Location Type'] = train['Location'] + '_' + train['Property Type']
test['Property Location Type'] = test['Location'] + '_' + test['Property Type']

train['Income_to_Dependents_Ratio'] = train['Annual Income']/ (train['Number of Dependents'].fillna(0)+1)
test['Income_to_Dependents_Ratio'] = test['Annual Income']/ (test['Number of Dependents'].fillna(0)+1)

train['Income_per_Dependent'] = train['Annual Income'] / (train['Number of Dependents'] + 1)
test['Income_per_Dependent'] = test['Annual Income'] / (test['Number of Dependents'] + 1)

train.drop(columns=['Property Type', 'id','Number of Dependents','Annual Income'], inplace=True)
test.drop(columns=['Property Type', 'id','Number of Dependents','Annual Income'], inplace=True)


In [113]:
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type == 'float64':
            df[col] = df[col].astype('float32')
        elif col_type == 'int64':
            df[col] = df[col].astype('int32')
    return df

train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

In [114]:
# Define the columns to remove based on low feature importance



In [115]:
pip install xgbtune

Note: you may need to restart the kernel to use updated packages.


In [116]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgbtune import tune_xgb_model

# Target column
target_col = 'Premium Amount'

# Columns to drop after preprocessing
columns_to_drop = [
    'cat__Property Location Type_Rural_Apartment',
       'cat__Property Location Type_Rural_Condo',
       'cat__Property Location Type_Rural_House',
       'cat__Property Location Type_Suburban_Apartment',
       'cat__Property Location Type_Suburban_Condo',
       'cat__Property Location Type_Suburban_House',
       'cat__Property Location Type_Urban_Apartment',
       'cat__Property Location Type_Urban_Condo',
       'cat__Property Location Type_Urban_House',
       'cat__Smoking Status_No',
       'cat__Smoking Status_Yes','cat__Gender_Female', 'cat__Gender_Male','num__Is High Income'
]

# Select numerical and categorical columns
numerical_cols = train.select_dtypes(include=['float32', 'int32']).columns.tolist()
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

# Define preprocessing pipeline
preprocessing = ColumnTransformer([
    ('num', make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), numerical_cols),
    ('cat', make_pipeline(SimpleImputer(strategy='constant', fill_value='unknown'),
                          OneHotEncoder(handle_unknown='ignore')), categorical_cols)
], remainder='drop')

# Prepare training and testing data
X_train = train.drop(columns=[target_col]).copy()
X_test = test.copy()

# Log-transform the target variable
y_train = np.log1p(train[target_col])

# Fit and transform the training data
X_train_preprocessed = preprocessing.fit_transform(X_train)
X_test_preprocessed = preprocessing.transform(X_test)

# Get the feature names generated by the preprocessing pipeline
feature_names = preprocessing.get_feature_names_out()

# Convert preprocessed arrays back to DataFrame for easier column manipulation
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=feature_names)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=feature_names)

# Drop columns based on `columns_to_drop`
X_train_final = X_train_preprocessed_df.drop(columns=columns_to_drop, errors='ignore')
X_test_final = X_test_preprocessed_df.drop(columns=columns_to_drop, errors='ignore')

print(X_train_final.columns)
# Convert the final DataFrame back to numpy arrays for model training
X_train_final = X_train_final.values
X_test_final = X_test_final.values


# Define initial parameters for XGBoost
params = {'eval_metric': 'rmsle', 'tree_method': 'hist', 'device': 'cuda'}

# Tune and train the XGBoost model
params, round_count = tune_xgb_model(params, X_train_final, y_train)


Index(['num__Age', 'num__Health Score', 'num__Previous Claims',
       'num__Vehicle Age', 'num__Credit Score', 'num__Insurance Duration',
       'num__Year', 'num__Quarter', 'num__Day of Week', 'num__Month_sin',
       'num__Month_cos', 'num__Day_sin', 'num__Day_cos',
       'num__Annual_Income_Health_Score_Ratio', 'num__Annual_Income_Age_Ratio',
       'num__Credit_Age', 'num__Vehicle_Age_Insurance_Duration',
       'num__Income_to_Dependents_Ratio', 'num__Income_per_Dependent',
       'cat__Marital Status_Divorced', 'cat__Marital Status_Married',
       'cat__Marital Status_Single', 'cat__Marital Status_unknown',
       'cat__Education Level_Bachelor's', 'cat__Education Level_High School',
       'cat__Education Level_Master's', 'cat__Education Level_PhD',
       'cat__Occupation_Employed', 'cat__Occupation_Self-Employed',
       'cat__Occupation_Unemployed', 'cat__Occupation_unknown',
       'cat__Location_Rural', 'cat__Location_Suburban', 'cat__Location_Urban',
       'cat__Policy

In [117]:
dtrain = xgb.DMatrix(X_train_final, label=y_train)
final_model = xgb.train(params, dtrain, num_boost_round=round_count)

dtest = xgb.DMatrix(X_test_final)
y_pred = final_model.predict(dtest)
y_pred_final = np.expm1(y_pred)

In [118]:
sub = pd.read_csv("sample_submission.csv")
output = pd.DataFrame({"id":sub.id, "Premium Amount":y_pred_final})
output.to_csv('submission_17.csv', index=False)


output.head()

Unnamed: 0,id,Premium Amount
0,1200000,961.22876
1,1200001,749.13385
2,1200002,784.875305
3,1200003,840.701111
4,1200004,764.442261
