In this notebook, we use clean_data.csv to create a model to predict HALC
- RUN ORDER = 3/5

In [2]:
import warnings
warnings.filterwarnings('ignore')

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the cleaned dataset. This notebook is step 3 of 5 in the run order, so
# clean_data.csv is presumably written by an earlier step — confirm it exists first.
df = pd.read_csv("clean_data.csv")

- We will create a random forest (RF) regressor to predict HALC using all columns except made_claim

- Again, we are keeping only the rows where HALC is not 0
  - It is heavily unbalanced
  - We want the predicted HALC to be a continuous number because the 0's would have already been predicted

In [6]:
# Keep only the rows whose HALC is non-zero; the regressor below is trained on
# these rows alone.
df = df.loc[df['HALC'].ne(0)]

In [7]:
# made_claim is excluded from the feature set, so remove it from the frame.
df = df.drop('made_claim', axis=1)

In [8]:
# Remove outliers so the remaining rows better represent the data: keep rows whose
# LC lies within 1.5 * IQR of the first and third quartiles (bounds inclusive).
# NOTE(review): the fences are computed on LC although this notebook's target is
# HALC, and the filter runs before the train/test split — confirm both are intended.
Q1, Q3 = df['LC'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

df = df[df['LC'].between(lower_bound, upper_bound)]

In [9]:
# temp = df.pop('LC')
# df['LC'] = temp

In [10]:
# List the remaining columns: made_claim has been dropped, LC and HALC are still present.
df.columns

Index(['is_channel_broker', 'total_pol_year', 'total_pol_held', 'max_policies',
       'max_products', 'canceled_policies', 'is_halfyearly', 'net_premium',
       'risk_type', 'is_urban', 'is_multidriver', 'regis_year', 'horsepower',
       'cylinder_cap', 'market_value', 'door_count', 'is_petrol',
       'vehicle_weight', 'pol_start_day', 'pol_start_month', 'pol_start_year',
       'last_renewal_day', 'last_renewal_month', 'last_renewal_year',
       'next_renewal_day', 'next_renewal_month', 'next_renewal_year',
       'license_issue_date_day', 'license_issue_date_month',
       'license_issue_date_year', 'is_youngin', 'is_adult', 'is_middleaged',
       'is_old', 'LC', 'HALC'],
      dtype='object')

In [11]:
# Target is HALC; every other remaining column (LC included) is used as a feature.
y = df['HALC']
X = df.drop('HALC', axis=1)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [13]:
# Fit a 100-tree random forest regressor on the training split; the seed makes the
# fit repeatable. The trailing .fit(...) expression also displays the fitted estimator.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_HALC = RandomForestRegressor(**forest_params)
rf_HALC.fit(X_train, y_train)

In [14]:
# Predict HALC for the held-out rows; these predictions feed the RMSE computed below.
y_pred = rf_HALC.predict(X_test)

- RMSE: 370.00 (369.9974 before rounding)

In [16]:
# Root-mean-squared error of the HALC predictions on the held-out split.
# The square root is taken explicitly because the `squared=False` shortcut is
# deprecated in scikit-learn 1.4 and removed in 1.6. Warnings are silenced in this
# notebook, so the deprecation would go unnoticed until the call raises a TypeError.
np.sqrt(mean_squared_error(y_test, y_pred))

369.9973676670591

In [17]:
# Persist the fitted HALC regressor to predict_HALC.pkl in the working directory.
# joblib is imported with the other dependencies in the notebook's import cell.
# joblib.dump returns the list of written file names, which the cell displays.
joblib.dump(rf_HALC, 'predict_HALC.pkl')

['predict_HALC.pkl']

In [18]:
# Show the feature columns of the evaluation split: HALC is excluded, LC is included.
X_test.columns

Index(['is_channel_broker', 'total_pol_year', 'total_pol_held', 'max_policies',
       'max_products', 'canceled_policies', 'is_halfyearly', 'net_premium',
       'risk_type', 'is_urban', 'is_multidriver', 'regis_year', 'horsepower',
       'cylinder_cap', 'market_value', 'door_count', 'is_petrol',
       'vehicle_weight', 'pol_start_day', 'pol_start_month', 'pol_start_year',
       'last_renewal_day', 'last_renewal_month', 'last_renewal_year',
       'next_renewal_day', 'next_renewal_month', 'next_renewal_year',
       'license_issue_date_day', 'license_issue_date_month',
       'license_issue_date_year', 'is_youngin', 'is_adult', 'is_middleaged',
       'is_old', 'LC'],
      dtype='object')

In [19]:
# Inspect the target Series (pandas abbreviates the repr of a long Series).
y

18         98.70500
24       1256.34100
37         44.29260
40        902.17800
42        485.74000
            ...    
37352     313.47000
37377     101.85000
37392     113.97725
37417     432.11710
37425     698.92725
Name: HALC, Length: 3811, dtype: float64

In [20]:
# Inspect the filtered frame; the rich repr shows only the first and last rows.
df

Unnamed: 0,is_channel_broker,total_pol_year,total_pol_held,max_policies,max_products,canceled_policies,is_halfyearly,net_premium,risk_type,is_urban,...,next_renewal_year,license_issue_date_day,license_issue_date_month,license_issue_date_year,is_youngin,is_adult,is_middleaged,is_old,LC,HALC
18,0,3,2,2,1,0,1,341.78,3,0,...,2019,27,11,2008,0,1,0,0,25.975,98.70500
24,1,4,2,2,2,0,0,300.01,3,0,...,2019,25,1,2000,0,0,1,0,752.300,1256.34100
37,0,4,2,2,1,0,0,252.45,3,0,...,2019,15,10,1987,0,0,1,0,33.555,44.29260
40,1,4,6,6,1,0,1,331.29,3,0,...,2019,17,6,2002,0,1,0,0,334.140,902.17800
42,1,2,2,2,1,2,1,309.16,3,0,...,2018,16,3,2010,0,1,0,0,242.870,485.74000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37352,0,3,4,4,1,1,0,283.22,3,1,...,2016,13,11,1978,0,0,1,0,208.980,313.47000
37377,0,1,1,1,1,0,0,209.08,3,0,...,2019,6,4,1976,0,0,0,1,48.500,101.85000
37392,1,3,2,2,1,0,0,119.52,3,0,...,2017,10,9,1984,0,0,1,0,57.275,113.97725
37417,1,6,1,2,1,0,1,271.43,3,1,...,2019,30,9,1994,0,0,1,0,939.385,432.11710


In [21]:
# Mean LC of the filtered data.
# NOTE(review): the RMSE above is measured on HALC, so if this is meant as a scale
# reference for that error, the HALC mean may be the intended comparison — confirm.
df['LC'].mean()

309.7022916749167