# Set up

In [1]:
import pandas as pd
import zipfile 
from zipfile import ZipFile 


file_name = "playground-series-s4e7.zip"
with ZipFile(file_name, 'r') as zip: 
    # printing all the contents of the zip file 
    zip.printdir()
df_zip = zipfile.ZipFile(file_name)
train = pd.read_csv(df_zip.open('train.csv'))

# import sample
import random

n = len(train) # Calculate number of rows in file
s = n//10  # sample size of 10%
skip = sorted(random.sample(range(1, n+1), n-s))   
train_sample = pd.read_csv(df_zip.open('train.csv'), skiprows = skip)
train_sample


File Name                                             Modified             Size
sample_submission.csv                          2024-06-24 13:46:18     99708270
test.csv                                       2024-06-24 13:46:24    433918183
train.csv                                      2024-06-24 13:46:48    662779095


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,5,Female,31,1,47.0,1,< 1 Year,No,28150.0,152.0,197,0
1,8,Female,26,1,28.0,1,< 1 Year,No,31639.0,152.0,36,0
2,11,Female,25,1,10.0,0,< 1 Year,Yes,2630.0,152.0,30,0
3,24,Female,42,1,11.0,0,1-2 Year,Yes,21473.0,26.0,152,0
4,26,Female,43,1,35.0,1,1-2 Year,No,38089.0,152.0,284,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1150474,11504754,Male,58,1,28.0,0,> 2 Years,Yes,35992.0,26.0,197,0
1150475,11504756,Male,33,1,47.0,1,< 1 Year,No,36660.0,152.0,28,0
1150476,11504757,Female,57,1,28.0,0,> 2 Years,Yes,69344.0,122.0,163,0
1150477,11504781,Male,26,1,8.0,1,< 1 Year,No,45198.0,152.0,242,0


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


 # Reduce Memory

In [3]:
# drop id
train = train.drop('id', axis = 1)

# convert obj to cat
categorical = ["Region_Code", "Policy_Sales_Channel", "Gender", "Vehicle_Damage", "Vehicle_Age"]
train[["Region_Code", "Policy_Sales_Channel"]] = train[["Region_Code", "Policy_Sales_Channel"]].astype('int32')
train[categorical] = train[categorical].astype("category")

# train sample
train[["Region_Code", "Policy_Sales_Channel"]] = train[["Region_Code", "Policy_Sales_Channel"]].astype('int32')
train[categorical] = train[categorical].astype("category")

# convert numerics into int 32 

import numpy as np
# integer
d = dict.fromkeys(train.select_dtypes(np.int64).columns, np.int32)
train = train.astype(d)

# float
d2 = dict.fromkeys(train.select_dtypes(np.float64).columns, np.int32)
train = train.astype(d2)

## Subsample

In [4]:
train_sample = train_sample.drop('id', axis = 1)

# convert obj to cat
train_sample[["Region_Code", "Policy_Sales_Channel"]] = train_sample[["Region_Code", "Policy_Sales_Channel"]].astype('int32')
train_sample[categorical] = train_sample[categorical].astype("category")

# train_sample sample
train_sample[["Region_Code", "Policy_Sales_Channel"]] = train_sample[["Region_Code", "Policy_Sales_Channel"]].astype('int32')
train_sample[categorical] = train_sample[categorical].astype("category")

# convert numerics into int 32 

import numpy as np
# integer
d = dict.fromkeys(train_sample.select_dtypes(np.int64).columns, np.int32)
train_sample = train_sample.astype(d)

# float
d2 = dict.fromkeys(train_sample.select_dtypes(np.float64).columns, np.int32)
train_sample = train_sample.astype(d2)


# Split

In [5]:
from sklearn.model_selection import train_test_split
X = train.drop("Response", axis = 1)
y = train["Response"]


X_train, X_val, y_train, y_val = train_test_split(X,y,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify = y)

## Sample

In [6]:
Xs = train_sample.drop("Response", axis = 1)
ys = train_sample["Response"]


Xs_train, Xs_val, ys_train, ys_val = train_test_split(Xs,ys,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify = ys)

## Categorical pipeline

In [7]:
unique_values = X_train.select_dtypes(include = ['object', 'category']).nunique()
print(unique_values)

Gender                    2
Region_Code              53
Vehicle_Age               3
Vehicle_Damage            2
Policy_Sales_Channel    151
dtype: int64


In [8]:
test = X_train.select_dtypes(include = ['object', 'category']).columns.values.tolist()
test.remove("Vehicle_Age")
test

['Gender', 'Region_Code', 'Vehicle_Damage', 'Policy_Sales_Channel']

In [28]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder

binary_cat = unique_values.index[unique_values == 2].tolist()
ordinal_cat = ["Vehicle_Age"]
target_cat = unique_values.index[unique_values > 20].tolist()
categorical = X_train.select_dtypes(include = ['object', 'category']).columns.values.tolist()
categorical.remove("Vehicle_Age")

vehicle_age_categories = ['< 1 Year', '1-2 Year', '> 2 Years']

# ordinal encoder
ordinal = Pipeline(steps=[
    ("ordinal", OrdinalEncoder(categories=[vehicle_age_categories],
                               handle_unknown = 'use_encoded_value',
                               unknown_value = np.nan)),
])
# target encoder
target = Pipeline(steps=[
    ("target", TargetEncoder(handle_missing = 'return_nan'
    )),
])

# ohe encoder
ohe = transformer = Pipeline(steps=[
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# ohe encoder
dummy = Pipeline(steps=[
    ("dummy", OneHotEncoder(drop = 'first',
                            handle_unknown="ignore")),
])

# numerical transformer
num_col = ['Age','Annual_Premium','Vintage']

num_pipe = Pipeline(steps=[
    ("standardize", StandardScaler()) # standardize
])

# Preprocessing Pipelines

In [29]:

default_pipe = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_col),
        ("ohe",ohe, categorical),
        ("ord", ordinal, ordinal_cat),
        ("dummy", dummy, binary_cat)
    ],
    remainder = "passthrough"
)

target_pipe = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_col),
        ("target",target, target_cat),
        ("ord", ordinal, ordinal_cat),
        ("dummy", dummy, binary_cat)
    ],
    remainder = "passthrough"
)

# RF

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth = 5, n_estimators = 300,
                            min_samples_split = 2, verbose = 1) 

pipeline1 = Pipeline(steps=[
          ('column_tran',default_pipe),
          ('model',rf)
     ])

pipeline2 = Pipeline(steps=[
          ('column_tran',target_pipe),
          ('model',rf)
     ])


In [50]:
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score

def train_model(clf,preprocess, x_train,y_train,x_test,y_test):
     
     pipeline = Pipeline(steps=[
          ('preprocess',preprocess),
          ('model',clf)
     ])

     pipeline.fit(x_train,y_train)
     y_pred = pipeline.predict(x_test)
     acc = accuracy_score(y_test,y_pred)
     ps = precision_score(y_test,y_pred)
     rec = recall_score(y_test,y_pred)
     f1 = f1_score(y_test,y_pred)
     return acc , ps ,rec, f1

In [36]:
train_model(rf, default_pipe, Xs_train, ys_train, Xs_val, ys_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   33.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.8774424587998053, 0.0, 0.0, 0.0)

In [51]:
pipeline1.fit(Xs_train, ys_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.9min finished
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [52]:
pred = pipeline1.predict(Xs_val)
accuracy_score(ys_val, pred)
precision_score(ys_val,pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.7s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.0

<bound method IndexOpsMixin.nunique of 290767     0
375630     0
666997     0
18516      1
516170     0
          ..
920382     0
276283     0
858028     0
1083645    0
781224     0
Name: Response, Length: 230096, dtype: int32>

In [None]:
Train_model(rf, target_pipe, Xs_train, ys_train, Xs_val, ys_val)