# Random Forest tuning

# Set up

In [2]:
import pandas as pd
import zipfile
from zipfile import ZipFile 
file_name = "playground-series-s4e7.zip"
with ZipFile(file_name, 'r') as zip: 
    # printing all the contents of the zip file 
    zip.printdir()
df_zip = zipfile.ZipFile(file_name)
train = pd.read_csv(df_zip.open('train.csv'))

File Name                                             Modified             Size
sample_submission.csv                          2024-06-24 13:46:18     99708270
test.csv                                       2024-06-24 13:46:24    433918183
train.csv                                      2024-06-24 13:46:48    662779095


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [4]:
# drop id
train = train.drop('id', axis = 1)

# Define category

In [5]:

# convert obj to cat
categorical = ["Region_Code", "Policy_Sales_Channel", "Gender", "Vehicle_Damage", "Vehicle_Age"]
train[["Region_Code", "Policy_Sales_Channel"]] = train[["Region_Code", "Policy_Sales_Channel"]].astype('int32')
train[categorical] = train[categorical].astype("category")

# Reduce memory

In [6]:
# convert numerics into int 32 

import numpy as np
# integer
d = dict.fromkeys(train.select_dtypes(np.int64).columns, np.int32)
train = train.astype(d)

# float
d2 = dict.fromkeys(train.select_dtypes(np.float64).columns, np.int32)
train = train.astype(d2)

# Split

In [7]:
from sklearn.model_selection import train_test_split
X = train.drop("Response", axis = 1)
y = train["Response"]


X_train, X_val, y_train, y_val = train_test_split(X,y,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify = y)

In [8]:
X_train.info()
X_train.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9203838 entries, 1129106 to 6241578
Data columns (total 10 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int32   
 2   Driving_License       int32   
 3   Region_Code           category
 4   Previously_Insured    int32   
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  category
 9   Vintage               int32   
dtypes: category(5), int32(5)
memory usage: 298.4 MB


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
1129106,Female,25,1,28,0,< 1 Year,Yes,30775,152,256
9554468,Male,48,1,28,0,1-2 Year,Yes,35693,124,11
5397130,Male,42,1,45,0,1-2 Year,Yes,27863,124,172
1915003,Female,23,1,46,1,< 1 Year,No,22345,152,130
2508839,Female,42,1,28,0,> 2 Years,Yes,34367,26,169


# Pipe

In [9]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# grab columns by type
num_col = ['Age','Annual_Premium','Vintage']

cat_col = X_train.select_dtypes(include = ['object', 'category']).columns.tolist()

# numerical transformer
num_pipe = Pipeline(steps=[
    ("standardize", StandardScaler()) # standardize
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_col)
    ],
    remainder = "passthrough"
)

# XGB model

In [11]:
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

clf = xgb.XGBClassifier(objective = 'binary:logistic', enable_categorical=True, max_cat_to_onehot=1,
                        tree_method="hist", early_stopping_rounds=2
)

clf.fit(X_train, y_train,eval_set=[(X_val, y_val)], verbose=True)



[0]	validation_0-logloss:0.34437
[1]	validation_0-logloss:0.31881
[2]	validation_0-logloss:0.30207
[3]	validation_0-logloss:0.29046
[4]	validation_0-logloss:0.28219
[5]	validation_0-logloss:0.27625
[6]	validation_0-logloss:0.27193
[7]	validation_0-logloss:0.26876
[8]	validation_0-logloss:0.26611
[9]	validation_0-logloss:0.26442
[10]	validation_0-logloss:0.26308
[11]	validation_0-logloss:0.26197
[12]	validation_0-logloss:0.26085
[13]	validation_0-logloss:0.26025
[14]	validation_0-logloss:0.25981
[15]	validation_0-logloss:0.25947
[16]	validation_0-logloss:0.25885
[17]	validation_0-logloss:0.25853
[18]	validation_0-logloss:0.25808
[19]	validation_0-logloss:0.25785
[20]	validation_0-logloss:0.25742
[21]	validation_0-logloss:0.25721
[22]	validation_0-logloss:0.25708
[23]	validation_0-logloss:0.25697
[24]	validation_0-logloss:0.25654
[25]	validation_0-logloss:0.25648
[26]	validation_0-logloss:0.25625
[27]	validation_0-logloss:0.25615
[28]	validation_0-logloss:0.25597
[29]	validation_0-loglos

# Predict

In [12]:
y_pred = clf.predict(X_val)
print(confusion_matrix(y_val, y_pred))


[[1997192   20756]
 [ 255048   27964]]


# Dmatrices

In [18]:
xgtrain = xgb.DMatrix(X_train,y_train, enable_categorical= True)
xgtest  = xgb.DMatrix(X_val,y_val, enable_categorical= True)

In [None]:
from sklearn.metrics import f1_score, recall_score, confusion_matrix,roc_auc_score

params_1 = {"objective": "binary:logistic"}

n = 1000

results = xgb.cv(params_1,
                 xgtrain,
                 num_boost_round = n,
                 nfold=5,
                 metrics = ["logloss","auc","error"],
                 early_stopping_rounds=20
                 )


## Hyperopt

# k fold

In [16]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


clf = xgb.XGBClassifier(objective = 'binary:logistic', enable_categorical=True, max_cat_to_onehot=1,
                        tree_method="hist", early_stopping_rounds=2
)

clf.fit(X_train, y_train,eval_set=[(X_val, y_val)], verbose=True)

kfold = KFold(n_splits=5)
results = cross_val_score(clf, X, y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/sklearn.py", line 1531, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/training.py", line 182, in train
    if cb_container.after_iteration(bst, i, dtrain, evals):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/callback.py", line 261, in after_iteration
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/callback.py", line 261, in <genexpr>
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/callback.py", line 446, in after_iteration
    raise ValueError(msg)
ValueError: Must have at least 1 validation dataset for early stopping.
