# Catboost Classifier

## Importing the necessary libraries

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib

## Loading the Processed Dataset

In [16]:
data = pd.read_csv("preprocessed.csv")

In [17]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Date_month,Date_day
0,2008-12-01,30,13.4,22.9,0.6,2.4,8.3,4.0,44.0,5.0,...,1007.7,1007.1,8.0,7.0,16.9,21.8,0,0,12,1
1,2008-12-02,30,7.4,25.1,0.0,3.6,10.0,2.0,44.0,0.0,...,1010.6,1007.8,7.0,7.0,17.2,24.3,0,0,12,2
2,2008-12-03,30,12.9,25.7,0.0,2.6,4.4,5.0,46.0,5.0,...,1007.6,1008.7,7.0,2.0,21.0,23.2,0,0,12,3
3,2008-12-04,30,9.2,28.0,0.0,14.6,8.9,11.0,24.0,13.0,...,1017.6,1012.8,7.0,7.0,18.1,26.5,0,0,12,4
4,2008-12-05,30,17.5,32.3,1.0,5.4,3.0,4.0,41.0,12.0,...,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0,12,5


In [18]:
data.shape

(145460, 25)

In [19]:
df = data.sample(n = 12000)

In [20]:
df.shape

(12000, 25)

## Dividing the dataset into Independent and Dependent features

In [7]:
X = df.drop(["RainTomorrow", "Date"], axis=1)
y = df["RainTomorrow"]

### Train test split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, stratify = y, random_state = 0)

In [22]:
y_train

138485    0
101583    0
79033     0
135684    1
62943     0
         ..
110895    1
108693    0
59033     0
2199      0
37479     0
Name: RainTomorrow, Length: 9600, dtype: int64

## Handling imbalanced dataset

### Catboost Classifier

In [23]:
cat = CatBoostClassifier(iterations=25, eval_metric = "AUC")
cat.fit(X_train, y_train)

Learning rate set to 0.5
0:	total: 4.93ms	remaining: 118ms
1:	total: 13.1ms	remaining: 151ms
2:	total: 17.3ms	remaining: 127ms
3:	total: 21.5ms	remaining: 113ms
4:	total: 28.9ms	remaining: 115ms
5:	total: 33.1ms	remaining: 105ms
6:	total: 37.6ms	remaining: 96.6ms
7:	total: 45.4ms	remaining: 96.4ms
8:	total: 51.1ms	remaining: 90.8ms
9:	total: 56ms	remaining: 84ms
10:	total: 62.8ms	remaining: 80ms
11:	total: 69.4ms	remaining: 75.1ms
12:	total: 74.9ms	remaining: 69.2ms
13:	total: 80.9ms	remaining: 63.6ms
14:	total: 85.3ms	remaining: 56.9ms
15:	total: 92.5ms	remaining: 52ms
16:	total: 96.9ms	remaining: 45.6ms
17:	total: 102ms	remaining: 39.5ms
18:	total: 107ms	remaining: 33.8ms
19:	total: 112ms	remaining: 28ms
20:	total: 116ms	remaining: 22.2ms
21:	total: 121ms	remaining: 16.5ms
22:	total: 127ms	remaining: 11ms
23:	total: 131ms	remaining: 5.46ms
24:	total: 136ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x29fce0b2d50>

In [24]:
y_pred = cat.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1805   87]
 [ 287  221]]
0.8441666666666666
              precision    recall  f1-score   support

           0       0.86      0.95      0.91      1892
           1       0.72      0.44      0.54       508

    accuracy                           0.84      2400
   macro avg       0.79      0.69      0.72      2400
weighted avg       0.83      0.84      0.83      2400



In [25]:
metrics.roc_auc_score(y_test, y_pred, average=None)

0.6945281416989896

## Hyperparameter Optimization

In [26]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Various learning rate parameters
learning_rate = [0.05,0.1, 0.2,0.3,0.5,0.6]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
#Subssample parameter values
subsample=[0.7,0.6,0.8]
# Minimum child samples parameters
min_child_samples=[3,4,5,6,7]

In [27]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_samples': min_child_samples}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5, 0.6], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_samples': [3, 4, 5, 6, 7]}


In [28]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
cat = CatBoostClassifier()

In [29]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
cat_random = RandomizedSearchCV(estimator = cat, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=100, n_jobs = -1)

In [30]:
cat_random.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


9 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\konda\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\konda\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\konda\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2395, in _fit
    

0:	learn: 0.6602069	total: 7.86ms	remaining: 2.35s
1:	learn: 0.6275246	total: 14.6ms	remaining: 2.17s
2:	learn: 0.6008960	total: 22.7ms	remaining: 2.24s
3:	learn: 0.5748685	total: 32.1ms	remaining: 2.37s
4:	learn: 0.5530125	total: 48.4ms	remaining: 2.85s
5:	learn: 0.5344464	total: 55.6ms	remaining: 2.72s
6:	learn: 0.5183176	total: 66.4ms	remaining: 2.78s
7:	learn: 0.5041449	total: 76.1ms	remaining: 2.78s
8:	learn: 0.4891921	total: 83.3ms	remaining: 2.69s
9:	learn: 0.4775771	total: 92.8ms	remaining: 2.69s
10:	learn: 0.4661927	total: 101ms	remaining: 2.64s
11:	learn: 0.4564323	total: 110ms	remaining: 2.63s
12:	learn: 0.4476695	total: 116ms	remaining: 2.55s
13:	learn: 0.4404187	total: 124ms	remaining: 2.54s
14:	learn: 0.4343335	total: 131ms	remaining: 2.49s
15:	learn: 0.4279110	total: 139ms	remaining: 2.47s
16:	learn: 0.4222231	total: 145ms	remaining: 2.41s
17:	learn: 0.4169686	total: 154ms	remaining: 2.41s
18:	learn: 0.4121254	total: 163ms	remaining: 2.41s
19:	learn: 0.4071414	total: 172

In [31]:
cat_random.best_params_

{'subsample': 0.6,
 'n_estimators': 300,
 'min_child_samples': 5,
 'max_depth': 5,
 'learning_rate': 0.05}

In [32]:
best_random_grid=cat_random.best_estimator_

In [33]:
from sklearn.metrics import accuracy_score
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy score {}'.format(accuracy_score(y_test,y_pred)))
print('Classification report {}'.format(classification_report(y_test,y_pred)))

[[1821   71]
 [ 287  221]]
Accuracy score 0.8508333333333333
Classification report               precision    recall  f1-score   support

           0       0.86      0.96      0.91      1892
           1       0.76      0.44      0.55       508

    accuracy                           0.85      2400
   macro avg       0.81      0.70      0.73      2400
weighted avg       0.84      0.85      0.83      2400

