# Preparation

In [30]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [31]:
# Folder holding the preprocessed CSV splits. Defaults to the original Colab/Drive
# location; set the ML_CCC_DIR environment variable to run the notebook elsewhere.
DIR_PATH = os.environ.get(
    "ML_CCC_DIR", "/content/drive/MyDrive/Colab Notebooks/ML CCC"
)

# Seed passed to every estimator and split in this notebook.
RANDOM_STATE = 42

In [32]:
def f2_score(y_test, y_pred):
    """Return the F-beta score with ``beta=2`` for the given labels and predictions.

    Thin wrapper around ``fbeta_score``: both arguments are forwarded positionally,
    so ``y_test`` holds the true labels and ``y_pred`` the predicted labels.
    The first parameter keeps its original name for compatibility, although the
    notebook also passes train and validation labels through it.
    """
    return fbeta_score(y_test, y_pred, beta=2)


# Scorer object built from the same metric (beta=2), for scikit-learn search tools.
f2_scorer = make_scorer(fbeta_score, beta=2)

In [33]:
# Read the four preprocessed splits from DIR_PATH, in the same order as before.
X_train, X_test, y_train, y_test = (
    pd.read_csv(os.path.join(DIR_PATH, csv_name))
    for csv_name in (
        'X_train_prep.csv',
        'X_test_prep.csv',
        'y_train_prep.csv',
        'y_test_prep.csv',
    )
)

In [34]:
# Flatten the label tables into 1-D arrays for scikit-learn.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely (the previous .to_numpy() call failed once the names held ndarrays).
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [35]:
def plot_search_results(searcher):
    """Plot every candidate's mean test score across the search iterations.

    Adapted from the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_iterations.html

    Parameters
    ----------
    searcher
        Fitted search object exposing ``cv_results_``, ``n_resources_``,
        ``n_candidates_`` and ``n_iterations_``.
    """
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table["params_str"] = cv_table.params.apply(str)
    # One row per (candidate, iteration): pivot needs unique index/column pairs.
    cv_table = cv_table.drop_duplicates(subset=("params_str", "iter"))
    score_by_iter = cv_table.pivot(
        index="iter", columns="params_str", values="mean_test_score"
    )
    ax = score_by_iter.plot(legend=False, alpha=0.6)

    # One tick per iteration, annotated with its resource budget and candidate count.
    n_iter = searcher.n_iterations_
    tick_labels = []
    for i in range(n_iter):
        tick_labels.append(
            f"iter={i}\nn_samples={searcher.n_resources_[i]}\nn_candidates={searcher.n_candidates_[i]}"
        )

    ax.set_xticks(range(n_iter))
    ax.set_xticklabels(tick_labels, rotation=45, multialignment="left")
    ax.set_title("Scores of candidates over iterations")
    ax.set_ylabel("mean test score", fontsize=15)
    ax.set_xlabel("iterations", fontsize=15)
    plt.tight_layout()
    plt.show()


# Hand-tuning

## Initialization

In [36]:
# Carve a validation set out of the held-out data: test_size=0.6 keeps 60% as the
# final test set and hands the remaining 40% to validation.
# NOTE: X_test / y_test are rebound here, so re-running this cell splits the
# already reduced test set again.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.6,
    random_state=RANDOM_STATE,
)

## Phase 1
Baseline: `learning_rate=0.1` (the default); the following cells try smaller values.

In [37]:
# Phase 1 baseline: all hyperparameters at their defaults, only the seed is fixed.
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[386 130]
 [ 50  16]]
0.822957412814721
0.19512195121951217


In [38]:
# Phase 1: learning_rate=0.01.
gb = GradientBoostingClassifier(learning_rate=0.01, random_state=RANDOM_STATE)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[243 273]
 [ 24  42]]
0.7825770759201061
0.36269430051813467


In [39]:
# Phase 1: learning_rate=0.001.
gb = GradientBoostingClassifier(learning_rate=0.001, random_state=RANDOM_STATE)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7920378619153674
0.3869969040247678


In [40]:
# Phase 1: learning_rate=0.0001.
gb = GradientBoostingClassifier(learning_rate=0.0001, random_state=RANDOM_STATE)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[247 269]
 [ 22  44]]
0.7211601672407898
0.38128249566724437


In [41]:
# Phase 1: learning_rate=0.00055, between the two previous candidates.
gb = GradientBoostingClassifier(learning_rate=0.00055, random_state=RANDOM_STATE)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7920378619153674
0.3869969040247678


## Phase 2
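Carried over from Phase 1 (the second block shows the validation confusion matrix, then train F2 and validation F2):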
```
learning_rate=0.001
(n_estimators=100 by default)
```
```
[[184 332]
 [ 16  50]]
0.7920378619153674
0.3869969040247678
```

In [42]:
# Phase 2: n_estimators=150 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=150, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[202 314]
 [ 18  48]]
0.7794992215217336
0.38338658146964855


In [43]:
# Phase 2: n_estimators=200 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=200, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[218 298]
 [ 21  45]]
0.7694984277540079
0.37067545304777594


In [44]:
# Phase 2: n_estimators=120 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=120, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[202 314]
 [ 18  48]]
0.7794992215217336
0.38338658146964855


In [45]:
# Phase 2: n_estimators=80 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=80, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7920378619153674
0.3869969040247678


In [46]:
# Phase 2: n_estimators=60 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=60, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7920378619153674
0.3869969040247678


In [47]:
# Phase 2: n_estimators=40 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7938475549596499
0.3869969040247678


In [48]:
# Phase 2: n_estimators=20 at learning_rate=0.001.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=20, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[184 332]
 [ 16  50]]
0.7888404842713135
0.3869969040247678


## Phase 3
Carried over from Phase 2:
```
learning_rate=0.001,
n_estimators=40
(max_depth=3 by default)
```

```
[[184 332]
 [ 16  50]]
0.7938475549596499
0.3869969040247678
```

In [50]:
# Phase 3: max_depth=4 with learning_rate=0.001 and n_estimators=40.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=4, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[218 298]
 [ 23  43]]
0.774235187814638
0.35537190082644626


In [52]:
# Phase 3: max_depth=5 with learning_rate=0.001 and n_estimators=40.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=5, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[252 264]
 [ 32  34]]
0.7312604551127161
0.30249110320284694


In [54]:
# Phase 3: max_depth=2 with learning_rate=0.001 and n_estimators=40.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=2, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[125 391]
 [ 10  56]]
0.821989913333007
0.39381153305203936


In [56]:
# Phase 3: max_depth=1 with learning_rate=0.001 and n_estimators=40.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=1, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[ 80 436]
 [  2  64]]
0.8277262459749122
0.418848167539267


In [57]:
# Phase 3: max_depth=20 with learning_rate=0.001 and n_estimators=40.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=20, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_val_pred = gb.predict(X_val)

# Validation confusion matrix, then F2 on train and on validation.
print(confusion_matrix(y_val, y_val_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_val, y_val_pred))

[[383 133]
 [ 47  19]]
0.953286332852226
0.22836538461538464


# Final evaluation

In [58]:
# Stack the training and validation features into one frame for the final refit.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True renumbers the rows as before.
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
X_train_val

Unnamed: 0,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,...,Occupation_type_10,Occupation_type_11,Occupation_type_12,Occupation_type_13,Occupation_type_14,Occupation_type_15,Occupation_type_16,Occupation_type_17,Occupation_type_18,Occupation_type_19
0,0,0,1,0,0,0,0,-0.647202,-0.037197,1.657132,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,1.468683,0.972997,1.657132,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,0,0,1.468683,0.972997,-1.383327,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,1.468683,-0.037197,-1.991669,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1.710836,2.286411,0.895521,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14901,1,1,0,0,1,0,0,1.665184,1.711170,-1.485146,...,0,0,0,0,0,0,0,0,0,0
14902,1,0,1,0,1,0,0,1.468683,0.972997,0.699769,...,0,0,0,0,0,0,0,0,0,0
14903,0,1,0,1,0,0,0,1.468683,-0.037197,0.227392,...,0,1,0,0,0,0,0,0,0,0
14904,0,0,1,0,1,0,0,1.468683,0.972997,0.115080,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Training labels followed by validation labels, as one flat array.
y_train_val = np.concatenate((np.ravel(y_train), np.ravel(y_val)))

In [60]:
# Refit the configuration reached in Phase 3 on train + validation and score it
# on the held-out test set.
# NOTE(review): the recorded output for this cell predicts class 0 for every
# sample (F2 = 0.0 on both sets). Presumably 40 steps at learning_rate=0.001
# cannot move the predictions away from the class prior once the validation
# rows are added. Confirm before relying on this model.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=1, random_state=RANDOM_STATE
)
gb.fit(X_train_val, y_train_val)
y_test_pred = gb.predict(X_test)
y_train_val_pred = gb.predict(X_train_val)

# Test confusion matrix, F2 on train+val, F2 on test, then train+val confusion matrix.
print(confusion_matrix(y_test, y_test_pred))
print(f2_score(y_train_val, y_train_val_pred))
print(f2_score(y_test, y_test_pred))
print(confusion_matrix(y_train_val, y_train_val_pred))

[[748   0]
 [127   0]]
0.0
0.0
[[7678    0]
 [7228    0]]


In [61]:
# Same configuration fitted on the training split only, scored on the test set.
gb = GradientBoostingClassifier(
    learning_rate=0.001, n_estimators=40, max_depth=1, random_state=RANDOM_STATE
)
gb.fit(X_train, y_train)
y_test_pred = gb.predict(X_test)

# Test confusion matrix, then F2 on train and on test.
print(confusion_matrix(y_test, y_test_pred))
print(f2_score(y_train, gb.predict(X_train)))
print(f2_score(y_test, y_test_pred))

[[115 633]
 [  8 119]]
0.8277262459749122
0.4722222222222222
