Here, no class weights are considered.

ensemble:
- ~~Stacking classifiers~~
- Bagging/Random subspaces
- (RF)
- Boosting: ada **+gb**

**nn**


# Preparation

In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, \
                                GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid, RadiusNeighborsClassifier
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Fixed seed handed to the estimators below that accept a `random_state`.
RANDOM_STATE = 42

# Google Drive folder holding the preprocessed train/test CSV files.
DIR_PATH = "/content/drive/MyDrive/Colab Notebooks/ML CCC"

In [3]:
def f2_score(y_test, y_pred):
    """Return the F-beta score with ``beta=2`` for the given labels.

    With ``beta > 1`` the metric weights recall more heavily than precision.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    float
        The value returned by ``fbeta_score(y_test, y_pred, beta=2)``.
    """
    return fbeta_score(y_test, y_pred, beta=2)


# The same metric wrapped as a scorer object, for scikit-learn APIs that take
# a `scoring=` argument.
f2_scorer = make_scorer(fbeta_score, beta=2)

In [4]:
# Read the four preprocessed splits from DIR_PATH, in the same order as the
# names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(os.path.join(DIR_PATH, csv_name))
    for csv_name in (
        'X_train_prep.csv',
        'X_test_prep.csv',
        'y_train_prep.csv',
        'y_test_prep.csv',
    )
)

In [5]:
# The label frames come from read_csv as DataFrames; flatten each one into a
# 1-D NumPy array before handing it to the estimators and metrics.
y_train = np.ravel(y_train.to_numpy())
y_test = np.ravel(y_test.to_numpy())

# Dummy

In [6]:
# Baseline that predicts classes uniformly at random (seeded).
dummy = DummyClassifier(strategy='uniform', random_state=RANDOM_STATE)
dummy = dummy.fit(X_train, y_train)

In [7]:
# Baseline predictions on the held-out test features.
y_test_pred_dummy = dummy.predict(X=X_test)

In [8]:
# Confusion matrix of the true test labels against the baseline predictions.
confusion_matrix(y_true=y_test, y_pred=y_test_pred_dummy)

array([[633, 631],
       [ 95,  98]])

In [9]:
# F2 of the baseline on the test split.
f2_score(y_test=y_test, y_pred=y_test_pred_dummy)

0.3264490339773485

In [10]:
# F2 of the baseline on the training split, for comparison with the test score.
f2_score(y_train, y_pred=dummy.predict(X=X_train))

0.4999301968448973

# Bagging (hand tuning and selecting base models so that bagging performs better than dummy)
not applicable

`bootstrap_features=True`, a.k.a. random subspace (but with |subspace|=|space|), does improve the performance
- ~~`base_estimator=DecisionTreeClassifier`~~ won't be used
- ~~`LogisticRegression`~~: `ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
`
- ~~PassiveAggressiveClassifier~~
- ~~Perceptron~~
- **~~RidgeClassifier~~**: only slightly better, not worth trying
- ~~KNeighborsClassifier~~
- ~~RadiusNeighborsClassifier~~: unable to handle outliers `No neighbors found for test samples` 
- NearestCentroid
- ~~`ExtraTreeClassifier`~~
- ~~`LinearSVC`~~: `ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.`

In [11]:
# Bagging ensemble of 250 NearestCentroid models with bootstrapped features.
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (and the
# old name removed in 1.4), while the first positional slot is the same in
# both, so this form runs on old and new releases alike.
bag = BaggingClassifier(
    NearestCentroid(),
    n_estimators=250,
    n_jobs=-1,  # use all available cores
    random_state=RANDOM_STATE,
    bootstrap_features=True).fit(X_train, y_train)

In [12]:
# Bagging predictions on the held-out test features.
y_test_pred_bag = bag.predict(X=X_test)

In [13]:
# Confusion matrix of the true test labels against the bagging predictions.
confusion_matrix(y_true=y_test, y_pred=y_test_pred_bag)

array([[761, 503],
       [ 87, 106]])

In [14]:
# F2 of the bagging ensemble on the test split.
f2_score(y_test=y_test, y_pred=y_test_pred_bag)

0.3837798696596669

In [15]:
# F2 of the bagging ensemble on the training split, for comparison with the test score.
f2_score(y_train, y_pred=bag.predict(X=X_train))

0.6213656753613557

# Ada (hand tuning using the usable base models above)
not applicable

- for DecisionTreeClassifier (with default `algorithm='SAMME.R'`), the scores are always around 0.20 no matter how the model is tuned

- and for RidgeClassifier with `algorithm='SAMME'`, the scores are always around 0.34 no matter how the model is tuned


- `NearestCentroid doesn't support sample_weight.`


In [16]:
# AdaBoost over 150 RidgeClassifier learners.
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (and the
# old name removed in 1.4), while the first positional slot is the same in
# both.
# 'SAMME' is the discrete variant; 'SAMME.R' needs `predict_proba`, which
# RidgeClassifier does not provide.
# Seeded with RANDOM_STATE like the dummy and bagging models so the run is
# repeatable.
ada = AdaBoostClassifier(RidgeClassifier(),
                         algorithm='SAMME',
                         n_estimators=150,
                         random_state=RANDOM_STATE).fit(X_train, y_train)

In [17]:
# AdaBoost predictions on the held-out test features.
y_test_pred_ada = ada.predict(X=X_test)

In [18]:
# Confusion matrix of the true test labels against the AdaBoost predictions.
confusion_matrix(y_true=y_test, y_pred=y_test_pred_ada)

array([[785, 479],
       [100,  93]])

In [19]:
# F2 of the AdaBoost ensemble on the test split.
f2_score(y_test=y_test, y_pred=y_test_pred_ada)

0.34598214285714285

In [20]:
# F2 of the AdaBoost ensemble on the training split, for comparison with the test score.
f2_score(y_train, y_pred=ada.predict(X=X_train))

0.6682518178550692

# GB

In [21]:
# Gradient boosting with a very small learning rate; every other
# hyper-parameter is left at the library default.
# Seeded with RANDOM_STATE like the dummy and bagging models so the run is
# repeatable.
gb = GradientBoostingClassifier(
    learning_rate=0.001,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

In [22]:
# Gradient-boosting predictions on the held-out test features.
y_test_pred_gb = gb.predict(X=X_test)

In [23]:
# Confusion matrix of the true test labels against the gradient-boosting predictions.
confusion_matrix(y_true=y_test, y_pred=y_test_pred_gb)

array([[443, 821],
       [ 49, 144]])

In [24]:
# F2 of the gradient-boosting model on the test split.
f2_score(y_test=y_test, y_pred=y_test_pred_gb)

0.4145077720207254

In [25]:
# F2 of the gradient-boosting model on the training split, for comparison with the test score.
f2_score(y_train, y_pred=gb.predict(X=X_train))

0.7920378619153674

# NN

In [26]:
# Single-hidden-layer MLP (1000 units) with a strong L2 penalty (alpha=4).
# Seeded with RANDOM_STATE like the dummy and bagging models: without it the
# weight initialisation and batch shuffling differ on every run, so the
# scores could not be reproduced.
nn = MLPClassifier(hidden_layer_sizes=(1000,),
                   learning_rate_init=0.001,
                   alpha=4.,
                   random_state=RANDOM_STATE,
).fit(X_train, y_train)

In [27]:
# MLP predictions on the held-out test features.
y_test_pred_nn = nn.predict(X=X_test)

In [28]:
# Confusion matrix of the true test labels against the MLP predictions.
confusion_matrix(y_true=y_test, y_pred=y_test_pred_nn)

array([[797, 467],
       [ 98,  95]])

In [29]:
# F2 of the MLP on the test split.
f2_score(y_test=y_test, y_pred=y_test_pred_nn)

0.356071964017991

In [30]:
# F2 of the MLP on the training split, for comparison with the test score.
f2_score(y_train, y_pred=nn.predict(X=X_train))

0.6810928300225609