In [1]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Load BPE dataset

In [2]:
def load_data(path):
    with open(path, 'rb') as f:
        return np.load(f)

In [3]:
## Load multiple features dataset
X_train_multi = load_data('../data/dataset/X_train_bpe_multi.npy')
y_train_multi = load_data('../data/dataset/y_train_bpe_multi.npy')

X_test_multi = load_data('../data/dataset/X_test_bpe_multi.npy')
y_test_multi = load_data('../data/dataset/y_test_bpe_multi.npy')

## Load single feature dataset
X_train_single = load_data('../data/dataset/X_train_bpe_single.npy')
y_train_single = load_data('../data/dataset/y_train_bpe_single.npy')

X_test_single = load_data('../data/dataset/X_test_bpe_single.npy')
y_test_single = load_data('../data/dataset/y_test_bpe_single.npy')

multi_dataset = [X_train_multi, y_train_multi, X_test_multi, y_test_multi]
single_dataset = [X_train_single, y_train_single, X_test_single, y_test_single]

In [4]:
print(X_train_multi.shape)
print(X_test_multi.shape)

print(X_train_single.shape)
print(X_test_single.shape)

(2369, 605)
(593, 605)
(2369, 300)
(593, 300)


## Oversampling data (Minor class)

In [5]:
sm = SMOTE(random_state=42, k_neighbors=10, n_jobs=-1)

X_train_multiover, y_train_multiover = sm.fit_sample(multi_dataset[0], multi_dataset[1])
X_train_singleover, y_train_singleover = sm.fit_sample(single_dataset[0], single_dataset[1])

In [6]:
np.unique(y_train_multiover, return_counts=True)

(array([0, 1]), array([1840, 1840]))

In [7]:
np.unique(y_train_singleover, return_counts=True)

(array([0, 1]), array([1840, 1840]))

## Baseline model (SVC, RFC, XGBC)

In [8]:
def scoring(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"accuracy: {acc:.2f} | precision: {pre:.2f} | recall: {rec:.2f} | f score: {f1:.2f}")

In [9]:
svc_model = SVC(random_state=42)
rfc = RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=6)
xgboost = XGBClassifier(random_state=42, n_jobs=6)

In [10]:
svc_model.fit(X_train_singleover, y_train_singleover)
y_pred = svc_model.predict(X_test_single)
scoring(y_test_single, y_pred)

accuracy: 0.84 | precision: 0.59 | recall: 0.73 | f score: 0.65


In [11]:
rfc.fit(X_train_singleover, y_train_singleover)
y_pred = rfc.predict(X_test_single)
scoring(y_test_single, y_pred)

accuracy: 0.86 | precision: 0.68 | recall: 0.57 | f score: 0.62


In [12]:
xgboost.fit(X_train_singleover, y_train_singleover)
y_pred = xgboost.predict(X_test_single)
scoring(y_test_single, y_pred)



accuracy: 0.86 | precision: 0.66 | recall: 0.62 | f score: 0.64


## Fine Tuning models

In [13]:
models = {
    "XGBoost" : {"algo" : XGBClassifier(random_state=42, verbosity=1, n_jobs=-1),
                 "parameter" : {'min_child_weight': [1, 5, 10],
                                'gamma': [0.5, 1, 1.5, 2, 5],
                                'subsample': [0.6, 0.8, 1.0],
                                'colsample_bytree': [0.6, 0.8, 1.0],
                                'max_depth': [3, 4, 5]}
                },
    
    "Random Forest" : {"algo" : RandomForestClassifier(random_state=42, n_jobs=-1),
                       "parameter" : {"n_estimators" : [100, 200, 300, 400, 500],
                                      "criterion" : ['gini', 'entropy'],
                                      "max_features" : ['auto', 'sqrt', 'log2']}
                      }
}

## Random Forest Classifier

In [14]:
def fine_tuning(model, model_param: dict, x_train, y_train, n_iteration, score, split):
    search = RandomizedSearchCV(model, model_param, n_iter=n_iteration, scoring=score, cv=split, verbose=1, n_jobs=-1)
    search.fit(x_train, y_train)
    print("Model={} \nScore={} ".format(model, search.best_score_))
    metric = search.best_score_
    best_param = search.best_estimator_
    return best_param, metric

In [16]:
rfc_best_param, metric = fine_tuning(models['Random Forest']['algo'], models['Random Forest']['parameter'], 
                                     X_train_singleover, y_train_singleover, 50, "precision", 5)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.9min finished


Model=RandomForestClassifier(n_jobs=-1, random_state=42) 
Score=0.8968708528827921 


In [17]:
print(rfc_best_param)

RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)


In [18]:
print("Evaluation Performance")
y_pred = rfc_best_param.predict(X_test_single)
scoring(y_test_single, y_pred)

Evaluation Performance
accuracy: 0.86 | precision: 0.69 | recall: 0.58 | f score: 0.63


## XGBoost Classifier

In [19]:
folds = 3
param_comb = 10

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

In [20]:
random_search = RandomizedSearchCV(models['XGBoost']['algo'], param_distributions=models['XGBoost']['parameter'], 
                                   n_iter=param_comb, scoring='precision', n_jobs=-1, 
                                   cv=skf.split(X_train_singleover, y_train_singleover), 
                                   verbose=3, random_state=42)

random_search.fit(X_train_singleover, y_train_singleover)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed: 16.8min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 17.8min finished




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f9970b0ef90>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing...
                                           num_parallel_tree=None,
                                           random_state=42, reg_alpha=None,
                                           reg_lambda=None,
                                           scale_pos_weight=No

In [21]:
print("Evaluation Performance")
y_pred = random_search.predict(X_test_single)
scoring(y_test_single, y_pred)

Evaluation Performance
accuracy: 0.85 | precision: 0.63 | recall: 0.60 | f score: 0.62
