### Tackle domain 2 classification (imbalanced learning)

In [1]:
from scipy import sparse
temp_sparse_ = sparse.load_npz( "domain2_X_y_csr.npz" )
temp_loaded = temp_sparse_.toarray()
n_samples, n_features = temp_loaded.shape
n_features -= 1 #Since the last column is actually the label
X = temp_loaded[:,:n_features]
y = temp_loaded[:,n_features]
del temp_loaded, temp_sparse_

In [18]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.svm import SVC

In [3]:
import numpy as np
import pandas as pd

In [4]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [61]:
def evaluation(estimator, X, y):
    y_pred = estimator.predict(X)
    return balanced_accuracy_score(y, y_pred, adjusted=True)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2, shuffle=True, stratify=y)

In [56]:
def counter_labels( y_labels ):
    unique, cts = np.unique(y_labels, return_counts=True)
    cts = cts / cts.sum()
    return dict(zip(unique, cts))
counter_labels( y_train ), counter_labels( y_test )

({0.0: 0.8846153846153846, 1.0: 0.11538461538461539},
 {0.0: 0.8846153846153846, 1.0: 0.11538461538461539})

In [58]:
y_train.size, y_test.size

(7800, 5200)

In [59]:
from itertools import product
evaluations = {}
ratios = np.arange(0.5, 1.0, 0.05) #Sampling ratios
k_neighbors_possiblevals = np.arange(3,20,1)
n_neighbors_enn_possiblevals = np.arange(3,20,1)
percentiles = np.arange(10,100,5)
params = {}
for i, param in enumerate(product( ratios,
                                     k_neighbors_possiblevals,
                                     n_neighbors_enn_possiblevals,
                                     percentiles
                                    )):
    params[i] = (sampling_strategy_smote,
         k_neighbors,
         n_neighbors_enn,
         percentile)
    # selector = SelectPercentile( chi2, percentile=percentile)
    # X_selected = selector.fit_transform( X_train, y_train )
    # smote = SMOTE(sampling_strategy=sampling_strategy_smote, random_state=0, k_neighbors=k_neighbors, n_jobs=-1)
    # enn = EditedNearestNeighbours(n_neighbors=n_neighbors_enn,n_jobs=-1)
    # resampler = SMOTEENN(random_state=0, smote=smote, enn=enn)
    # X_resampled, y_resampled = resampler.fit_resample(X_selected, y_train)
    # estimator = MultinomialNB(alpha=alpha)
    # estimator.fit(X_resampled,y_resampled)
    # evaluations[ (sampling_strategy_smote,
    #                 k_neighbors,
    #                 n_neighbors_enn,
    #                 percentile,
    #                 alpha) ] = evaluation(estimator, selector.transform(X_test), y_test)
    # print(evaluations)

In [71]:
#5-fold cross validation:
from imblearn.pipeline import make_pipeline
sampling_strategy_smote = 0.5
k_neighbors = 3
n_neighbors_enn = 5
percentile = 10
selector = SelectPercentile( chi2, percentile=percentile )
smote = SMOTE(sampling_strategy=sampling_strategy_smote, random_state=0, k_neighbors=k_neighbors, n_jobs=-1)
enn = EditedNearestNeighbours(n_neighbors=n_neighbors_enn)
resampler = SMOTEENN(random_state=0, smote=smote, enn=enn)
estimator = LogisticRegressionCV( scoring = evaluation )
pipeline = make_pipeline(selector, resampler, estimator )

TypeError: EditedNearestNeighbours.__init__() got an unexpected keyword argument 'nn'

In [65]:
%%time
pipeline.fit( X_train, y_train )



CPU times: user 33.4 s, sys: 17.8 s, total: 51.2 s
Wall time: 17.3 s


In [67]:
evaluation( pipeline, X_test, y_test )

0.5242028985507248

In [69]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold( n_splits=10, shuffle=True, random_state=0 )
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    print(f"Fold {i}:")
    X_train = X[train_index,:]
    y_train = y[train_index]
    X_test = X[test_index,:]
    y_test = y[test_index]
    pipeline.fit(X_train, y_train)
    print( evaluation(pipeline, X_test, y_test) )

Fold 0:




0.5330434782608697
Fold 1:




0.5214492753623188
Fold 2:




0.5492753623188404
Fold 3:




0.5762318840579712
Fold 4:




0.5252173913043476
Fold 5:




0.5768115942028986
Fold 6:




0.5655072463768116
Fold 7:




0.6536231884057973
Fold 8:




0.59768115942029
Fold 9:




0.5495652173913044


In [None]:
import pickle as pkl

In [49]:
with open("domain2.mdl", "wb") as writefile:
    pkl.dump( (estimator, selector), writefile )

In [50]:
estimator = MultinomialNB(alpha=alpha)
estimator.fit(X,y)

In [51]:
from sklearn.metrics import precision_score, recall_score

In [52]:
y_pred = estimator.predict(X_test)
precision_score(y_test, y_pred)

0.7962962962962963

In [54]:
recall_score(y_test, y_pred)

0.14333333333333334

In [55]:
unique, counts = np.unique( y_test, return_counts=True)
dict(zip(unique, counts))

{0.0: 2300, 1.0: 300}