In [17]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
%config Completer.use_jedi = False

In [23]:
#df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)  # MCC = 0.237 / ROC_AUC = 0.748
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)  # MCC = 0.225 / ROC_AUC = 0.732
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [24]:
print(X.shape)
print(y.shape)
print(Counter(y))

(185413, 193)
(185413,)
Counter({0: 175069, 1: 10344})


# Baseline Logistic Regression

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=7)


In [28]:
%%time
lr = LogisticRegression(solver='liblinear', class_weight='balanced', C=0.1) #
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

CPU times: user 20.9 s, sys: 175 ms, total: 21.1 s
Wall time: 21.1 s


In [29]:
print(confusion_matrix(y_test, y_pred))
print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
print(f'Recall = {recall_score(y_test, y_pred)}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
probs = lr.predict_proba(X_test)
probs = probs[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs, pos_label=1)
y_score = lr.decision_function(X_test)
average_precision = average_precision_score(y_test, y_score)
print('Average precision/PR_AUC: {0:0.2f}'.format(average_precision))

[[29992 13776]
 [  571  2015]]
Accuracy = 0.6904905725503732
Balanced Accuracy = 0.7322225831682572
Recall = 0.7791956689868523

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.69      0.81     43768
           1       0.13      0.78      0.22      2586

    accuracy                           0.69     46354
   macro avg       0.55      0.73      0.51     46354
weighted avg       0.93      0.69      0.77     46354

ROC_AUC = 0.7322225831682572
MCC = 0.2249180196722494
Average precision/PR_AUC: 0.18


# RF with Random Over sampling

In [10]:
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=0.5)
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(y_over))
print(X_over.shape)

Counter({0: 131301, 1: 65650})
(196951, 193)


In [11]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=40)
#clf = LogisticRegression(solver='liblinear', C=0.1) # 
clf.fit(X_over, y_over)
y_pred = clf.predict(X_test)

CPU times: user 1min 1s, sys: 604 ms, total: 1min 2s
Wall time: 1min 2s


In [12]:
print(confusion_matrix(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, normalize='all'))
print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
print(f'Recall = {recall_score(y_test, y_pred)}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
probs = clf.predict_proba(X_test)
probs = probs[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs, pos_label=1)
#y_score = clf.decision_function(X_test)
#average_precision = average_precision_score(y_test, y_score)
#print('Average precision/PR_AUC: {0:0.2f}'.format(average_precision))

[[43420   348]
 [ 2479   107]]
[[0.93670449 0.00750744]
 [0.05347974 0.00230832]]
Accuracy = 0.9390128144280968
Balanced Accuracy = 0.5167128145125186
Recall = 0.04137664346481052

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     43768
           1       0.24      0.04      0.07      2586

    accuracy                           0.94     46354
   macro avg       0.59      0.52      0.52     46354
weighted avg       0.91      0.94      0.92     46354

ROC_AUC = 0.5167128145125186
MCC = 0.07781528095589256


In [13]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics.scorer import make_scorer
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler # MWB
# define pipeline
steps = [('over', RandomOverSampler()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

def my_mcc(y_true,y_pred):
    mcc = matthews_corrcoef(y_true, y_pred)
    return mcc

my_scorer = make_scorer(my_mcc, greater_is_better=True)

scores = cross_val_score(pipeline, X, y, scoring=my_scorer, cv=cv, n_jobs=-1)
print(scores)
score = np.mean(scores)
print('MCC: %.3f' % score)



[0.10517026 0.09073182 0.09853585 0.10502665 0.1027909 ]
MCC: 0.100
CPU times: user 176 ms, sys: 460 ms, total: 636 ms
Wall time: 28.3 s


In [15]:
%%time
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
model = RandomForestClassifier(n_estimators=100, max_depth=40)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# define pipeline
#pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_val_score(pipeline, X, y, scoring=my_scorer, cv=cv, n_jobs=-1)
print(scores)
score = np.mean(scores)
print('MCC: %.3f' % score)

[0.25435809 0.25374443 0.25316007 0.24969889 0.24902289]
MCC: 0.252
CPU times: user 133 ms, sys: 129 ms, total: 262 ms
Wall time: 7.63 s


In [16]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
#model = RandomForestClassifier(n_estimators=100, max_depth=40)
model = GradientBoostingClassifier(n_estimators=100, max_depth=11, 
                                   min_samples_split=900, min_samples_leaf=50,
                                   subsample=0.85, learning_rate=0.1, max_features=12)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

scoring = {'prec':'precision', 'mcc': my_scorer}
# define pipeline
#pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
print(scores)

{'fit_time': array([1.44662499, 1.37447858]), 'score_time': array([1.18791246, 1.1572578 ]), 'test_prec': array([0.13350919, 0.13379393]), 'test_mcc': array([0.25389369, 0.25439932])}
