In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

  return f(*args, **kwds)


In [4]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [5]:
print(X.shape)
print(y.shape)
print(Counter(y))

(185413, 192)
(185413,)
Counter({0: 175069, 1: 10344})


# Baseline Logistic Regression

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=7)

In [7]:
%%time
lr = LogisticRegression(solver='liblinear', C=0.1) # 
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

CPU times: user 20.6 s, sys: 150 ms, total: 20.7 s
Wall time: 20.7 s


In [8]:
print(confusion_matrix(y_test, y_pred))
print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
print(f'Recall = {recall_score(y_test, y_pred)}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
probs = lr.predict_proba(X_test)
probs = probs[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs, pos_label=1)
y_score = lr.decision_function(X_test)
average_precision = average_precision_score(y_test, y_score)
print('Average precision/PR_AUC: {0:0.2f}'.format(average_precision))

[[43716    52]
 [ 2535    51]]
Accuracy = 0.9441903611338828
Balanced Accuracy = 0.5092667475543904
Recall = 0.019721577726218097

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     43768
           1       0.50      0.02      0.04      2586

    accuracy                           0.94     46354
   macro avg       0.72      0.51      0.50     46354
weighted avg       0.92      0.94      0.92     46354

ROC_AUC = 0.5092667475543904
MCC = 0.09033806064448234
Average precision/PR_AUC: 0.20


In [9]:
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(y_over))
print(X_over.shape)

Counter({0: 131301, 1: 131301})
(262602, 192)


In [10]:
%%time
lr = LogisticRegression(solver='liblinear', C=0.1) # 
lr.fit(X_over, y_over)
y_pred = lr.predict(X_test)

CPU times: user 42.7 s, sys: 195 ms, total: 42.9 s
Wall time: 42.9 s


In [11]:
print(confusion_matrix(y_test, y_pred))
print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
print(f'Recall = {recall_score(y_test, y_pred)}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
probs = lr.predict_proba(X_test)
probs = probs[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs, pos_label=1)
y_score = lr.decision_function(X_test)
average_precision = average_precision_score(y_test, y_score)
print('Average precision/PR_AUC: {0:0.2f}'.format(average_precision))

[[29262 14506]
 [  438  2148]]
Accuracy = 0.6776114251197307
Balanced Accuracy = 0.7495985476681308
Recall = 0.8306264501160093

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.67      0.80     43768
           1       0.13      0.83      0.22      2586

    accuracy                           0.68     46354
   macro avg       0.56      0.75      0.51     46354
weighted avg       0.94      0.68      0.76     46354

ROC_AUC = 0.7495985476681308
MCC = 0.23879605950463956
Average precision/PR_AUC: 0.19


In [12]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics.scorer import make_scorer
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler # MWB
# define pipeline
steps = [('over', RandomOverSampler()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

def my_mcc(y_true,y_pred):
    mcc = matthews_corrcoef(y_true, y_pred)
    return mcc

my_scorer = make_scorer(my_mcc, greater_is_better=True)

scores = cross_val_score(pipeline, X, y, scoring=my_scorer, cv=cv, n_jobs=-1)
print(scores)
score = np.mean(scores)
print('MCC: %.3f' % score)

[0.1060214  0.09959869 0.07374727 0.10746776 0.09313987 0.12546632
 0.09630177 0.07834359 0.07512109 0.10351929 0.12281503 0.08927002
 0.09844225 0.1176752  0.116693   0.10653179 0.09794739 0.09616889
 0.10292896 0.08240991 0.10107292 0.09223481 0.10747739 0.08606741
 0.09800936 0.10007753 0.09555314 0.11221685 0.09505216 0.12506478]
F-measure: 0.100
CPU times: user 3.41 s, sys: 732 ms, total: 4.15 s
Wall time: 2min 29s


In [28]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
model = RandomForestClassifier(n_estimators=100, max_depth=40)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# define pipeline
pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
#pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_val_score(pipeline, X, y, scoring=my_scorer, cv=cv, n_jobs=-1)
print(scores)
score = np.mean(scores)
print('MCC: %.3f' % score)

[0.25001179 0.25662432 0.25458582 0.2581572  0.25465168]
MCC: 0.255


In [31]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
model = RandomForestClassifier(n_estimators=100, max_depth=40)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scoring = {'prec':'precision', 'mcc': my_scorer}
# define pipeline
#pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
print(scores)

{'fit_time': array([7.86820555, 7.96624255, 7.64433503, 7.98609805, 8.01649499,
       7.79712152, 7.62201571, 7.64926338, 7.54900122, 7.96243   ,
       7.80578208, 7.93241429, 7.61831403, 7.82596898, 8.11925149,
       7.62759042, 7.2636919 , 7.56525493, 7.63514376, 7.26644588,
       7.52141929, 7.6420083 , 7.61117697, 7.34517503, 5.32696557,
       6.94490027, 6.58053899, 5.06340718, 6.99250984, 6.3908267 ]), 'score_time': array([0.82484603, 0.85238099, 0.84089088, 0.88418031, 0.8258698 ,
       0.82290244, 0.84870172, 0.8121717 , 0.78223372, 0.80407786,
       0.79156661, 0.77422667, 0.89169025, 0.8223393 , 0.88192916,
       0.78949738, 0.82191157, 0.79568744, 0.83147097, 0.85289145,
       0.81077909, 0.84969711, 0.7840755 , 0.83111954, 0.6190033 ,
       0.70042515, 0.49695635, 0.51940942, 0.48555779, 0.54561663]), 'test_prec': array([0.13518654, 0.13160198, 0.13111207, 0.13217065, 0.13017837,
       0.1333234 , 0.13155582, 0.13168664, 0.13160607, 0.13111888,
       0.13445378,

In [34]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
#model = RandomForestClassifier(n_estimators=100, max_depth=40)
model = GradientBoostingClassifier(n_estimators=100, max_depth=11, 
                                   min_samples_split=900, min_samples_leaf=50,
                                   subsample=0.85, learning_rate=0.1, max_features=12)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

scoring = {'prec':'precision', 'mcc': my_scorer}
# define pipeline
#pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
print(scores)

{'fit_time': array([1.34051442, 1.36084175]), 'score_time': array([1.05538368, 1.03332877]), 'test_prec': array([0.13266837, 0.13456205]), 'test_mcc': array([0.25483995, 0.25835715])}


In [35]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
#model = RandomForestClassifier(n_estimators=100, max_depth=40)
model = LogisticRegression()
#model = GradientBoostingClassifier(n_estimators=100, max_depth=11, 
#                                   min_samples_split=900, min_samples_leaf=50,
#                                   subsample=0.85, learning_rate=0.1, max_features=12)
over = RandomOverSampler(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1.0)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

scoring = {'prec':'precision', 'mcc': my_scorer}
# define pipeline
#pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
pipeline = Pipeline(steps=[('u', under), ('m', model)])
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
print(scores)

{'fit_time': array([1.24604654, 1.00842834]), 'score_time': array([0.2119596 , 0.23433828]), 'test_prec': array([0.12310488, 0.12345714]), 'test_mcc': array([0.22958409, 0.22876001])}
