# import libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# read data

In [2]:
train = pd.read_parquet('./datafiles/train_reduced.parquet')
test = pd.read_parquet('./datafiles/test_reduced.parquet')

In [5]:
print(len(train))
print(len(test))

96821
25017


In [3]:
x_train = train.drop(columns=["transcript_id", "transcript_position", "seq", "gene_id", "label"])
y_train = train[['label']]

In [4]:
x_test = test.drop(columns=["transcript_id", "transcript_position", "seq", "gene_id", "label"])
y_test = test[['label']]

In [8]:
print(x_train)
print(y_train)

        whole_mean_dt_1  whole_mean_sd_1  whole_mean_curr_1  whole_mean_dt_2  \
18             0.007340         2.977180         108.360000         0.007782   
19             0.008988         3.961489         118.638298         0.007403   
20             0.011065         7.299608         115.549020         0.009377   
21             0.006904         2.803571         119.142857         0.010334   
22             0.006961         4.949231         108.373077         0.009155   
...                 ...              ...                ...              ...   
121833         0.009594         3.294164         118.232877         0.007300   
121834         0.008393         4.511014         110.969565         0.010305   
121835         0.008161         3.918438         113.968750         0.006877   
121836         0.008044         3.191228         109.354386         0.007419   
121837         0.008788         4.090577         105.807692         0.006907   

        whole_mean_sd_2  whole_mean_cur

# original dataset

## logistic regression

In [117]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [118]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23934    29]
 [ 1022    32]]
roc auc: 0.7898
pr auc: 0.2181


## decision tree

In [119]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [120]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23033   930]
 [  749   305]]
roc auc: 0.6253
pr auc: 0.2831


## random forest

In [121]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [122]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23901    62]
 [  901   153]]
roc auc: 0.8967
pr auc: 0.4493


## xgboost

In [123]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [124]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23741   222]
 [  747   307]]
roc auc: 0.9112
pr auc: 0.4257


## lightgbm

In [8]:
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train.values.ravel())

In [15]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer
import numpy as np

lgb_model = LGBMClassifier()
# y_train = y_train.values.ravel()

# Define the hyperparameter distribution
param_dist = {
    'num_leaves': np.arange(10, 100, 10),
    'min_data_in_leaf': np.arange(100, 1000, 100),
    'max_depth': [-1, 10, 20],
    'verbose': [-1]
}

# Custom scoring function that sums the roc_auc and pr_auc
def combined_auc_score(y_true, y_proba):
    roc_auc = roc_auc_score(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)
    return (roc_auc + pr_auc)/2

# Create a custom scorer using make_scorer, with needs_proba=True since we need predicted probabilities
combined_scorer = make_scorer(combined_auc_score, needs_proba=True)

# Set up RandomizedSearchCV with the custom combined scorer
random_search = RandomizedSearchCV(
    estimator=lgb_model, 
    param_distributions=param_dist, 
    n_iter=50,
    scoring=combined_scorer, 
    cv=5, 
    random_state=42,
    verbose = 3
)

# Perform the random search
random_search.fit(x_train, y_train)

# Output the best parameters based on the combined auc score
print(f"Best parameters found (based on combined AUC score): {random_search.best_params_}")
print(f"Best combined AUC score (roc_auc + pr_auc): {random_search.best_score_}")

lgbm = random_search.best_estimator_



Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END max_depth=-1, min_data_in_leaf=300, num_leaves=70, verbose=-1;, score=0.698 total time=   1.9s
[CV 2/5] END max_depth=-1, min_data_in_leaf=300, num_leaves=70, verbose=-1;, score=0.677 total time=   1.8s
[CV 3/5] END max_depth=-1, min_data_in_leaf=300, num_leaves=70, verbose=-1;, score=0.671 total time=   1.9s
[CV 4/5] END max_depth=-1, min_data_in_leaf=300, num_leaves=70, verbose=-1;, score=0.683 total time=   1.9s
[CV 5/5] END max_depth=-1, min_data_in_leaf=300, num_leaves=70, verbose=-1;, score=0.671 total time=   1.8s
[CV 1/5] END max_depth=-1, min_data_in_leaf=100, num_leaves=70, verbose=-1;, score=0.699 total time=   1.8s
[CV 2/5] END max_depth=-1, min_data_in_leaf=100, num_leaves=70, verbose=-1;, score=0.680 total time=   1.8s
[CV 3/5] END max_depth=-1, min_data_in_leaf=100, num_leaves=70, verbose=-1;, score=0.674 total time=   1.7s
[CV 4/5] END max_depth=-1, min_data_in_leaf=100, num_leaves=70, verbose=-1

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found (based on combined AUC score): {'verbose': -1, 'num_leaves': 70, 'min_data_in_leaf': 800, 'max_depth': 10}
Best combined AUC score (roc_auc + pr_auc): 0.6864055614287855


In [126]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lgbm.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lgbm.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lgbm.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lgbm.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23779   184]
 [  751   303]]
roc auc: 0.9185
pr auc: 0.4545


## catboost

In [128]:
cb = CatBoostClassifier()
cb.fit(x_train, y_train)

NameError: name 'CatBoostClassifier' is not defined

In [11]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer
import numpy as np

catboost_model = CatBoostClassifier(silent=True)

# Define the hyperparameter distribution
param_dist = {
    'depth': np.arange(4, 10),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'iterations': np.arange(100, 1000, 100)
}

# Custom scoring function that sums the roc_auc and pr_auc
def combined_auc_score(y_true, y_proba):
    roc_auc = roc_auc_score(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)
    return (roc_auc + pr_auc)/2

# Create a custom scorer using make_scorer, with needs_proba=True since we need predicted probabilities
combined_scorer = make_scorer(combined_auc_score, needs_proba=True)

# Set up RandomizedSearchCV with the custom combined scorer
random_search = RandomizedSearchCV(
    estimator=catboost_model, 
    param_distributions=param_dist, 
    n_iter=50,
    scoring=combined_scorer, 
    cv=5, 
    random_state=42,
    verbose = 3
)

# Perform the random search
random_search.fit(x_train, y_train)

# Output the best parameters based on the combined auc score
print(f"Best parameters found (based on combined AUC score): {random_search.best_params_}")
print(f"Best combined AUC score (roc_auc + pr_auc): {random_search.best_score_}")

cb = random_search.best_estimator_



Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END depth=6, iterations=500, learning_rate=0.2;, score=0.696 total time=   5.7s
[CV 2/5] END depth=6, iterations=500, learning_rate=0.2;, score=0.677 total time=   5.5s
[CV 3/5] END depth=6, iterations=500, learning_rate=0.2;, score=0.664 total time=   5.5s
[CV 4/5] END depth=6, iterations=500, learning_rate=0.2;, score=0.685 total time=   5.5s
[CV 5/5] END depth=6, iterations=500, learning_rate=0.2;, score=0.672 total time=   5.6s
[CV 1/5] END depth=4, iterations=800, learning_rate=0.07333333333333333;, score=0.696 total time=   6.0s
[CV 2/5] END depth=4, iterations=800, learning_rate=0.07333333333333333;, score=0.689 total time=   6.0s
[CV 3/5] END depth=4, iterations=800, learning_rate=0.07333333333333333;, score=0.673 total time=   6.0s
[CV 4/5] END depth=4, iterations=800, learning_rate=0.07333333333333333;, score=0.683 total time=   6.1s
[CV 5/5] END depth=4, iterations=800, learning_rate=0.07333333333333333;,

In [12]:
print(f'confusion matrix:\n {confusion_matrix(y_test, cb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, cb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, cb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, cb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23815   148]
 [  750   304]]
roc auc: 0.9269
pr auc: 0.4978


# scaled dataset
-- attempted both StandardScaler and MinMaxScaler, no significant improvement in performances observed

In [140]:
# scaler = StandardScaler()
scaler = MinMaxScaler()

In [141]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## logistic regression

In [41]:
lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [42]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23934    29]
 [ 1021    33]]
roc auc: 0.8475
pr auc: 0.2491


## decision tree

In [43]:
dt = DecisionTreeClassifier()
dt.fit(x_train_scaled, y_train)

In [44]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23040   923]
 [  732   322]]
roc auc: 0.6335
pr auc: 0.2967


## random forest

In [45]:
rf = RandomForestClassifier()
rf.fit(x_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [46]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23901    62]
 [  897   157]]
roc auc: 0.8954
pr auc: 0.44


## xgboost

In [142]:
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)

In [143]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23741   222]
 [  747   307]]
roc auc: 0.9112
pr auc: 0.4257


## lightgbm

In [144]:
lgbm = LGBMClassifier()
lgbm.fit(x_train_scaled, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 4421, number of negative: 92400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27965
[LightGBM] [Info] Number of data points in the train set: 96821, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.045662 -> initscore=-3.039761
[LightGBM] [Info] Start training from score -3.039761


In [145]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lgbm.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lgbm.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lgbm.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lgbm.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23778   185]
 [  743   311]]
roc auc: 0.9194
pr auc: 0.4621


## catboost

In [None]:
cb = CatBoostClassifier()
cb.fit(x_train_scaled, y_train)

NameError: name 'CatBoostClassifier' is not defined

In [None]:
print(f'confusion matrix:\n {confusion_matrix(y_test, cb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, cb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, cb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, cb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

# oversampling (class 1)
-- no significant improvements for default params, mainly increase in AUC values (both ROC and PR) for decision tree, but in general lower accuracy observed across all 4 models

-- also tested ADASYN (different oversampling technique), similar results obtained but slightly worse than SMOTE

In [146]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [151]:
# sm = SMOTE()
sm = ADASYN()
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

In [148]:
Counter(y_train_sm.label)

Counter({0: 92400, 1: 92400})

## logistic regression

In [52]:
lr = LogisticRegression()
lr.fit(x_train_sm, y_train_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[17663  6300]
 [  282   772]]
roc auc: 0.8102
pr auc: 0.2057


## decision tree

In [54]:
dt = DecisionTreeClassifier()
dt.fit(x_train_sm, y_train_sm)

In [55]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21918  2045]
 [  660   394]]
roc auc: 0.6442
pr auc: 0.2809


## random forest

In [56]:
rf = RandomForestClassifier()
rf.fit(x_train_sm, y_train_sm)

  return fit_method(estimator, *args, **kwargs)


In [57]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23485   478]
 [  612   442]]
roc auc: 0.9132
pr auc: 0.421


## xgboost

In [58]:
xgb = XGBClassifier()
xgb.fit(x_train_sm, y_train_sm)

In [59]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23508   455]
 [  705   349]]
roc auc: 0.8917
pr auc: 0.3385


## lightgbm

In [152]:
lgbm = LGBMClassifier()
lgbm.fit(x_train_sm, y_train_sm)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 93014, number of negative: 92400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28815
[LightGBM] [Info] Number of data points in the train set: 185414, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501656 -> initscore=0.006623
[LightGBM] [Info] Start training from score 0.006623


In [153]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lgbm.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lgbm.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lgbm.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lgbm.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23494   469]
 [  759   295]]
roc auc: 0.882
pr auc: 0.2926


## catboost

In [None]:
cb = CatBoostClassifier()
cb.fit(x_train_sm, y_train_sm)

NameError: name 'CatBoostClassifier' is not defined

In [None]:
print(f'confusion matrix:\n {confusion_matrix(y_test, cb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, cb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, cb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, cb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

# scaling > oversampling
-- using standardscaler and smote since the individual components perform better than minmaxscaler and adasyn respectively

In [154]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [87]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [155]:
sm = SMOTE()
x_train_scaled_sm, y_train_scaled_sm = sm.fit_resample(x_train_scaled, y_train)

In [89]:
Counter(y_train_sm.label)

Counter({0: 92400, 1: 92400})

## logistic regression

In [90]:
lr = LogisticRegression()
lr.fit(x_train_scaled_sm, y_train_scaled_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[18707  5256]
 [  244   810]]
roc auc: 0.8531
pr auc: 0.2278


## decision tree

In [92]:
dt = DecisionTreeClassifier()
dt.fit(x_train_scaled_sm, y_train_scaled_sm)

In [93]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21848  2115]
 [  586   468]]
roc auc: 0.6779
pr auc: 0.3243


## random forest

In [94]:
rf = RandomForestClassifier()
rf.fit(x_train_scaled_sm, y_train_scaled_sm)

  return fit_method(estimator, *args, **kwargs)


In [95]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23396   567]
 [  551   503]]
roc auc: 0.9164
pr auc: 0.4316


## xgboost

In [96]:
xgb = XGBClassifier()
xgb.fit(x_train_scaled_sm, y_train_scaled_sm)

In [97]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23533   430]
 [  687   367]]
roc auc: 0.8916
pr auc: 0.3732


## lightgbm

In [156]:
lgbm = LGBMClassifier()
lgbm.fit(x_train_scaled_sm, y_train_scaled_sm)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 92400, number of negative: 92400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28815
[LightGBM] [Info] Number of data points in the train set: 184800, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [157]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lgbm.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lgbm.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lgbm.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lgbm.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23503   460]
 [  710   344]]
roc auc: 0.8929
pr auc: 0.3381


## catboost

In [None]:
cb = CatBoostClassifier()
cb.fit(x_train_scaled_sm, y_train_scaled_sm)

NameError: name 'CatBoostClassifier' is not defined

In [None]:
print(f'confusion matrix:\n {confusion_matrix(y_test, cb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, cb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, cb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, cb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

# oversampling > scaling

In [98]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [158]:
sm = SMOTE()
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

In [100]:
Counter(y_train_sm.label)

Counter({0: 92400, 1: 92400})

In [159]:
scaler = StandardScaler()
x_train_sm_scaled = scaler.fit_transform(x_train_sm)
x_test_sm_scaled = scaler.transform(x_test)

## logistic regression

In [102]:
lr = LogisticRegression()
lr.fit(x_train_sm_scaled, y_train_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[18595  5368]
 [  231   823]]
roc auc: 0.855
pr auc: 0.2286


## decision tree

In [104]:
dt = DecisionTreeClassifier()
dt.fit(x_train_sm_scaled, y_train_sm)

In [105]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21951  2012]
 [  579   475]]
roc auc: 0.6834
pr auc: 0.3324


## random forest

In [107]:
rf = RandomForestClassifier()
rf.fit(x_train_sm_scaled, y_train_sm)

  return fit_method(estimator, *args, **kwargs)


In [108]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23476   487]
 [  598   456]]
roc auc: 0.9173
pr auc: 0.4353


## xgboost

In [109]:
xgb = XGBClassifier()
xgb.fit(x_train_sm_scaled, y_train_sm)

In [110]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23524   439]
 [  688   366]]
roc auc: 0.8946
pr auc: 0.3677


## lightgbm

In [160]:
lgbm = LGBMClassifier()
lgbm.fit(x_train_sm_scaled, y_train_sm)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 92400, number of negative: 92400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28815
[LightGBM] [Info] Number of data points in the train set: 184800, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [161]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lgbm.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lgbm.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lgbm.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lgbm.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23494   469]
 [  720   334]]
roc auc: 0.8884
pr auc: 0.3204


## catboost

In [None]:
cb = CatBoostClassifier()
cb.fit(x_train_sm_scaled, y_train_sm)

NameError: name 'CatBoostClassifier' is not defined

In [None]:
print(f'confusion matrix:\n {confusion_matrix(y_test, cb.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, cb.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, cb.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, cb.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

# end