# import libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# read data

In [2]:
train = pd.read_parquet('./datafiles/train_reduced.parquet')
test = pd.read_parquet('./datafiles/test_reduced.parquet')

In [3]:
print(len(train))
print(len(test))

96821
25017


In [6]:
x_train = train.drop(columns=["transcript_id", "transcript_position", "seq", "gene_id", "label"])
y_train = train[['label']]

In [8]:
x_test = test.drop(columns=["transcript_id", "transcript_position", "seq", "gene_id", "label"])
y_test = test[['label']]

In [9]:
print(x_train)
print(y_train)

        whole_mean_dt_1  whole_mean_sd_1  whole_mean_curr_1  whole_mean_dt_2  \
18             0.007340         2.977180         108.360000         0.007782   
19             0.008988         3.961489         118.638298         0.007403   
20             0.011065         7.299608         115.549020         0.009377   
21             0.006904         2.803571         119.142857         0.010334   
22             0.006961         4.949231         108.373077         0.009155   
...                 ...              ...                ...              ...   
121833         0.009594         3.294164         118.232877         0.007300   
121834         0.008393         4.511014         110.969565         0.010305   
121835         0.008161         3.918438         113.968750         0.006877   
121836         0.008044         3.191228         109.354386         0.007419   
121837         0.008788         4.090577         105.807692         0.006907   

        whole_mean_sd_2  whole_mean_cur

# original dataset

## logistic regression

In [10]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23934    29]
 [ 1022    32]]
accuracy: 0.958
roc auc: 0.7898
pr auc: 0.2181


## decision tree

In [12]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [13]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23033   930]
 [  737   317]]
accuracy: 0.9334
roc auc: 0.631
pr auc: 0.2922


## random forest

In [14]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [15]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23903    60]
 [  895   159]]
accuracy: 0.9618
roc auc: 0.8952
pr auc: 0.4434


## xgboost

In [16]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [17]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23741   222]
 [  747   307]]
accuracy: 0.9613
roc auc: 0.9112
pr auc: 0.4257


# scaled dataset
-- attempted both StandardScaler and MinMaxScaler, no significant improvement in performances observed

In [39]:
# scaler = StandardScaler()
scaler = MinMaxScaler()

In [40]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## logistic regression

In [41]:
lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [42]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23934    29]
 [ 1021    33]]
roc auc: 0.8475
pr auc: 0.2491


## decision tree

In [43]:
dt = DecisionTreeClassifier()
dt.fit(x_train_scaled, y_train)

In [44]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23040   923]
 [  732   322]]
roc auc: 0.6335
pr auc: 0.2967


## random forest

In [45]:
rf = RandomForestClassifier()
rf.fit(x_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [46]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23901    62]
 [  897   157]]
roc auc: 0.8954
pr auc: 0.44


## xgboost

In [47]:
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)

In [48]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23741   222]
 [  747   307]]
roc auc: 0.9112
pr auc: 0.4257


# oversampling (class 1)
-- no significant improvements for default params, mainly increase in AUC values (both ROC and PR) for decision tree, but in general lower accuracy observed across all 4 models

-- also tested ADASYN (different oversampling technique), similar results obtained but slightly worse than SMOTE

In [49]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [50]:
# sm = SMOTE()
sm = ADASYN()
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

In [51]:
Counter(y_train_sm.label)

Counter({1: 93014, 0: 92400})

## logistic regression

In [52]:
lr = LogisticRegression()
lr.fit(x_train_sm, y_train_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[17663  6300]
 [  282   772]]
roc auc: 0.8102
pr auc: 0.2057


## decision tree

In [54]:
dt = DecisionTreeClassifier()
dt.fit(x_train_sm, y_train_sm)

In [55]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21918  2045]
 [  660   394]]
roc auc: 0.6442
pr auc: 0.2809


## random forest

In [56]:
rf = RandomForestClassifier()
rf.fit(x_train_sm, y_train_sm)

  return fit_method(estimator, *args, **kwargs)


In [57]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23485   478]
 [  612   442]]
roc auc: 0.9132
pr auc: 0.421


## xgboost

In [58]:
xgb = XGBClassifier()
xgb.fit(x_train_sm, y_train_sm)

In [59]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23508   455]
 [  705   349]]
roc auc: 0.8917
pr auc: 0.3385


# scaling > oversampling
-- using standardscaler and smote since the individual components perform better than minmaxscaler and adasyn respectively

In [86]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [87]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [88]:
sm = SMOTE()
x_train_scaled_sm, y_train_scaled_sm = sm.fit_resample(x_train_scaled, y_train)

In [89]:
Counter(y_train_sm.label)

Counter({0: 92400, 1: 92400})

## logistic regression

In [90]:
lr = LogisticRegression()
lr.fit(x_train_scaled_sm, y_train_scaled_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[18707  5256]
 [  244   810]]
roc auc: 0.8531
pr auc: 0.2278


## decision tree

In [92]:
dt = DecisionTreeClassifier()
dt.fit(x_train_scaled_sm, y_train_scaled_sm)

In [93]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21848  2115]
 [  586   468]]
roc auc: 0.6779
pr auc: 0.3243


## random forest

In [94]:
rf = RandomForestClassifier()
rf.fit(x_train_scaled_sm, y_train_scaled_sm)

  return fit_method(estimator, *args, **kwargs)


In [95]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23396   567]
 [  551   503]]
roc auc: 0.9164
pr auc: 0.4316


## xgboost

In [96]:
xgb = XGBClassifier()
xgb.fit(x_train_scaled_sm, y_train_scaled_sm)

In [97]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23533   430]
 [  687   367]]
roc auc: 0.8916
pr auc: 0.3732


# oversampling > scaling

In [98]:
Counter(y_train.label)

Counter({0: 92400, 1: 4421})

In [99]:
sm = SMOTE()
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

In [100]:
Counter(y_train_sm.label)

Counter({0: 92400, 1: 92400})

In [101]:
scaler = StandardScaler()
x_train_sm_scaled = scaler.fit_transform(x_train_sm)
x_test_sm_scaled = scaler.transform(x_test)

## logistic regression

In [102]:
lr = LogisticRegression()
lr.fit(x_train_sm_scaled, y_train_sm)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[18595  5368]
 [  231   823]]
roc auc: 0.855
pr auc: 0.2286


## decision tree

In [104]:
dt = DecisionTreeClassifier()
dt.fit(x_train_sm_scaled, y_train_sm)

In [105]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[21951  2012]
 [  579   475]]
roc auc: 0.6834
pr auc: 0.3324


## random forest

In [107]:
rf = RandomForestClassifier()
rf.fit(x_train_sm_scaled, y_train_sm)

  return fit_method(estimator, *args, **kwargs)


In [108]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23476   487]
 [  598   456]]
roc auc: 0.9173
pr auc: 0.4353


## xgboost

In [109]:
xgb = XGBClassifier()
xgb.fit(x_train_sm_scaled, y_train_sm)

In [110]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_sm_scaled))}')
# print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_sm_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_sm_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_sm_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23524   439]
 [  688   366]]
roc auc: 0.8946
pr auc: 0.3677


# end