In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import warnings
import pickle
warnings.filterwarnings('ignore')

In [2]:
# Load the feature table and the label table from their pickle files in one pass.
# NOTE(review): unpickling executes arbitrary code, so these files must come from
# a trusted source.
dataset, Y = map(pd.read_pickle, ('emtab_normalized.pkl', 'emtab_tags.pkl'))

In [3]:
# Show the loaded feature table as this cell's output (the notebook renders a
# truncated preview of the first and last rows/columns).
# presumably one row per sample and one column per selected feature index - confirm
dataset

Unnamed: 0,34,89,197,198,391,395,413,446,560,602,...,17863,17949,17958,18119,18198,18261,18355,18813,18841,18917
0,0.000000,0.000000,0.000000,0.000000,0.069684,0.287356,0.000000,0.070881,0.305077,0.204110,...,0.298851,0.000000,0.147510,0.054371,0.059500,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.025579,0.000000,0.000000,0.000000,0.000000,0.000000,0.043669,0.008579,0.000000,0.024705,...,0.000000,0.023508,0.035708,0.100932,0.108024,0.019129,0.082086,0.000000,0.076517,0.613685
2,0.011825,0.016659,0.286707,0.000000,0.016517,0.000000,0.000000,0.210013,0.000000,0.036285,...,0.017709,0.506399,0.069928,0.034636,0.017629,0.020281,0.051160,0.000000,0.056193,0.006286
3,0.021302,0.012004,0.098601,0.034574,0.117531,0.092025,0.007703,0.273906,0.058620,0.078438,...,0.114847,0.065271,0.069284,0.040024,0.047001,0.025143,0.052524,0.076026,0.020245,0.000000
4,0.000000,0.000000,0.000000,0.349311,0.000000,0.000000,0.000000,0.244628,0.000000,0.264162,...,0.257851,0.041894,0.000000,0.023456,0.000000,0.000000,0.279339,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6173,0.000000,0.115521,0.180740,0.166361,0.000000,0.039360,0.000000,0.223301,0.000000,0.083872,...,0.040934,0.266028,0.161637,0.037237,0.008150,0.089297,0.179597,0.051343,0.043296,0.000000
6174,0.075690,0.011848,0.098611,0.000000,0.108661,0.060552,0.152053,0.101566,0.038572,0.086021,...,0.000000,0.130965,0.018650,0.026147,0.012538,0.042882,0.108678,0.078987,0.053286,0.039158
6175,0.078676,0.083129,0.007226,0.079809,0.006868,0.000000,0.106684,0.013973,0.120279,0.010059,...,0.000000,0.000000,0.014539,0.113574,0.035187,0.103725,0.391837,0.000000,0.155778,0.000000
6176,0.037349,0.026309,0.082323,0.025258,0.134771,0.044819,0.056273,0.148142,0.095166,0.098689,...,0.000000,0.137831,0.013804,0.031645,0.009280,0.018544,0.014504,0.000112,0.069021,0.055479


In [4]:
# Show the loaded label table as this cell's output; the rendered preview has a
# single `result` column holding 0/1 values.
Y

Unnamed: 0,result
0,0
1,0
2,0
3,0
4,0
...,...
6173,1
6174,1
6175,1
6176,1


In [None]:
# Hold out 33% of the samples for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify keeps the class ratio of Y
# the same in the training and test partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset, Y, test_size=0.33, shuffle=True, random_state=0, stratify=Y
)

In [None]:
# Base learners for the ensemble. Each one is given the same fixed seed so that
# re-running the notebook trains identical models and reports identical scores.

# Random forest classifier (seed fixes the bootstrap samples and feature draws)
rfc = RandomForestClassifier(n_estimators=100, random_state=0)

# Support vector machine classifier; probability=True enables predict_proba,
# and random_state seeds the data shuffling used for those probability estimates
svmc = svm.SVC(kernel='linear', C=1, probability=True, random_state=0)

# Logistic regression classifier
lrc = LogisticRegression(random_state=0)

In [None]:
# Soft-voting ensemble: averages the predicted class probabilities of the three
# base learners, with the random forest weighted three times as heavily as the
# logistic regression and the SVM.
voting_clf = VotingClassifier(
    estimators=list(zip(('lrc', 'rfc', 'svmc'), (lrc, rfc, svmc))),
    weights=[1, 3, 1],
    voting='soft',
)

In [None]:
# Train the soft-voting ensemble on the training split.
# NOTE(review): the per-classifier evaluation loop later in this notebook calls
# fit() on voting_clf again, so this training run is repeated there.
voting_clf.fit(X_train, Y_train)

In [None]:
# Fit every model on the training split and report its hold-out metrics.
for clf in (lrc, rfc, svmc, voting_clf):
    clf.fit(X_train, Y_train)
    # hard 0/1 predictions drive the threshold-based metrics (accuracy, F1)
    Y_pred = clf.predict(X_test)
    # ROC-AUC is a ranking metric: score it with the predicted probability of the
    # positive class (column 1 of predict_proba), not with thresholded labels,
    # otherwise the curve collapses to a single operating point
    Y_score = clf.predict_proba(X_test)[:, 1]
    print(f"Classifier: {clf.__class__.__name__}, Accuracy: {accuracy_score(Y_test, Y_pred)},  F1: {f1_score(Y_test, Y_pred)}, ROC-AUC: {roc_auc_score(Y_test, Y_score)}")

In [None]:
# Predict class probabilities on the test split with every classifier.
# The models were already trained in the evaluation loop, so they are reused
# here instead of being fitted again: this avoids a redundant training pass and
# guarantees the probabilities come from the same models whose metrics were
# reported.
probas = [c.predict_proba(X_test) for c in (lrc, rfc, svmc, voting_clf)]

# class probabilities for the first test sample (row 0 of X_test), one entry
# per classifier: column 0 is the first class, column 1 the second
class1_1 = [pr[0, 0] for pr in probas]
class2_1 = [pr[0, 1] for pr in probas]

In [None]:
# Grouped bar chart comparing, for one test sample, the two class probabilities
# produced by each base classifier and by the voting ensemble.
N = 4  # one bar group per classifier
width = 0.35  # width of a single bar
ind = np.arange(N)  # x position of each group

fig, ax = plt.subplots()

# Base classifiers occupy the first three groups; the fourth slot is zero-padded
# so the ensemble can be drawn there in a different colour.
p1 = ax.bar(ind, [*class1_1[:-1], 0], width, color="green", edgecolor="k")
p2 = ax.bar(ind + width, [*class2_1[:-1], 0], width, color="lightgreen", edgecolor="k")

# Voting ensemble: only the fourth group is non-zero.
p3 = ax.bar(ind, [0] * 3 + [class1_1[-1]], width, color="blue", edgecolor="k")
p4 = ax.bar(ind + width, [0] * 3 + [class2_1[-1]], width, color="steelblue", edgecolor="k")

# Dashed separator between the base classifiers and the ensemble.
ax.axvline(2.8, color="k", linestyle="dashed")

# Tick labels, one per classifier group.
tick_labels = [
    "LogisticRegression",
    "Random Forest Classifier",
    "Support Vector Machine",
    "VotingClassifier(Soft Voting)",
]
ax.set_xticks(ind + width)
ax.set_xticklabels(tick_labels, rotation=40, ha="right")

# Probabilities live in [0, 1]; title and legend make the figure self-contained.
ax.set_ylim([0, 1])
ax.set_title("Class probabilities for sample 1 by different classifiers")
ax.legend([p1[0], p2[0]], ["class 1", "class 2"], loc="upper left")
fig.tight_layout()
plt.show()