Здесь используется датасет данных о хронических заболеваниях пациентов с коронавирусом и их симптомах. Цель - вычислить, насколько изменяется вероятность подхватить тот или иной симптом при наличии тех или иных хронических заболеваний путём построения логистической мультиклассовой регрессии с пересекающимися классами.

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

df = pd.read_csv('patient_data.csv')
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,cancer,diabetes,hypertension,coronary_heart_disease,renal_disease,immunodeficiency,hepatit,old_age,gender,fever,cough,shortness of breath,diarrhea,fatigue,vomit,interstitial abnormalities,ground-glass opacity
0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0


Названия хронических заболеваний и симптомов соответственно в списках diseases_name и symptoms_name. 

In [29]:
diseases_name = ['cancer', 'diabetes', 'hypertension','coronary_heart_disease','renal_disease',
                 'immunodeficiency','hepatit', 'old_age', 'gender']
df = df.sample(frac=1).reset_index(drop=True)

X = df[diseases_name]

symptoms_name = ['fever', 'cough','shortness of breath','diarrhea', 'fatigue', 'vomit', 'interstitial abnormalities',
                 'ground-glass opacity']
Y = df[symptoms_name]
frac = 650
train_X, test_X = X[:frac], X[frac:]
train_Y, test_Y = Y[:frac], Y[frac:]
print('Сейчас датасет содержит данные о {} пациентах и обучается на данных о {} пациентах.'.format(df.shape[0], frac))

Сейчас датасет содержит данные о 877 пациентах и обучается на данных о 630 пациентах.


In [35]:
X.corr() # параметры почти не коррелируют друг с другом(кроме очевидного старого возраста)

Unnamed: 0,cancer,diabetes,hypertension,coronary_heart_disease,renal_disease,immunodeficiency,hepatit,old_age,gender
cancer,1.0,-0.004739,0.073227,0.061846,-0.019141,-0.003629,-0.008132,0.073373,0.009172
diabetes,-0.004739,1.0,0.493128,0.251364,0.206047,-0.012121,0.068131,0.151515,0.052307
hypertension,0.073227,0.493128,1.0,0.350787,0.271509,-0.014412,0.051631,0.272396,0.004621
coronary_heart_disease,0.061846,0.251364,0.350787,1.0,0.036729,-0.008655,-0.019397,0.220691,-0.010596
renal_disease,-0.019141,0.206047,0.271509,0.036729,1.0,-0.006022,-0.013496,0.100572,0.065725
immunodeficiency,-0.003629,-0.012121,-0.014412,-0.008655,-0.006022,1.0,-0.002558,-0.031154,-0.03793
hepatit,-0.008132,0.068131,0.051631,-0.019397,-0.013496,-0.002558,1.0,-0.009044,0.067451
old_age,0.073373,0.151515,0.272396,0.220691,0.100572,-0.031154,-0.009044,1.0,0.001355
gender,0.009172,0.052307,0.004621,-0.010596,0.065725,-0.03793,0.067451,0.001355,1.0


In [30]:
class LogisticRegression():
    def __init__(self, lr=1e-3, num_steps=10000, tol=1e-5):
        self.X = None
        self.Y = None
        self.beta = None
        self.num_steps = num_steps
        self.lr = lr
        self.tol = 1e-5
    def fit(self, X, Y):
        intercept = np.ones((X.shape[0], 1))
        self.X = np.hstack((intercept, X.copy()))
        self.Y = Y.copy()
        self.beta = np.ones(self.X.shape[1])
        self.logistic_regression(self.X, self.Y)

    def sigmoid(self, X):
        z = X @ self.beta
        return 1 / (1 + np.exp(-z))

    def loss(self):
        h = self.sigmoid(self.X)
        cost = (((-self.Y).T @ np.log(h))-((1-self.Y).T @ np.log(1-h))).mean()
        return cost

    def logistic_regression(self, X, Y):
        loss_history = [0]
        for step in range(self.num_steps):
            predictions = self.sigmoid(self.X)
            self.beta += self.lr * np.dot(self.X.T, self.Y - predictions)
            loss_step = self.loss()
            if abs(loss_step - loss_history[step]) < self.tol:
                break
            loss_history.append(loss_step)
        return self.beta

    def predict(self, X):
        X_ = X.copy()
        intercept = np.ones((X_.shape[0], 1))
        X_ = np.hstack((intercept, X_))
        return np.round(self.sigmoid(X_))

    def predict_proba(self, X):
        X_ = X.copy()
        intercept = np.ones((X_.shape[0], 1))
        X_ = np.hstack((intercept, X_))
        return self.sigmoid(X_)


Будем использовать метод один против всех и обучим по классификатору для каждого класса.

In [31]:
class OnevsRestClassifier():
    def __init__(self, model, params = dict()):
        self.num_classes = 0
        self.clfs = []
        self.model = model
        self.params = params

    def fit(self, X, Y):
        self.num_classes = Y.shape[1]
        classes = Y.columns
        for i in range(self.num_classes):
            clf = self.model(**self.params)
            clf.fit(X, Y[classes[i]])
            self.clfs.append(clf)
        print('Fitted!')

    def predict(self, X): 
        preds = []
        for i in range(self.num_classes):
            pred = self.clfs[i].predict(X)
            preds.append(pred)
        return np.array(preds).T

    def predict_proba(self, X):
        preds = []
        for i in range(self.num_classes):
            pred = self.clfs[i].predict_proba(X)
            preds.append(pred)
        return np.array(preds).T

    def logloss(self, y_true, y_score):
        return -((1 - y_true) * np.log(1 - y_score) + y_true * np.log(y_score)).mean()

    def get_loss(self, y_true, y_score):
        acc = 0
        for i in range(self.num_classes):
            acc += self.logloss(y_true[:, i],y_score[:, i])
        return acc

num_steps = 20000
lr = 1e-3
df = df.sample(frac=1).reset_index(drop=True)
X = df[diseases_name]
Y = df[symptoms_name]
train_X, test_X = X[:frac], X[frac:]
train_Y, test_Y = Y[:frac], Y[frac:]
params = {'lr': lr, 'num_steps':num_steps}
clf = OnevsRestClassifier(LogisticRegression, params)
clf.fit(train_X, train_Y)
preds = clf.predict_proba(test_X)
print("Log-loss metrics:", clf.get_loss(test_Y.values, preds)) 

Fitted!
Log-loss metrics: 3.8896965899925364


In [25]:
def accuracy_hamming(preds, y):
    acc = 0
    tps = []
    tns = []
    for i in range(y.shape[0]):
        y_ = (y.reset_index(drop=True).loc[i]).to_numpy()
        pred_ = preds[i]
        cl = 0
        cr = 0
        for j in range(len(y_)):
            if y_[j] != pred_[j] and y_[j] == 1:
                cl += 1
            if y_[j] != pred_[j] and pred_[j] == 1:
                cr += 1
        acc += (cr + cl) / y.shape[1]
    return acc/(y.shape[0])

print('Hamming multilabel accuracy:', accuracy_hamming(preds, test_Y)) #чем ближе к 0 тем лучше

Hamming multilabel accuracy: 0.29873646209386284


In [27]:
print(roc_auc_score(test_Y.values, preds, average='micro')) 

0.8095369322710712


Вообще, в датасете так процентно распределены симптомы и болезни(даже соотносится с официальной статистикой):

In [36]:
for col in diseases_name:
    print(col, df[col].sum()/df.shape[0])

cancer 0.011402508551881414
diabetes 0.11402508551881414
hypertension 0.15393386545039908
coronary_heart_disease 0.06157354618015964
renal_disease 0.03078677309007982
immunodeficiency 0.0011402508551881414
hepatit 0.005701254275940707
old_age 0.45952109464082097
gender 0.5575826681870011


In [37]:
for col in symptoms_name:
    print(col, df[col].sum()/df.shape[0])

fever 0.8027366020524516
cough 0.38426453819840367
shortness of breath 0.3557582668187001
diarrhea 0.07639680729760548
fatigue 0.30444697833523376
vomit 0.09464082098061574
interstitial abnormalities 0.05473204104903079
ground-glass opacity 0.35461801596351195
