# Imports

In [2]:
import pandas as pd
import catboost
import numpy as np
import matplotlib.pyplot as plt



In [44]:
from catboost import CatBoostClassifier

In [14]:
from sklearn.preprocessing import LabelEncoder

In [42]:
from sklearn.model_selection import train_test_split

In [53]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, consensus_score

# Load and preprocessing data

## Load data

In [34]:
# загрузка датасетов
dataset = pd.read_csv('data/dataset.csv')
symptom_data = pd.read_csv('data/symptom_Description.csv')
symptom_precaution = pd.read_csv('data/symptom_precaution.csv')
symptom_severity = pd.read_csv('data/Symptom-severity.csv')

In [35]:
# заполнение отсутстующих значений
dataset = dataset.fillna('0')

In [36]:
for col in dataset.columns[1:]:
    dataset[col] = dataset[col].apply(lambda x: x.replace(' ', ''))

## LE

In [37]:
# Кодирование болезней и симптомов
symptom_le = LabelEncoder()
symptom_le.fit([*symptom_severity.Symptom.values, '0'])
disease_le = LabelEncoder()
disease_le.fit(dataset.Disease.unique())

## Encode dataset 

In [38]:
# кодирование данных
for col in dataset.columns[1:]:
    dataset[col] = symptom_le.transform(dataset[col].values)
dataset[dataset.columns[0]] = disease_le.transform(dataset[dataset.columns[0]].values)

In [40]:
# переименование столбцов
dataset = dataset.rename(columns={'Disease' : 'target'})

## Division

In [41]:
# разбиение датасета на Х и у
X = dataset[dataset.columns[1:]]
y = dataset['target'].to_frame()

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model create and train

In [None]:
import torch.nn as nn
import torch

In [45]:
# create class nn model
class DiseaseNN(nn.Module):
    def __init__(self, num_classes : int, input_shape: int=17):
        super(self.__class__, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_shape, 300),
            nn.ReLU(),
            nn.BatchNorm1d(300),
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),
            nn.Linear(100, num_classes)
        )
    
    def forward(self, input_data):
        out = self.model(input_data)
        return out

In [46]:
# make model 
n_epochs = 30
device = 'cpu'

model = DiseaseNN().to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.CrossEntropyLoss()

<catboost.core.CatBoostClassifier at 0x1329529a0>

In [None]:
# train
accuracy = metric.Accuracy('multiclass', num_classes = len(train_loader.dataset.classes))

train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []

for epoch in range(n_epochs):
    print("Number of epochs: ", epoch+1, "of 30")
    epoch_train_loss = []
    epoch_test_loss = []
    epoch_test_accuracy = []

    model.train(True)
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        y_pred = model(X_batch)
        loss = loss_func(y_pred, y_batch)
        loss.backward()
        opt.step()
        opt.zero_grad()
        epoch_train_loss.append(loss.item())
    train_loss.append(epoch_train_loss)

    model.train(False)
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)
            loss = loss_func(y_pred, y_batch)
            epoch_test_accuracy.append(accuracy(y_pred, y_batch))
            epoch_test_loss.append(loss)
    test_loss.append(epoch_test_loss)
    test_accuracy.append(epoch_test_accuracy)

# Check work model

In [48]:
y_pred = model.predict(X_test).reshape(-1)

In [55]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [56]:
print(f'accuracy {accuracy}')
print(f'Classification Report:\n {report}')

accuracy 0.997289972899729
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        39
           2       0.91      1.00      0.95        41
           3       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        35
           5       1.00      1.00      1.00        36
           6       1.00      1.00      1.00        44
           7       1.00      1.00      1.00        32
           8       1.00      1.00      1.00        35
           9       1.00      1.00      1.00        30
          10       1.00      1.00      1.00        31
          11       1.00      1.00      1.00        40
          12       1.00      1.00      1.00        33
          13       1.00      0.91      0.95        45
          14       1.00      1.00      1.00        35
          15       1.00      1.00      1.00        28
          16       1.00      1

# Save model

In [57]:
model.save_model(
    'model.json',
    format='json'
)