# Imports

In [1]:
import pandas as pd
import catboost
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from catboost import CatBoostClassifier

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, consensus_score

# Load and preprocess data

## Load data

In [6]:
# Load the source tables; the files are read in the same order as the names on the left.
dataset, symptom_data, symptom_precaution, symptom_severity = map(
    pd.read_csv,
    (
        'data/dataset.csv',
        'data/symptom_Description.csv',
        'data/symptom_precaution.csv',
        'data/Symptom-severity.csv',
    ),
)

In [7]:
# Fill missing values with the string placeholder '0'.
# The same '0' token is added to the symptom encoder's vocabulary further down,
# so filled cells remain encodable.
dataset = dataset.fillna(value='0')

In [8]:
# Remove every space character from the values of all columns after the first one.
# Uses the vectorised string accessor rather than a per-element Python lambda;
# regex=False makes ' ' a literal substring, so no regex engine is involved.
# Relies on the preceding fillna('0'): the columns hold only strings at this point.
for col in dataset.columns[1:]:
    dataset[col] = dataset[col].str.replace(' ', '', regex=False)

## Label encoders (LE)

In [17]:
# Sanity check: encoding the encoder's own classes should return the codes 0..N-1.
# NOTE(review): symptom_le is only created in a later cell of this notebook, so on
# Restart & Run All this cell raises NameError. It should be moved below the cell
# that fits the encoders.
symptom_le.transform(symptom_le.classes_)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132])

In [10]:
# Fit the label encoders for symptoms and diseases.
# The symptom vocabulary is the Symptom column of the severity table plus '0',
# the placeholder that replaced missing values in the dataset.
symptom_le = LabelEncoder().fit(list(symptom_severity['Symptom'].values) + ['0'])
disease_le = LabelEncoder()
# Kept as a bare expression so the cell still ends with the fitted encoder as its value.
disease_le.fit(dataset['Disease'].unique())

## Encode dataset 

In [38]:
# Encode the data: every column after the first goes through symptom_le,
# the first column (the disease) goes through disease_le.
# NOTE: like the loop it replaces, this overwrites `dataset`, so the cell is not
# re-runnable without reloading the data first.
dataset = dataset.assign(
    **{name: symptom_le.transform(dataset[name].values) for name in dataset.columns[1:]},
    **{dataset.columns[0]: disease_le.transform(dataset[dataset.columns[0]].values)},
)

## Split into features/target and train/test sets

In [41]:
# Split the dataset into the feature matrix X (all columns after the first)
# and the target y (the 'Disease' column, kept as a one-column DataFrame).
X = dataset.iloc[:, 1:]
y = dataset[['Disease']]

In [43]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create and train the model

In [45]:
# Configure a multi-class CatBoost classifier: 100 boosting iterations,
# trees of depth 6, learning rate 0.1.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=100,
    learning_rate=0.1,
    depth=6,
    verbose=False,
)

In [46]:
# Train on the training split. Left as a bare expression so the cell shows the fitted model.
model.fit(X=X_train, y=y_train)

<catboost.core.CatBoostClassifier at 0x1329529a0>

# Evaluate the model

In [48]:
# Predict on the held-out split and flatten the result to a 1-D array of class codes.
# NOTE(review): presumably predict() returns an (n_samples, 1) column for MultiClass — confirm.
y_pred = model.predict(X_test).ravel()

In [55]:
# Overall accuracy plus the per-class precision / recall / F1 table on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [56]:
# Print both metrics; sep='\n' puts each on its own line, exactly as two separate prints would.
print(
    f'accuracy {accuracy}',
    f'Classification Report:\n {report}',
    sep='\n',
)

accuracy 0.997289972899729
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        39
           2       0.91      1.00      0.95        41
           3       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        35
           5       1.00      1.00      1.00        36
           6       1.00      1.00      1.00        44
           7       1.00      1.00      1.00        32
           8       1.00      1.00      1.00        35
           9       1.00      1.00      1.00        30
          10       1.00      1.00      1.00        31
          11       1.00      1.00      1.00        40
          12       1.00      1.00      1.00        33
          13       1.00      0.91      0.95        45
          14       1.00      1.00      1.00        35
          15       1.00      1.00      1.00        28
          16       1.00      1

# Save the model and label encoders

In [None]:
# Write the trained model to the file 'model' in CatBoost's JSON format.
model.save_model(fname='model', format='json')

In [23]:
import json

# Persist the code -> label lookup of both encoders so predictions can be decoded
# without re-fitting. Keys are the integer codes as strings (JSON object keys must
# be strings); values are the original labels.
#
# Fix: the two sections were swapped. 'Disease' was built from symptom_le and
# 'Symptom' from disease_le, so parameters.json was mislabelled.
# NOTE(review): a consumer that compensated for the old swapped layout must be updated.
dict_le = {
    'Disease': {
        str(code): label
        for code, label in zip(disease_le.transform(disease_le.classes_), disease_le.classes_)
    },
    'Symptom': {
        str(code): label
        for code, label in zip(symptom_le.transform(symptom_le.classes_), symptom_le.classes_)
    },
}

with open('parameters.json', 'w') as file:
    json.dump(dict_le, file)