# Modelo de predição a doenças

# 1. Setup do projeto

In [1]:
import joblib
import os


import numpy as np
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

#Classificador
from sklearn.tree import DecisionTreeClassifier

print(f'Matplot version: {matplotlib.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
print(f'sklearn version: {sklearn.__version__}')

DATA_PATH = os.path.join('..', 'data', 'raw')
DATA_PATH_PROCESSED = os.path.join('..', 'data', 'processed')
DATA_FILE = 'dataset.csv'
DATA_SYMPTON_DESC = 'symptom_Description.csv'
DATA_SYMPTOM_PREC = 'symptom_precaution.csv'

DISEASE_DATA = os.path.join(DATA_PATH, DATA_FILE)

DEPLOY_PATH = os.path.join('..', 'model')
DEPLOY_FILE = 'finalized_model.sav'

RANDOM_STATE = 42

Matplot version: 3.4.2
Numpy version: 1.21.0
Pandas version: 1.2.5
sklearn version: 0.24.2


# 2. Lendo os dados

In [2]:
def load_data(data_path, data_file):
    local_path = os.path.join(data_path, data_file)
    return pd.read_csv(local_path)

In [3]:
disease_sympton = load_data(DATA_PATH, DATA_FILE)
sympton_description = load_data(DATA_PATH, DATA_SYMPTON_DESC)
sympton_precaution = load_data(DATA_PATH, DATA_SYMPTOM_PREC)

## 2.1. Listagem das doenças

In [4]:
disease_sympton['Disease'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [5]:
len(disease_sympton['Disease'].unique())

41

## 2.2. Modelando o arquivo para treinamento

In [7]:
disease_sympton.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [41]:
cols = [i for i in disease_sympton.iloc[:,1:].columns]
cols

['Symptom_1',
 'Symptom_2',
 'Symptom_3',
 'Symptom_4',
 'Symptom_5',
 'Symptom_6',
 'Symptom_7',
 'Symptom_8',
 'Symptom_9',
 'Symptom_10',
 'Symptom_11',
 'Symptom_12',
 'Symptom_13',
 'Symptom_14',
 'Symptom_15',
 'Symptom_16',
 'Symptom_17']

In [16]:
tmp = pd.melt(disease_sympton.reset_index() ,id_vars = ['index'], value_vars = cols)
tmp['add1'] = 1
tmp.head()

Unnamed: 0,index,variable,value,add1
0,0,Symptom_1,itching,1
1,1,Symptom_1,skin_rash,1
2,2,Symptom_1,itching,1
3,3,Symptom_1,itching,1
4,4,Symptom_1,itching,1


In [49]:
# Pivot table
diseases = pd.pivot_table(tmp, values = 'add1', index = 'index', columns = 'value')

# Add labels column
diseases.insert(0,'label',disease_sympton['Disease'])

# Preenche NaN com zero
diseases = diseases.fillna(0)
diseases.columns = diseases.columns.str.strip()
diseases.iloc[:,1:] = diseases.iloc[:,1:].astype(int)
diseases

value,label,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,...,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin,itching
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,Acne,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
diseases.shape

(4920, 132)

## 2.3. Salvando o arquivo processado

In [51]:
diseases.to_csv(DATA_PATH_PROCESSED + '\\Training.csv')
print('Arquivo salvo')

Arquivo salvo


# 3. Treinamento

## 3.1. Separação dos conjuntos para treinamento

In [52]:
x = diseases.drop('label', axis = 1)
x.head()

value,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,bladder_discomfort,...,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin,itching
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [53]:
y = diseases['label']
y.head()

index
0    Fungal infection
1    Fungal infection
2    Fungal infection
3    Fungal infection
4    Fungal infection
Name: label, dtype: object

## 3.2. Mapeando as features em números com o LabelEncoder

In [54]:
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [55]:
y

array([15, 15, 15, ..., 38, 35, 27])

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, stratify=y, random_state=RANDOM_STATE)

# 4. Criando o modelo

## 4.1. Por que utilizaremos o classificador DecisionTreeClassifier?
### Este classificador em particular possuí um atributo tree_ , e é com este atributo que iremos definir uma função recursiva para o que o bot faça perguntas ao usuário

In [57]:
clf = DecisionTreeClassifier()

# 5. Testando o modelo

In [58]:
model = clf.fit(x_train, y_train)

In [59]:
cv_score = cross_val_score(model, x_test, y_test)
print(f'Acurácia do modelo: ', cv_score.mean()*100)


Acurácia do modelo:  98.23843334860284


In [60]:
predicoes = model.predict(x_test)
predicoes_transformadas = le.inverse_transform(predicoes)
print(f'Predições: ', predicoes)
print(f'Predições transformadas: ', predicoes_transformadas)

Predições:  [24 10 21 ... 34 31  5]
Predições transformadas:  ['Hyperthyroidism' 'Common Cold' 'Hepatitis D' ... 'Pneumonia'
 'Osteoarthristis' 'Arthritis']


# 6. Deploy do Modelo

In [61]:
local_path = os.path.join(DEPLOY_PATH, DEPLOY_FILE)
joblib.dump(model, local_path)

['..\\model\\finalized_model.sav']