In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn import pipeline,preprocessing,metrics,model_selection,ensemble, tree
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

# !pip install graphviz

In [2]:
data=pd.read_csv('animal_symptoms.csv')

In [3]:
data.disease = data.disease.fillna(method="ffill")

In [4]:
data.head()

Unnamed: 0,disease,symptom
0,bloat,distended left abdomen
1,bloat,discomfort
2,bloat,bellowing
3,bovine viral diarrhea BVD,fever
4,bovine viral diarrhea BVD,lethargy


In [5]:
n_unique = len(data['symptom'].unique())
n_unique

68

In [6]:
le = LabelEncoder()
binarized = le.fit_transform(data['symptom'])
print(binarized)

[19 18  3 21 33 34 37 36 38 16 13  0 55 66 50 61 32  8 63 21  5  6 13 67
 34 22 30 31 28 13 14 46 51 24  4 53 15 34 21 29 27 52 20  0 55  7 13 46
  1 16 47  5 59 25 45 65  9 13 34 48  0 54 10 43 10 41 21 57 44 17 30 11
 39 40 42 56  3  4 35 49 36 60 26 58 64 59 62 21 52 21 29 23 12  2 15]


In [7]:
oe = OneHotEncoder(sparse=False)
binarized = binarized.reshape(len(binarized), 1)
onehot_encoded = oe.fit_transform(binarized)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
len(onehot_encoded[0])

68

In [9]:
cols = np.array(data['symptom'].unique())
cols

array(['distended left abdomen', 'discomfort', 'bellowing', 'fever',
       'lethargy', 'loss of appetite', 'ocular discharge',
       'nasal discharge', 'oral lesions', 'diarrhea',
       'decreasing milk production', 'abortion', 'stillborn', 'weak calf',
       'retention of fetal membrane', 'swollen testicles',
       'lesions on skin', 'bold plaques', 'tufts of hair coming off',
       'blisters in the mouth', 'blisters on feet', 'weight loss',
       'frothing of mouth', 'lameness', 'late abortion', 'jaundice',
       'depressed appetite', 'reduced fertility', 'rubbing', 'hair loss',
       'biting', 'scratching', 'depression', 'lack of coordination',
       'isolation from herd', 'salivation', 'facial paralysis',
       'blood poisoning', 'anaemia', 'reduced live weight gain',
       'swelling in udders', 'hardness of udders',
       'redness or pain in udders', 'watery milk', 'clots in milk',
       'reduction in mobility', 'stillbirth', 'coughing', 'pus from nose',
       'prog

In [10]:
df_ohe = pd.DataFrame(columns=cols)
df_ohe.head()

Unnamed: 0,distended left abdomen,discomfort,bellowing,fever,lethargy,loss of appetite,ocular discharge,nasal discharge,oral lesions,diarrhea,...,mounting others,restlessness,swelling of head and neck,inflammation of the eyes,swelling in mouth,ulcers in mouth,tiredness,grinding teeth,dark coloured urine,anorexia


In [11]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [12]:
df_ohe.head()

Unnamed: 0,distended left abdomen,discomfort,bellowing,fever,lethargy,loss of appetite,ocular discharge,nasal discharge,oral lesions,diarrhea,...,mounting others,restlessness,swelling of head and neck,inflammation of the eyes,swelling in mouth,ulcers in mouth,tiredness,grinding teeth,dark coloured urine,anorexia
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
len(df_ohe)

95

In [14]:
df_disease = data['disease']
df_disease.head()

0                        bloat
1                        bloat
2                        bloat
3    bovine viral diarrhea BVD
4    bovine viral diarrhea BVD
Name: disease, dtype: object

In [15]:
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,distended left abdomen,discomfort,bellowing,fever,lethargy,loss of appetite,ocular discharge,nasal discharge,oral lesions,...,mounting others,restlessness,swelling of head and neck,inflammation of the eyes,swelling in mouth,ulcers in mouth,tiredness,grinding teeth,dark coloured urine,anorexia
0,bloat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bloat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bloat,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bovine viral diarrhea BVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bovine viral diarrhea BVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_concat.drop_duplicates(keep='first', inplace=True)

In [17]:
df_concat.head()

Unnamed: 0,disease,distended left abdomen,discomfort,bellowing,fever,lethargy,loss of appetite,ocular discharge,nasal discharge,oral lesions,...,mounting others,restlessness,swelling of head and neck,inflammation of the eyes,swelling in mouth,ulcers in mouth,tiredness,grinding teeth,dark coloured urine,anorexia
0,bloat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bloat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bloat,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bovine viral diarrhea BVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bovine viral diarrhea BVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
len(df_concat)

95

In [19]:
cols = df_concat.columns
cols

Index(['disease', 'distended left abdomen', 'discomfort', 'bellowing', 'fever',
       'lethargy', 'loss of appetite', 'ocular discharge', 'nasal discharge',
       'oral lesions', 'diarrhea', 'decreasing milk production', 'abortion',
       'stillborn', 'weak calf', 'retention of fetal membrane',
       'swollen testicles', 'lesions on skin', 'bold plaques',
       'tufts of hair coming off', 'blisters in the mouth', 'blisters on feet',
       'weight loss', 'frothing of mouth', 'lameness', 'late abortion',
       'jaundice', 'depressed appetite', 'reduced fertility', 'rubbing',
       'hair loss', 'biting', 'scratching', 'depression',
       'lack of coordination', 'isolation from herd', 'salivation',
       'facial paralysis', 'blood poisoning', 'anaemia',
       'reduced live weight gain', 'swelling in udders', 'hardness of udders',
       'redness or pain in udders', 'watery milk', 'clots in milk',
       'reduction in mobility', 'stillbirth', 'coughing', 'pus from nose',
       '

In [20]:
cols = cols[1:]
cols

Index(['distended left abdomen', 'discomfort', 'bellowing', 'fever',
       'lethargy', 'loss of appetite', 'ocular discharge', 'nasal discharge',
       'oral lesions', 'diarrhea', 'decreasing milk production', 'abortion',
       'stillborn', 'weak calf', 'retention of fetal membrane',
       'swollen testicles', 'lesions on skin', 'bold plaques',
       'tufts of hair coming off', 'blisters in the mouth', 'blisters on feet',
       'weight loss', 'frothing of mouth', 'lameness', 'late abortion',
       'jaundice', 'depressed appetite', 'reduced fertility', 'rubbing',
       'hair loss', 'biting', 'scratching', 'depression',
       'lack of coordination', 'isolation from herd', 'salivation',
       'facial paralysis', 'blood poisoning', 'anaemia',
       'reduced live weight gain', 'swelling in udders', 'hardness of udders',
       'redness or pain in udders', 'watery milk', 'clots in milk',
       'reduction in mobility', 'stillbirth', 'coughing', 'pus from nose',
       'progressive

In [21]:
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
num_lables = le.fit_transform(df_concat['disease'])
df_num_lables = pd.DataFrame(num_lables)
df_num_lables.columns = ['label']
df_concat = pd.concat([df_num_lables,df_concat], axis=1)
df_concat[:5]

Unnamed: 0,label,disease,distended left abdomen,discomfort,bellowing,fever,lethargy,loss of appetite,ocular discharge,nasal discharge,...,mounting others,restlessness,swelling of head and neck,inflammation of the eyes,swelling in mouth,ulcers in mouth,tiredness,grinding teeth,dark coloured urine,anorexia
0,0,blackleg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,bloat,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,blue tongue,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,3,bovine babesiosis (tick fever),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,bovine viral diarrhea BVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df_concat.to_csv("final_data.csv", index=False)

In [23]:
len(df_concat)

18

In [24]:
X = df_concat[cols]
y = df_concat['disease']


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)

In [26]:
len(X_train), len(y_train)

(12, 12)

In [27]:
len(X_test), len(y_test)

(6, 6)

In [28]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
# clf_dt = dt.fit(X, y)
clf_dt = rf.fit(X, y)

In [29]:
# export_graphviz(dt, out_file='./tree.dot', feature_names=cols)

In [30]:
# from graphviz import Source

# graph = Source(export_graphviz(dt, out_file=None, feature_names=cols))
# png_bytes = graph.pipe(format='png')

# with open('tree.png', 'wb') as f:
    # f.write(png_bytes)

In [31]:
disease_pred = clf_dt.predict(X_test)

In [32]:
disease_real = y_test.values

In [39]:
print(metrics.accuracy_score(y_test, disease_pred))

1.0


In [107]:
for i in range(0, len(disease_real)):
    if disease_pred[i] == disease_real[i]:
        print('Pred: {0} Actual: {1} '. format(disease_pred[i], disease_real[i]))

Pred: bloat Actual: bloat 
Pred: neosporosis Actual: neosporosis 
Pred: bovine viral diarrhea BVD Actual: bovine viral diarrhea BVD 
Pred: tuberculosis Actual: tuberculosis 
Pred: foot and mouth disease Actual: foot and mouth disease 
Pred: leptospirosis Actual: leptospirosis 


In [108]:
import joblib

In [109]:
joblib.dump(clf_dt, 'animals.sav')

['animals.sav']

In [110]:
modela = joblib.load('animals.sav')

In [111]:
modela.predict(X_test)

array(['bloat', 'neosporosis', 'bovine viral diarrhea BVD',
       'tuberculosis', 'foot and mouth disease', 'leptospirosis'],
      dtype=object)