### Načítanie knižníc

In [397]:
# nacitanie kniznic pre analyzu dat
import pandas as pd
import numpy as np

# nacitanie kniznic pre rozhodovacie stromy
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# kniznica na zobrazenie frame-u
from IPython.display import IFrame

#knižnice pre znázornenie rozhodovacieho stromu
import pydotplus
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image

### Načítanie datasetu

In [398]:
# dataset sa nachadza na tejto adrese: https://www.kaggle.com/bitsofishan/covid19-patient-symptoms
IFrame('https://www.kaggle.com', width = 800, height = 450)

# u mna je lokalizovany lokalne v projekte
# nacitam jeho cestuip
csv_file_path = "../db/COVID-19-Survey.csv"


In [399]:
# nazvy stlpcov, s ktorymi budeme pracovat
# col_names = ['age', 'gender','body-temperature', 'dry-cough', 'sour-throat', 'breathing-problem', 'wearness', 'drowsiness', 'pain-in-chest', 'diabetes','change-in-sns','travel-history']

# nacitame dataset
covid_csv = pd.read_csv(csv_file_path, sep=";")

In [400]:
covid_csv.head(5)

Unnamed: 0,age,gender,body-temperature,dry-cough,sour-throat,breathing-problem,pain-in-chest,weakness,drowsiness,diabetes,change-in-sns,travel-history,result
0,20,F,37.0,0,0,0,0,0,0,0,0,0,0
1,19,F,37.2,1,0,0,0,0,0,0,0,0,0
2,55,M,38.9,1,1,1,1,1,1,0,1,0,1
3,40,M,37.8,0,0,0,1,0,1,1,1,1,2
4,33,F,37.3,0,1,1,0,0,0,0,1,1,2


In [401]:
covid_csv.describe()

Unnamed: 0,age,body-temperature,dry-cough,sour-throat,breathing-problem,pain-in-chest,weakness,drowsiness,diabetes,change-in-sns,travel-history,result
count,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0
mean,36.456693,37.794488,0.488189,0.551181,0.433071,0.370079,0.409449,0.425197,0.244094,0.629921,0.464567,0.937008
std,14.052484,0.870532,0.50184,0.499343,0.497463,0.484738,0.49368,0.496331,0.43125,0.484738,0.500718,0.675513
min,18.0,35.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.5,37.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,33.0,37.6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,45.0,38.55,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
max,80.0,39.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [402]:
# v datasete sa nachadza okrem ciselnych aj jeden stlpec s kategorialnou premennou
# get dummies prehodi kategorialne premenne na ciselne
new_covid_csv = pd.get_dummies(covid_csv, columns=["gender"], drop_first=False)

In [403]:
new_covid_csv


Unnamed: 0,age,body-temperature,dry-cough,sour-throat,breathing-problem,pain-in-chest,weakness,drowsiness,diabetes,change-in-sns,travel-history,result,gender_F,gender_M
0,20,37.0,0,0,0,0,0,0,0,0,0,0,1,0
1,19,37.2,1,0,0,0,0,0,0,0,0,0,1,0
2,55,38.9,1,1,1,1,1,1,0,1,0,1,0,1
3,40,37.8,0,0,0,1,0,1,1,1,1,2,0,1
4,33,37.3,0,1,1,0,0,0,0,1,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,37,38.9,1,1,1,0,1,0,0,1,0,0,1,0
123,35,37.8,0,0,0,1,0,1,1,1,1,1,0,1
124,22,37.3,0,0,1,0,0,0,0,1,0,1,1,0
125,29,38.9,0,1,1,1,1,1,0,1,0,0,0,1


In [404]:
# rozdelenie do trénovacej a testovacej množiny v pomere 80 : 20
X = new_covid_csv[['dry-cough', 'sour-throat', 'breathing-problem', 'weakness', 'drowsiness', 'pain-in-chest', 'diabetes','change-in-sns','travel-history', 'gender_F', 'gender_M']]
y = new_covid_csv['result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [405]:
clf = DecisionTreeClassifier()

In [406]:
# Alt+ Shift + B
clf = clf.fit(X_train,y_train)

In [407]:
y_pred = clf.predict(X_test)

In [408]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6153846153846154


In [409]:
from sklearn.metrics import confusion_matrix

#meranie výkonu klasifikácie strojového učenia
print(confusion_matrix(y_test, y_pred))

[[ 5  1  1]
 [ 4 10  3]
 [ 0  1  1]]


## Generovanie rozhodovacieho stromu



In [410]:
col_names = ['dry-cough', 'sour-throat', 'breathing-problem', 'weakness', 'drowsiness', 'pain-in-chest', 'diabetes','change-in-sns','travel-history', 'gender_F', 'gender_M']

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=False,
                special_characters=True,
                feature_names = col_names,
                class_names=['0', '1', '2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('skuska.png')
Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [None]:

# rozhodovaci strom

import numpy as np
from sklearn.tree import _tree

def tree_to_code(tree, feature_names, Y):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    pathto=dict()

    global k
    k = 0
    def recurse(node, depth, parent):
        global k
        indent = "  " * depth

        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            s= "{} <= {} ".format( name, threshold, node )
            if node == 0:
                pathto[node]=s
            else:
                pathto[node]=pathto[parent]+' & ' +s

            recurse(tree_.children_left[node], depth + 1, node)
            s="{} > {}".format( name, threshold)
            if node == 0:
                pathto[node]=s
            else:
                pathto[node]=pathto[parent]+' & ' +s
            recurse(tree_.children_right[node], depth + 1, node)
        else:
            k=k+1
            print(k,')',pathto[parent], tree_.value[node])
    recurse(0, 1, 0)


In [None]:
tree_to_code(clf, col_names, y)