In [79]:
import xml.etree.ElementTree as et
import tkinter.filedialog
import numpy as np
import pandas as pd
import matplotlib.cm as cm
from IPython.display import display
import matplotlib.pyplot as plt
from scipy import cluster
from scipy import spatial
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

import matplotlib.pyplot as plt
from warnings import simplefilter

# Silence SciPy's ClusterWarning for the whole session.
simplefilter("ignore", cluster.hierarchy.ClusterWarning)

# Shared preprocessing objects; neither is used in the cells visible here.
label_encoder = preprocessing.LabelEncoder()
# NOTE(review): "scalar" is presumably a misspelling of "scaler" (it holds a MinMaxScaler);
# the name is kept because other cells may reference it.
scalar = preprocessing.MinMaxScaler()



In [80]:
# In[153]:


def get_xml_root(path):
    """Parse the XML document at ``path`` and return its root element.

    Args:
        path: Location of the XML file, in any form accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        The root ``Element`` of the parsed tree.
    """
    return et.parse(path).getroot()


# In[154]:


def get_rules(xml_root):
    """Build a DataFrame with one row per ``<rule>`` element.

    Every ``<rule>`` found anywhere below ``xml_root`` contributes a row whose
    columns are that element's XML attributes; attributes missing on a given
    rule show up as NaN.

    Args:
        xml_root: Root element of the parsed document.

    Returns:
        A DataFrame of rule attributes, in document order.
    """
    rows = []
    for rule_node in xml_root.iter('rule'):
        rows.append(rule_node.attrib)
    return pd.DataFrame(rows)


# In[155]:


def get_attributes(xml_root):
    """Return the attribute IDs declared under the ``<attributes>`` element.

    Every ``<name>`` element below ``<attributes>`` contributes one row built
    from its XML attributes. The ``valueID`` column (if any) is discarded and
    rows without an ``attributeID`` are removed, so only ``<name>`` elements
    that carry an ``attributeID`` remain. The surviving rows keep their
    original positional index (it is not reset).

    Args:
        xml_root: Root element of the parsed document.

    Returns:
        A DataFrame with an ``attributeID`` column (plus any other XML
        attributes found on the ``<name>`` elements, except ``valueID``).
        The frame is empty when the document has no ``<attributes>`` element
        or no ``<name>`` with an ``attributeID``.
    """
    attributes_node = xml_root.find('attributes')
    if attributes_node is None:
        return pd.DataFrame(columns=['attributeID'])

    name_attrs = [name.attrib for name in attributes_node.iter('name')]
    # errors='ignore': documents without any valueID have no such column to drop.
    frame = pd.DataFrame(name_attrs).drop(columns='valueID', errors='ignore')
    if 'attributeID' not in frame.columns:
        # No <name> carried an attributeID, so there is nothing to report.
        return pd.DataFrame(columns=['attributeID'])
    return frame.dropna(subset=['attributeID'])


# In[156]:


def init_rule_matrix(rule_data_frame, attribute_data_frame):
    """Add one NaN-filled ``attribute_<id>`` column per attribute ID.

    ``rule_data_frame`` is modified in place and also returned. Column names
    are plain strings built from each cell of ``attribute_data_frame``
    (row-major order), so the IDs may be strings or numbers.

    Args:
        rule_data_frame: Frame of rules that receives the new columns.
        attribute_data_frame: Frame whose cells are the attribute IDs.

    Returns:
        The same ``rule_data_frame`` object, now with the extra columns.
    """
    # Flatten to scalars so each key is an ordinary string rather than a
    # NumPy array produced by broadcasting 'attribute_' over a whole row.
    for attribute_id in attribute_data_frame.to_numpy().ravel():
        rule_data_frame['attribute_' + str(attribute_id)] = np.nan
    return rule_data_frame


# In[157]:


def get_rule_by_id(xml_root, id_rule):
    """Find the first ``<rule>`` element whose ``ruleID`` equals ``id_rule``.

    Args:
        xml_root: Root element of the parsed document.
        id_rule: Value to compare against each rule's ``ruleID`` attribute.

    Returns:
        The first matching element in document order, or ``None`` when no
        rule matches.

    Raises:
        KeyError: If a ``<rule>`` without a ``ruleID`` attribute is reached
            before a match is found.
    """
    # Lazy generator: the scan stops at the first hit.
    matching = (
        node for node in xml_root.iter('rule')
        if node.attrib['ruleID'] == id_rule
    )
    return next(matching, None)


# In[158]:


def _populate_matrix_from_section(xml_root, data_frame, section_tag):
    """Copy the values from one section of every rule into ``data_frame``.

    For each distinct ``ruleID`` in ``data_frame`` the matching ``<rule>`` is
    looked up, and for every child of its ``<section_tag>`` element the text
    of the child's third sub-element is written into the column
    ``attribute_<attributeID>``, where ``attributeID`` comes from the child's
    first sub-element. All rows with that ``ruleID`` are updated in place.
    """
    # Index the rules once instead of rescanning the document for every row.
    # setdefault keeps the first rule for a duplicated ID, i.e. the same rule
    # a first-match linear scan would return.
    rules_by_id = {}
    for rule in xml_root.iter('rule'):
        rule_key = rule.get('ruleID')
        if rule_key is not None:
            rules_by_id.setdefault(rule_key, rule)

    # Iterate over the ID values themselves (not index labels), so the frame
    # does not need a default 0..n-1 index.
    for rule_id in dict.fromkeys(data_frame['ruleID'].tolist()):
        rule = rules_by_id.get(rule_id)
        if rule is None:
            raise KeyError('rule with ruleID {!r} not found in XML'.format(rule_id))

        section = rule.find(section_tag)
        if section is None:
            # Rule has no such section, so there is nothing to record for it.
            continue

        row_mask = data_frame['ruleID'] == rule_id
        for entry in section:
            parts = list(entry)
            column = 'attribute_' + parts[0].attrib['attributeID']
            if column in data_frame.columns and pd.api.types.is_numeric_dtype(data_frame[column]):
                # The column was initialised with float NaN; make it object
                # first so writing text does not rely on implicit upcasting.
                data_frame[column] = data_frame[column].astype(object)
            data_frame.loc[row_mask, column] = parts[2].text
    return data_frame


def populate_matrix_with_conditions(xml_root, data_frame):
    """Fill the ``attribute_*`` columns from each rule's ``<conditions>``.

    Args:
        xml_root: Root element of the parsed document.
        data_frame: Rule matrix with a ``ruleID`` column; modified in place.

    Returns:
        The same ``data_frame`` object.

    Raises:
        KeyError: If a ``ruleID`` in ``data_frame`` has no ``<rule>`` in the XML.
    """
    return _populate_matrix_from_section(xml_root, data_frame, 'conditions')


# In[159]:


def populate_matrix_with_conclusions(xml_root, data_frame):
    """Fill the ``attribute_*`` columns from each rule's ``<conclusion>``.

    Args:
        xml_root: Root element of the parsed document.
        data_frame: Rule matrix with a ``ruleID`` column; modified in place.

    Returns:
        The same ``data_frame`` object.

    Raises:
        KeyError: If a ``ruleID`` in ``data_frame`` has no ``<rule>`` in the XML.
    """
    return _populate_matrix_from_section(xml_root, data_frame, 'conclusion')


# In[160]:


def normilize_types_of_column_values(data_frame, column):
    """Convert one column to floats if possible, otherwise to strings.

    The column is replaced in place: first a numeric conversion (downcast to
    the smallest float type) is attempted; if the values cannot be parsed as
    numbers the column is converted to ``str`` instead.

    Args:
        data_frame: Frame holding the column; modified in place.
        column: Label of the column to normalise.

    Returns:
        None.

    Raises:
        KeyError: If ``column`` is not present in ``data_frame``.
    """
    try:
        data_frame[column] = pd.to_numeric(data_frame[column], downcast="float")
    except (ValueError, TypeError):
        # Only conversion failures trigger the string fallback; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        data_frame[column] = data_frame[column].astype(str)


# In[161]:


def get_simbolic_values(xml_root):
    """Collect the distinct symbolic value names declared under ``<attributes>``.

    For every ``<symbolic_value>`` element below ``<attributes>``, the text of
    its ``<name>`` child is gathered.

    Args:
        xml_root: Root element of the parsed document.

    Returns:
        A set with the text of each ``<name>`` child (duplicates collapsed).
    """
    attributes_node = xml_root.find('attributes')
    return {
        value_node.find('name').text
        for value_node in attributes_node.iter('symbolic_value')
    }

In [81]:
# Load every rule base listed below into its own rule matrix and collect the
# matrices in `result` (one DataFrame per XML file, in list order).
if __name__ == '__main__':
    # XML rule bases to load, relative to the notebook's working directory.
    data_list = ['./data/test_1B.xml', './data/test_2B.xml', './data/izaK416_16_11_2016_10_51_54.xml', './data/test_3B_26_08_2015_19_57_12.xml',  './data/test_5C_07_10_2015_19_46_50.xml' ]
    result = []
    for path in data_list:
        root = get_xml_root(path)
        # One row per <rule>, columns taken from the rule's XML attributes.
        rule_attrs = get_rules(root)
        attribute_attrs = get_attributes(root)
        # init_rule_matrix adds the attribute_* columns to rule_attrs in place and
        # returns that same frame, so init_data and rule_attrs are one object.
        init_data = init_rule_matrix(rule_attrs, attribute_attrs)
        # Return value ignored: the frame is filled in place.
        populate_matrix_with_conditions(root, init_data)
        # `data` is the same frame again, now holding condition and conclusion values.
        data = populate_matrix_with_conclusions(root, init_data)
        result.append(data)

In [69]:
# Pairwise Chebyshev distances between the rows of indexed_data, in SciPy's
# condensed (1-D) form.
# NOTE(review): indexed_data is not defined in any visible cell — confirm it is
# created before this cell when the notebook is run top to bottom.
condensed_chebyshev_distance = spatial.distance.pdist(indexed_data, metric='chebyshev')
# Expand the condensed vector into a full square distance matrix.
distance_chebyshev_matrix = spatial.distance.squareform(condensed_chebyshev_distance)

In [70]:
# Label both axes of the square Chebyshev matrix with indexed_data's index.
chebyshev_matrix_frame = pd.DataFrame(distance_chebyshev_matrix, index=indexed_data.index, columns=indexed_data.index)

In [71]:
# Preview the first rows of the Chebyshev distance matrix.
chebyshev_matrix_frame.head()

ruleID,10419,10420,10421,10422,10423,10424,10425,10426,10427,10428,...,11608,11609,11610,11611,11612,11613,11614,11615,11616,11617
ruleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10419,0.0,3.0,7.0,2.0,4.0,6.0,7.0,7.0,7.0,7.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
10420,3.0,0.0,7.0,3.0,4.0,6.0,7.0,7.0,7.0,4.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
10421,7.0,7.0,0.0,5.0,3.0,1.0,4.0,7.0,7.0,5.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
10422,2.0,3.0,5.0,0.0,2.0,4.0,7.0,7.0,7.0,7.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
10423,4.0,4.0,3.0,2.0,0.0,3.0,7.0,7.0,7.0,7.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0


In [72]:
# Pairwise Canberra distances between the rows of indexed_data, in SciPy's
# condensed (1-D) form.
# NOTE(review): indexed_data is not defined in any visible cell — confirm it is
# created before this cell when the notebook is run top to bottom.
condensed_canberra_distance = spatial.distance.pdist(indexed_data, metric='canberra')
# Expand the condensed vector into a full square distance matrix.
distance_canberra_matrix = spatial.distance.squareform(condensed_canberra_distance)

In [73]:
# Label both axes of the square Canberra matrix with indexed_data's index.
canberra_matrix_frame = pd.DataFrame(distance_canberra_matrix, index=indexed_data.index, columns=indexed_data.index)

In [74]:
# Preview the first rows of the Canberra distance matrix.
canberra_matrix_frame.head()

ruleID,10419,10420,10421,10422,10423,10424,10425,10426,10427,10428,...,11608,11609,11610,11611,11612,11613,11614,11615,11616,11617
ruleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10419,0.0,0.6,3.377778,2.142857,2.333333,3.2,3.698413,4.155556,3.888889,3.151404,...,5.608547,5.888889,6.155556,6.555556,5.253968,5.520635,5.920635,5.608547,5.34188,6.008547
10420,0.6,0.0,2.777778,2.742857,2.933333,2.6,3.253968,3.711111,3.444444,2.70696,...,5.164103,5.444444,5.711111,6.111111,4.809524,5.07619,5.47619,5.164103,4.897436,5.564103
10421,3.377778,2.777778,0.0,1.314286,1.2,0.333333,2.47619,3.444444,3.711111,3.278388,...,5.94188,6.222222,6.488889,6.888889,5.587302,5.853968,6.253968,5.94188,5.675214,6.34188
10422,2.142857,2.742857,1.314286,0.0,0.2,1.1,3.634921,4.055556,3.755556,3.008547,...,5.751404,6.031746,6.298413,6.698413,5.396825,5.663492,6.063492,5.751404,5.484737,6.151404
10423,2.333333,2.933333,1.2,0.2,0.0,0.933333,3.520635,3.888889,3.555556,3.208547,...,5.94188,6.222222,6.488889,6.888889,5.587302,5.853968,6.253968,5.94188,5.675214,6.34188


In [75]:
# Pairwise city-block (Manhattan) distances between the rows of indexed_data,
# in SciPy's condensed (1-D) form.
# NOTE(review): indexed_data is not defined in any visible cell — confirm it is
# created before this cell when the notebook is run top to bottom.
condensed_cityblock_distance = spatial.distance.pdist(indexed_data, metric='cityblock')
# Expand the condensed vector into a full square distance matrix.
distance_cityblock_matrix = spatial.distance.squareform(condensed_cityblock_distance)

In [76]:
# Label both axes of the square city-block matrix with indexed_data's index.
cityblock_matrix_frame = pd.DataFrame(distance_cityblock_matrix, index=indexed_data.index, columns=indexed_data.index)

In [77]:
# Preview the first rows of the city-block distance matrix.
cityblock_matrix_frame.head()

ruleID,10419,10420,10421,10422,10423,10424,10425,10426,10427,10428,...,11608,11609,11610,11611,11612,11613,11614,11615,11616,11617
ruleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10419,0.0,3.0,12.0,4.0,6.0,11.0,18.0,22.0,20.0,14.0,...,34.0,36.0,38.0,40.0,31.0,33.0,35.0,34.0,32.0,36.0
10420,3.0,0.0,9.0,7.0,9.0,8.0,15.0,19.0,17.0,11.0,...,31.0,33.0,35.0,37.0,28.0,30.0,32.0,31.0,29.0,33.0
10421,12.0,9.0,0.0,8.0,6.0,1.0,8.0,14.0,16.0,14.0,...,38.0,40.0,42.0,44.0,35.0,37.0,39.0,38.0,36.0,40.0
10422,4.0,7.0,8.0,0.0,2.0,7.0,16.0,20.0,18.0,12.0,...,36.0,38.0,40.0,42.0,33.0,35.0,37.0,36.0,34.0,38.0
10423,6.0,9.0,6.0,2.0,0.0,5.0,14.0,18.0,16.0,14.0,...,38.0,40.0,42.0,44.0,35.0,37.0,39.0,38.0,36.0,40.0
