In [74]:
import xml.etree.ElementTree as et
import tkinter.filedialog
import numpy as np
import pandas as pd
import matplotlib.cm as cm
from pylatex import Document, table, MultiColumn, math


from scipy import cluster
from scipy import spatial
from sklearn import metrics
from sklearn import preprocessing

In [75]:
from sklearn.cluster import AgglomerativeClustering

import matplotlib.pyplot as plt
from warnings import simplefilter

# Silence scipy's ClusterWarning for the rest of the session.
simplefilter("ignore", cluster.hierarchy.ClusterWarning)

# Shared preprocessing objects: a label encoder and a min-max scaler.
label_encoder = preprocessing.LabelEncoder()
scalar = preprocessing.MinMaxScaler()

In [76]:
def generate_string_list(data):
    """Return a string made of one ``'l'`` per column of ``data``.

    Args:
        data: object exposing a sized ``columns`` attribute (e.g. a DataFrame).

    Returns:
        ``'l'`` repeated ``len(data.columns)`` times; ``''`` when there are no columns.
    """
    return 'l' * len(data.columns)


In [77]:
def generate_string_list(data):
    """Build the column letters for a table spec: ``'l'`` once per column of ``data``.

    NOTE(review): this repeats the definition from the previous cell; one of
    the two can be removed.
    """
    column_count = len(data.columns)
    return 'l' * column_count
def genenerate_longtabu(data):
    """Render ``data`` as a LaTeX ``longtable`` and compile it to ``longtable.pdf``.

    The table spec is ``c|`` followed by one ``l`` per data column, i.e. one
    column more than ``data`` has. That leading centred column holds the row
    index, so every row written to the table (header, both footers, body)
    spans ``len(data.columns) + 1`` cells.

    Args:
        data: pandas DataFrame; its index and cell values are written to the
            table, one table row per DataFrame row.

    Returns:
        None. ``doc.generate_pdf`` produces ``longtable.pdf``;
        ``clean_tex=False`` keeps the generated ``.tex`` file.
    """
    geometry_options = {
        "margin": "2.54cm",
        "includeheadfoot": True
    }
    doc = Document(page_numbers=True, geometry_options=geometry_options)

    # Index column plus every data column: must match the table spec below.
    total_columns = len(data.columns) + 1
    index_title = '' if data.index.name is None else data.index.name
    header_cells = [index_title] + list(data.columns)

    # Generate data table. The spec has to be an f-string so the per-column
    # letters are actually interpolated.
    with doc.create(table.LongTable(f'c|{generate_string_list(data)}')) as data_table:
        data_table.add_hline()
        data_table.add_row(header_cells)
        data_table.add_hline()
        data_table.end_table_header()
        data_table.add_hline()
        data_table.add_row((MultiColumn(total_columns, align='r',
                            data='Continued on Next Page'),))
        data_table.add_hline()
        data_table.end_table_footer()
        data_table.add_hline()
        data_table.add_row((MultiColumn(total_columns, align='r',
                            data='Not Continued on Next Page'),))
        data_table.add_hline()
        data_table.end_table_last_footer()
        # One table row per DataFrame row: (index, value, value, ...).
        for record in data.itertuples(index=True, name=None):
            data_table.add_row(record)

    doc.generate_pdf("longtable", clean_tex=False)

In [7]:
# In[153]:


def get_xml_root():
    """Let the user pick a file in a Tk open-file dialog and return its XML root element.

    Returns:
        The root element of the parsed document.
    """
    chosen_path = tkinter.filedialog.askopenfilename()
    return et.parse(chosen_path).getroot()


# In[154]:


def get_rules(xml_root):
    """Collect the XML attributes of every ``rule`` element into a DataFrame.

    Args:
        xml_root: element whose subtree is searched for ``rule`` elements.

    Returns:
        DataFrame with one row per ``rule`` element (document order) and one
        column per attribute name found on those elements.
    """
    return pd.DataFrame([rule.attrib for rule in xml_root.iter('rule')])


# In[155]:


def get_attributes(xml_root):
    """Return the attribute rows declared under the ``attributes`` element.

    The XML attributes of every ``name`` element below ``attributes`` are
    gathered into a DataFrame; the ``valueID`` column is then removed and rows
    without an ``attributeID`` are discarded. The surviving rows keep their
    original positional index.

    Args:
        xml_root: element that has an ``attributes`` child.

    Returns:
        DataFrame with the remaining columns (``attributeID`` among them).

    Raises:
        KeyError: if no ``name`` element carries ``valueID`` or ``attributeID``.
    """
    name_attributes = [node.attrib for node in xml_root.find('attributes').iter('name')]
    return (
        pd.DataFrame(name_attributes)
        .drop(columns='valueID')
        .dropna(subset=['attributeID'])
    )


# In[156]:


def init_rule_matrix(rule_data_frame, attribute_data_frame):
    """Add an empty ``attribute_<id>`` column to the rule frame for every attribute.

    Every cell of ``attribute_data_frame`` is taken as an attribute ID and
    formatted into the column name, so IDs do not have to be strings.

    Args:
        rule_data_frame: DataFrame that receives the new columns. It is
            modified in place.
        attribute_data_frame: DataFrame whose cells are the attribute IDs.

    Returns:
        ``rule_data_frame`` itself, with one NaN-filled ``attribute_<id>``
        column per attribute ID.
    """
    # Flatten to scalar cells: iterating ``.values`` directly yields whole row
    # arrays, which turns the column key into an ndarray.
    for attribute_id in attribute_data_frame.to_numpy().ravel():
        rule_data_frame[f'attribute_{attribute_id}'] = np.nan
    return rule_data_frame


# In[157]:


def get_rule_by_id(xml_root, id_rule):
    """Return the first ``rule`` element whose ``ruleID`` attribute equals ``id_rule``.

    Args:
        xml_root: element whose subtree is searched.
        id_rule: value compared against each rule's ``ruleID`` attribute.

    Returns:
        The first matching element in document order, or ``None`` if no rule matches.
    """
    matching_rules = (
        rule for rule in xml_root.iter('rule') if rule.attrib['ruleID'] == id_rule
    )
    return next(matching_rules, None)


# In[158]:


def _populate_matrix_from_section(xml_root, data_frame, section_tag):
    """Copy the values found in one section of every rule into the matrix.

    For each distinct ``ruleID`` in ``data_frame`` the matching ``rule``
    element is looked up and its ``section_tag`` child is read. Every child of
    that section contributes one cell: the ``attributeID`` attribute of its
    first sub-element selects the ``attribute_<id>`` column and the text of
    its third sub-element is the value stored for the rule's row(s).

    Args:
        xml_root: element whose subtree contains the ``rule`` elements.
        data_frame: DataFrame with a ``ruleID`` column; modified in place.
        section_tag: tag of the rule child to read.

    Returns:
        ``data_frame`` itself.

    Raises:
        ValueError: if a ``ruleID`` in ``data_frame`` has no ``rule`` element.
    """
    # Index the rules once instead of rescanning the whole document for every
    # row. setdefault keeps the first element of a duplicated ruleID, which is
    # what a first-match linear search returns.
    rules_by_id = {}
    for rule in xml_root.iter('rule'):
        rule_id = rule.attrib.get('ruleID')
        if rule_id is not None:
            rules_by_id.setdefault(rule_id, rule)

    # Iterate the column's values rather than ``ruleID[i]`` so the frame does
    # not need a default 0..n-1 index.
    for rule_id in data_frame['ruleID'].unique():
        rule = rules_by_id.get(rule_id)
        if rule is None:
            raise ValueError(f'no <rule> element with ruleID={rule_id!r}')
        section = rule.find(section_tag)
        if section is None:
            # A rule without this section has nothing to copy.
            continue
        row_mask = data_frame['ruleID'] == rule_id
        for entry in section:
            parts = list(entry)
            column_name = 'attribute_' + parts[0].attrib['attributeID']
            data_frame.loc[row_mask, column_name] = parts[2].text
    return data_frame


def populate_matrix_with_conditions(xml_root, data_frame):
    """Fill the ``attribute_<id>`` cells of each rule from its ``conditions`` element.

    Args:
        xml_root: element whose subtree contains the ``rule`` elements.
        data_frame: DataFrame with a ``ruleID`` column; modified in place.

    Returns:
        ``data_frame`` itself.
    """
    return _populate_matrix_from_section(xml_root, data_frame, 'conditions')


# In[159]:


def populate_matrix_with_conclusions(xml_root, data_frame):
    """Fill the ``attribute_<id>`` cells of each rule from its ``conclusion`` element.

    Args:
        xml_root: element whose subtree contains the ``rule`` elements.
        data_frame: DataFrame with a ``ruleID`` column; modified in place.

    Returns:
        ``data_frame`` itself.
    """
    return _populate_matrix_from_section(xml_root, data_frame, 'conclusion')


# In[160]:


def normilize_types_of_column_values(data_frame, column):
    """Give one column a single dtype: float if it parses as numbers, else ``str``.

    Args:
        data_frame: DataFrame to modify in place.
        column: label of the column to convert.

    Returns:
        None; ``data_frame[column]`` is replaced.
    """
    try:
        data_frame[column] = pd.to_numeric(data_frame[column], downcast="float")
    except (ValueError, TypeError):
        # Only conversion failures fall back to text. A bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated errors.
        data_frame[column] = data_frame[column].astype(str)


# In[161]:


def get_simbolic_values(xml_root):
    """Return the distinct names of all symbolic values declared under ``attributes``.

    Args:
        xml_root: element that has an ``attributes`` child.

    Returns:
        Set with the text of the ``name`` child of every ``symbolic_value``
        element found below ``attributes``.
    """
    attributes_node = xml_root.find('attributes')
    return {
        value_node.find('name').text
        for value_node in attributes_node.iter('symbolic_value')
    }


# In[162]:


def plot_dendrogram(model, **kwargs):
    """Draw a scipy dendrogram for a fitted hierarchical clustering model.

    A linkage matrix is assembled from ``model.children_``,
    ``model.distances_`` and the number of original samples under each merge,
    then handed to ``cluster.hierarchy.dendrogram``.

    Args:
        model: fitted estimator exposing ``children_``, ``distances_`` and ``labels_``.
        **kwargs: forwarded unchanged to ``cluster.hierarchy.dendrogram``.

    Returns:
        Whatever ``cluster.hierarchy.dendrogram`` returns.
    """
    sample_total = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for merge_index, children in enumerate(model.children_):
        # A child below sample_total is an original sample (counts as one);
        # otherwise it refers to an earlier merge whose size is already known.
        counts[merge_index] = sum(
            1 if child < sample_total else counts[child - sample_total]
            for child in children
        )

    columns = [model.children_, model.distances_, counts]
    linkage_matrix = np.column_stack(columns).astype(float)
    return cluster.hierarchy.dendrogram(linkage_matrix, **kwargs)


# In[163]:


def _make_agglomerative(n_clusters, linkage, precomputed):
    """Build an ``AgglomerativeClustering`` that works across scikit-learn releases."""
    if not precomputed:
        # Euclidean is the estimator's default distance, so the keyword that
        # names it (``affinity`` or ``metric``, depending on the release) can
        # simply be omitted.
        return AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    try:
        return AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage=linkage)
    except TypeError:
        # Releases that predate the ``metric`` keyword only accept ``affinity``.
        return AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage=linkage)


def _plot_linkage_dendrograms(data_for_linkage, metric, linkages, suptitle, precomputed):
    """Plot one dendrogram per linkage method, titled with its silhouette score.

    Args:
        data_for_linkage: DataFrame of observations; its index labels the leaves.
        metric: ``pdist`` metric name used for the pairwise distances.
        linkages: iterable of linkage method names.
        suptitle: figure super-title.
        precomputed: when true the clusterer is fitted on the square distance
            matrix, otherwise on the raw observations.
    """
    # The distances do not depend on the linkage method: compute them once.
    condensed_distance = spatial.distance.pdist(data_for_linkage, metric=metric)
    distance_matrix = spatial.distance.squareform(condensed_distance)

    for linkage in linkages:
        # linkage() expects the condensed vector. A square matrix is taken as
        # a set of observation vectors and clustered on the wrong distances.
        linkage_distance = cluster.hierarchy.linkage(condensed_distance, method=linkage)
        dendrogram = cluster.hierarchy.dendrogram(linkage_distance, labels=data_for_linkage.index,
                                                  leaf_font_size=12, leaf_rotation=45)
        # Cluster count taken from the dendrogram colouring: distinct link
        # colours minus one.
        # NOTE(review): presumably the subtracted one is the above-threshold colour; confirm.
        n_clusters = len(set(dendrogram['color_list'])) - 1
        clusterer = _make_agglomerative(n_clusters, linkage, precomputed)
        clusterer.fit(distance_matrix if precomputed else data_for_linkage)
        labels = clusterer.labels_
        silhouette_avg = metrics.silhouette_score(distance_matrix, labels, metric='precomputed')
        plt.title(f'Hierarchical Clustering Dendrogram - {linkage} linkage , score: {silhouette_avg} ')
        plt.suptitle(suptitle)
        plt.show()


def get_euclidean(data_for_linkage):
    """Show Euclidean dendrograms for ward, complete, average and single linkage.

    Args:
        data_for_linkage: DataFrame of observations (one row per item).

    Returns:
        None; one figure per linkage method is shown.
    """
    _plot_linkage_dendrograms(data_for_linkage, 'euclidean',
                              ('ward', 'complete', 'average', 'single'),
                              'Euclidean', precomputed=False)


def get_chebyshev(data_for_linkage):
    """Show Chebyshev dendrograms for complete, average and single linkage.

    Args:
        data_for_linkage: DataFrame of observations (one row per item).

    Returns:
        None; one figure per linkage method is shown.
    """
    _plot_linkage_dendrograms(data_for_linkage, 'chebyshev',
                              ('complete', 'average', 'single'),
                              'Chebyshev', precomputed=True)


if __name__ == '__main__':
    # Load the XML chosen by the user and build the rule matrix from it. The
    # names bound here (root, data, ...) are module-level and are read again
    # by the cells below.
    root = get_xml_root()
    rule_attrs = get_rules(root)
    attribute_attrs = get_attributes(root)
    # One empty attribute_<id> column per attribute ...
    init_data = init_rule_matrix(rule_attrs, attribute_attrs)
    # ... filled in place from each rule's conditions, then its conclusion.
    populate_matrix_with_conditions(root, init_data)
    data = populate_matrix_with_conclusions(root, init_data)

# In[164]:


# Remove the first column of the matrix in place. Re-running this cell drops
# another column each time.
data.drop(data.columns[[0]], axis=1, inplace=True)

# Alias, not a copy: both names refer to the same DataFrame at this point.
indexed_data = data

# In[169]:


# Label the rows by rule ID from here on.
indexed_data = indexed_data.set_index('ruleID')

# Give every column a single dtype (see normilize_types_of_column_values).
for column in indexed_data.columns:
    normilize_types_of_column_values(indexed_data, column)

columns = list(indexed_data.columns)

# Split the column labels into numeric and non-numeric, keeping their order.
numerical_columns = []
non_numerocal_columns = []
for col in columns:
    if col != 'ruleID':
        (
            numerical_columns
            if np.issubdtype(indexed_data[col].dtype, np.number)
            else non_numerocal_columns
        ).append(col)

# In[178]:


# Numeric part of the matrix, with missing cells replaced by 0.0.
numerical_data = indexed_data[numerical_columns].fillna(float(0))

# Rescale the numeric columns with the shared scaler and write them back.
if numerical_columns:
    scaled_data = scalar.fit_transform(numerical_data)
    indexed_data[numerical_columns] = scaled_data

# Symbolic value names in sorted order; a name's position is its numeric code.
symbolic_value = sorted(get_simbolic_values(root))

# Replace every symbolic name in the non-numeric columns by its code.
for symbol_code, s in enumerate(symbolic_value):
    for categorical in non_numerocal_columns:
        indexed_data.loc[indexed_data[categorical] == s, categorical] = symbol_code

# Cells holding the text 'nan' get the fixed code 8.
# NOTE(review): 8 is also the code of the ninth symbolic value whenever more
# than eight exist; confirm this cannot collide for the data in use.
for categorical in non_numerocal_columns:
    indexed_data.loc[indexed_data[categorical] == 'nan', categorical] = 8

# print(indexed_data)

# """Euclidean distance with ward linkage"""
# get_euclidean(indexed_data)
# get_chebyshev(indexed_data)

# condensed_chebyshev_distance = spatial.distance.pdist(indexed_data, metric='chebyshev')
# distance_chebyshev_matrix = spatial.distance.squareform(condensed_chebyshev_distance)
# linkage_1 = cluster.hierarchy.linkage(distance_chebyshev_matrix, method='single')
# dendrogram_1 = cluster.hierarchy.dendrogram(linkage_1, labels=indexed_data.index, leaf_font_size=12, leaf_rotation=45)
# n_clusters_1 = len(set(dendrogram['color_list'])) - 1
# cluster_1 = AgglomerativeClustering(n_clusters=n_clusters_1, affinity='precomputed', linkage='single')
# cluster_1.fit(indexed_data)
# labels_1 = cluster_1.labels_
# score_1 = metrics.silhouette_score(distance_chebyshev_matrix, labels_1, metric='precomputed')
# print(score_1)
# plt.figure(figsize=(20, 10))
# plt.show()

In [8]:
# Pairwise Euclidean distances between the rows of indexed_data: first as a
# condensed vector, then expanded to a square matrix.
condensed_euclidean_distance = spatial.distance.pdist(indexed_data, metric='euclidean')
distance_euclidean_matrix = spatial.distance.squareform(condensed_euclidean_distance)

In [9]:
# Same distance matrix as a DataFrame, labelled on both axes by indexed_data's index.
euclidean_matrix_frame = pd.DataFrame(distance_euclidean_matrix, index=indexed_data.index, columns=indexed_data.index)

In [10]:
# Display setting: cap the rendered width of a column at 50 characters.
pd.set_option("max_colwidth", 50)

In [69]:
# Inspect the column labels of the distance frame.
euclidean_matrix_frame.columns

Index(['1878', '1879', '1880', '1881', '1882', '1883', '1884', '1885', '1886',
       '1887',
       ...
       '2284', '2285', '2286', '2287', '2288', '2289', '2290', '2291', '2292',
       '2293'],
      dtype='object', name='ruleID', length=416)

In [90]:
# Wrap the distance frame in a pylatex Matrix object (mtype 'b').
# NOTE(review): alignment='cr' is passed as the environment argument; confirm
# that a two-letter value is what the target LaTeX environment accepts.
a = math.Matrix(euclidean_matrix_frame, mtype='b', alignment='cr')

In [91]:
# Write the matrix's LaTeX source to a file, using "B" as the output path.
a.generate_tex("B")