In [None]:
import subprocess, os, sys
import time
import csv
import pandas as pd
import numpy as np
import pyamg
import random as rnd
import scipy as scp
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab as py
import pingouin as pg
import warnings
import gc

from sklearn.cluster import KMeans, DBSCAN, MeanShift, SpectralClustering

warnings.filterwarnings("ignore")

In [None]:
# Do not re-run this cell without a good reason: loading the vectors is slow!
#
# Builds the module-level `wordVectors` dictionary that maps every word of the
# fastText `.vec` text file to the list of its vector components (kept as
# strings; they are converted to float32 where they are used).
wordVectors = {}
print("Loading word vectors...this will take a while")

start = time.time()
print('Początek obliczeń o godzinie', time.strftime("%H:%M:%S", time.localtime()))

checkpoint = time.time()

filename = 'C:/Users/priva/OneDrive/Desktop/STUDIA/Proseminarium MIM/cc.pl.300.vec/cc.pl.300.vec'

# Stream the file line by line instead of materialising it with readlines():
# the file is several GB, and `with` guarantees the handle is closed even if
# parsing fails half-way through.
with open(filename, "r", encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 100000 == 0:
            print(i,"/ 2000000. Od poprzedniego checkpointu minęło", round((time.time() - checkpoint)/60, 2), "minuty.")
            checkpoint = time.time()
        # rstrip() drops the trailing newline (and a trailing space, if any) so
        # that the last component is a clean numeric string.
        token = line.rstrip().split(' ')
        # The first line of a .vec file is a "<word count> <dimension>" header,
        # not a word entry -- skip it instead of storing it as a word.
        if i == 0 and len(token) == 2 and all(t.isdigit() for t in token):
            continue
        wordVectors[token[0]] = token[1:]

print('Słowa wgrywały się', round((time.time() - start)/60, 2), 'minut.')

In [None]:
# `sentences` must be a list whose elements are the sentences to be compared.
def sentences_similarity(sentences: list, print_warning = False, print_description = False, path = ''):
    """Summarise the pairwise similarity inside a group of sentences.

    Two modes are supported:

    * ``path == ''`` -- every sentence is embedded as the normalised sum of the
      unit-length vectors of its words (looked up in the module-level
      ``wordVectors`` dictionary; words of length <= 2 are ignored) and the
      similarity of a pair is the dot product of the two sentence vectors.
    * ``path != ''`` -- ``sentences`` are project names from the Pabulib file
      at ``path``; the similarity of a pair is ``1 - distance`` where the
      distance is read from the CSV file whose name is ``path`` with ``.pb``
      replaced by ``.csv``.

    Pairs that cannot be assessed are skipped: a NaN distance or an unknown
    project name in the file mode, a sentence without any known word in the
    embedding mode.

    Parameters
    ----------
    sentences : list
        Sentences (or project names) to compare.
    print_warning : bool
        Embedding mode only: print how many distinct words were not found.
    print_description : bool
        Embedding mode only: print every compared pair with its similarity.
    path : str
        Path of the ``.pb`` file, or ``''`` to use word embeddings.

    Returns
    -------
    dict
        Keys ``'nb of sentences'``, ``'mean'``, ``'std dev'``, ``'min'`` and
        ``'max'``.  When there is no pair to evaluate (fewer than two
        sentences, or every pair was skipped) the neutral values
        mean = min = max = 0.5 and std dev = 0 are returned.
    """
    # number of sentences received
    N = len(sentences)

    # neutral answer used whenever no pair of sentences can be compared
    neutral = {'nb of sentences': N, 'mean': 0.5, 'std dev': 0, 'min': 0.5, 'max': 0.5}

    if N < 2:
        return neutral

    # similarities of all assessable pairs
    stat_values = []

    if path != '':
        # Read only the PROJECTS section of the Pabulib file.
        projects = {}
        section = ""
        header = []
        with open(path, 'r', newline='', encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            for row in reader:
                if not row:
                    # blank line -- nothing to parse
                    continue
                tag = str(row[0]).strip().lower()
                if tag in ["meta", "projects", "votes"]:
                    section = tag
                    header = next(reader)
                elif section == "projects":
                    projects[row[0]] = {}
                    for it, key in enumerate(header[1:]):
                        projects[row[0]][key.strip()] = row[it+1].strip()

        # name -> project id; setdefault keeps the first id when a name repeats
        key_by_name = {}
        for key, fields in projects.items():
            key_by_name.setdefault(fields['name'], key)

        Dist_Array_ling = pd.read_csv(path.replace('.pb', '.csv'))
        Dist_Array_ling.index = Dist_Array_ling.columns

        # A name missing from the file maps to None and its pairs are skipped
        # (previously such a name was silently treated as the last project).
        keys = [key_by_name.get(s) for s in sentences]

        for s1, key1 in zip(sentences, keys):
            for s2, key2 in zip(sentences, keys):
                if s1 == s2 or key1 is None or key2 is None:
                    continue
                # every unordered pair is visited twice; keep one orientation
                if not key1 < key2:
                    continue
                distance = Dist_Array_ling.loc[[key1], [key2]].iloc[0, 0]
                if pd.isna(distance):
                    continue
                stat_values.append(1 - distance)

    else:
        # Step 1 -- split sentences into words
        words_by_sentence, words = {}, set()
        for i, sentence in enumerate(sentences):
            stripped = [w.strip('.,!?"') for w in sentence.rstrip().split(' ')]
            words_by_sentence[i] = [w for w in stripped if len(w) > 2]
            words.update(words_by_sentence[i])

        # Step 2 -- compute sentence vectors
        wordVectorLength, zeroVectorCount = 300, 0
        docVectors = np.zeros( (N, wordVectorLength), dtype='float32')

        for word in words:
            if word not in wordVectors:
                zeroVectorCount = zeroVectorCount + 1
                continue
            wv = np.asarray(wordVectors[word], dtype='float32')
            unit = wv/np.linalg.norm(wv)
            # a word contributes once per sentence, however often it occurs
            for i in range(N):
                if word in words_by_sentence[i]:
                    docVectors[i] = docVectors[i] + unit

        if print_warning == True:
            print ('# words not found in fasttext..', zeroVectorCount)

        # Normalise; a sentence with no known word keeps a zero vector and is
        # excluded below, because 0/0 would put NaN into every statistic.
        has_vector = []
        for i in range(N):
            norm = np.linalg.norm(docVectors[i])
            usable = bool(norm > 0)
            has_vector.append(usable)
            if usable:
                docVectors[i] = docVectors[i]/norm

        # Step 3 -- collect the pairwise similarities
        for i in range(N):
            for j in range(i):
                if not (has_vector[i] and has_vector[j]):
                    continue

                similarity = np.dot(docVectors[i], docVectors[j])

                if print_description == True:
                    print('Cosine Similarity:\n',
                          sentences[i], '\n&\n', sentences[j], ':',
                          similarity, '\n')

                stat_values.append(similarity)

    # Step 4 -- summary statistics:
    # 1. number of sentences
    # 2. mean similarity over sentence pairs
    # 3. standard deviation of that similarity
    # 4. smallest and largest similarity
    if not stat_values:
        # no pair could be assessed -- min()/max() of an empty list would raise
        return neutral

    stats = {}
    stats['nb of sentences'] = N
    stats['mean'] = np.mean(stat_values)
    stats['std dev'] = np.std(stat_values)
    stats['min'] = min(stat_values)
    stats['max'] = max(stat_values)

    return stats

In [None]:
# `clusters` must be a dictionary whose values are the lists of projects in each cluster.
def assess_clustering(clusters: dict, stat = 'mean', path = ''):
    """Score a clustering by the similarity of the items inside each cluster.

    Every cluster is evaluated once with ``sentences_similarity`` and the
    requested statistic is averaged over clusters, weighted by cluster size.

    Parameters
    ----------
    clusters : dict
        Maps a cluster id to the list of project names (sentences) in it.
    stat : str
        ``'mean'``, ``'std dev'``, ``'min'`` or ``'max'`` -- which per-cluster
        statistic to average; ``'numbers'`` returns the cluster sizes instead.
    path : str
        Forwarded to ``sentences_similarity``.

    Returns
    -------
    float or list
        The size-weighted average of the statistic, or the list of cluster
        sizes when ``stat == 'numbers'``.

    Raises
    ------
    ValueError
        If ``stat`` is not one of the supported values.
    """
    if stat not in ['mean', 'std dev', 'min', 'max', 'numbers']:
        raise ValueError("Parameter 'stat' must be one of: 'mean', 'std dev', 'min', 'max', 'numbers'")

    groups = list(clusters.values())

    # weights of the weighted mean: the number of items in every cluster
    weights = [len(c) for c in groups]

    if stat == 'numbers':
        return weights

    # one similarity evaluation per cluster (each one re-reads the data files
    # when `path` is given, so it must not be repeated needlessly)
    score = [sentences_similarity(c, path = path)[stat] for c in groups]

    return np.average(score, weights = weights)

In [None]:
def projects_similarity(clusters: dict, path):
    """Return the mean vote-based similarity of projects sharing a cluster.

    For every ordered pair of different project names inside the same cluster
    the similarity is ``1 - distance``, where the distance comes from the first
    data frame returned by ``Clustering_from_file(path, variant='dataframe')``.

    Parameters
    ----------
    clusters : dict
        Maps a cluster id to the list of project names in it.
    path : str
        Path of the Pabulib ``.pb`` file describing the projects.

    Returns
    -------
    float
        Mean similarity over all evaluated pairs (``nan`` when there is none).
        Names that do not occur in the file are skipped.
    """
    # read the PROJECTS section of the file (the only one needed here)
    projects = {}
    section = ""
    header = []
    with open(path, 'r', newline='', encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if not row:
                # blank line -- nothing to parse
                continue
            tag = str(row[0]).strip().lower()
            if tag in ["meta", "projects", "votes"]:
                section = tag
                header = next(reader)
            elif section == "projects":
                projects[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    projects[row[0]][key.strip()] = row[it+1].strip()

    # name -> project id; setdefault keeps the first id when a name repeats
    key_by_name = {}
    for key, fields in projects.items():
        key_by_name.setdefault(fields['name'], key)

    df = Clustering_from_file(path, variant = 'dataframe')[0]
    ans = []

    for c in clusters.values():
        for proj1 in c:
            for proj2 in c:

                if proj1 == proj2:
                    continue

                key1 = key_by_name.get(proj1)
                key2 = key_by_name.get(proj2)
                if key1 is None or key2 is None:
                    # unknown name: skip instead of silently using another project
                    continue

                # 1 - distance, because similarity is assessed, not distance
                ans.append(1 - df.loc[key1, key2])

    return np.mean(ans)

In [None]:
def Correlation(path, method: list = ['Pearson', 'Spearman', 'Kendall'], significance: bool = True, prob: float = 0.01):
    """Correlate, project by project, the two distance matrices of an election.

    The two frames returned by ``Clustering_from_file(path, variant='dataframe')``
    are compared column by column: for every project the column of the first
    frame is correlated with the same column of the second frame, after
    dropping the positions where either value is NaN.

    Parameters
    ----------
    path : str
        Path of the Pabulib ``.pb`` file.
    method : list
        Any subset of ``'Pearson'``, ``'Spearman'``, ``'Kendall'``.
    significance : bool
        When true, a marker is appended after the coefficient:
        ``(***)`` for p < 0.001, ``(**)`` for p < 0.01, ``(*)`` for p < 0.05
        and an empty marker otherwise.
    prob : float
        Threshold used to count significant coefficients.

    Returns
    -------
    tuple
        ``(df, counts)`` where ``df`` has one row per project and one column
        per requested method (coefficients rounded to 4 digits, stored as
        strings) and ``counts`` is ``[n_pearson, n_spearman, n_kendall]``, the
        number of coefficients with p < ``prob``.  Projects with fewer than two
        usable observations get no coefficients.
    """
    def _marker(p):
        # cumulative ladder: no gap at exactly 0.05 / 0.01 / 0.001
        if p < 0.001:
            return '(***)'
        if p < 0.01:
            return '(**)'
        if p < 0.05:
            return '(*)'
        return ''

    tests = [('Pearson', scp.stats.pearsonr),
             ('Spearman', scp.stats.spearmanr),
             ('Kendall', scp.stats.kendalltau)]

    # the frames are expensive to build (file parsing + full distance matrix),
    # so request them only once
    d, l = Clustering_from_file(path, variant = 'dataframe')

    dict_df = {}
    counts = {'Pearson': 0, 'Spearman': 0, 'Kendall': 0}

    for column in list(d.columns):

        dict_df[column] = {}

        # drop the positions where either of the two columns is NaN
        pairs = [(a, b) for a, b in zip(d[column].tolist(), l[column].tolist())
                 if pd.notna(a) and pd.notna(b)]

        if len(pairs) < 2:
            # not enough observations to compute a correlation
            continue

        x1 = [a for a, _ in pairs]
        y1 = [b for _, b in pairs]

        for name, test in tests:
            if name not in method:
                continue

            # each test is evaluated once; [0] is the coefficient, [1] the p-value
            result = test(x1, y1)
            coefficient, p = result[0], result[1]

            if p < prob:
                counts[name] += 1

            parts = [round(coefficient, 4)]
            if significance:
                parts.append(_marker(p))
            dict_df[column][name] = ' '.join(map(str, parts))

    df = pd.DataFrame.from_dict(dict_df, orient = 'index', columns = method)

    return df, [counts['Pearson'], counts['Spearman'], counts['Kendall']]

In [None]:
def Correlation_summary(paths_list: list):
    """Build a per-election table of average correlation coefficients.

    For every ``.pb`` file the election description is assembled from its META
    section, ``Correlation(path, significance=False)`` is evaluated and the
    coefficients of each method are averaged over projects.  Coefficients that
    are NaN (missing or undefined) are ignored by the average.

    Parameters
    ----------
    paths_list : list
        Paths of Pabulib ``.pb`` files.

    Returns
    -------
    pandas.DataFrame
        One row per election with the number of projects, the mean Pearson,
        Spearman and Kendall coefficients and, for each, the count of
        significant coefficients returned by ``Correlation``.
        NOTE(review): the count columns are labelled "***", but ``Correlation``
        counts p < its ``prob`` argument (default 0.01) -- confirm the intended
        threshold.
    """
    def _mean_coefficient(column):
        # nan-aware mean: one undefined coefficient must not wipe out the average
        return np.nanmean([float(value) for value in column.tolist()])

    summary_dict = {}
    for path in paths_list:

        # only META (for the description) and the project ids are needed here
        meta = {}
        project_ids = set()
        section = ""
        with open(path, 'r', newline='', encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            for row in reader:
                if not row:
                    # blank line -- nothing to parse
                    continue
                tag = str(row[0]).strip().lower()
                if tag in ["meta", "projects", "votes"]:
                    section = tag
                    next(reader)    # skip the column header of the section
                elif section == "meta":
                    meta[row[0]] = row[1].strip()
                elif section == "projects":
                    project_ids.add(row[0])

        if 'subunit' in meta:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['subunit'] + ' ' + meta['instance']
        else:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['instance']

        table, counts = Correlation(path, significance = False)

        summary_dict[desc] = {'Number of projects': len(project_ids),
                              'Pearson': _mean_coefficient(table['Pearson']),
                              '# of Pearson *** signif. coefs.': counts[0],
                              'Spearman': _mean_coefficient(table['Spearman']),
                              '# of Spearman *** signif. coefs.': counts[1],
                              'Kendall': _mean_coefficient(table['Kendall']),
                              '# of Kendall *** signif. coefs.': counts[2]}

    return pd.DataFrame.from_dict(summary_dict, orient = 'index')

In [None]:
def normal_table(path, variant):
    """Tabulate Jarque-Bera normality test results for one distance matrix.

    For every project, the column of distances to the other projects (the
    project's own entry is left out) is tested with ``scipy.stats.jarque_bera``
    and the p-value is assigned to one of five bands.

    Parameters
    ----------
    path : str
        Path of the Pabulib ``.pb`` file.
    variant : str
        ``'jacc'`` uses the first frame returned by
        ``Clustering_from_file(path, variant='dataframe')``, ``'ling'`` the
        second one.

    Returns
    -------
    pandas.DataFrame
        A single-column frame (the column is named after the election
        description) holding the number of projects and the number of p-values
        in each band.

    Raises
    ------
    ValueError
        If ``variant`` is neither ``'ling'`` nor ``'jacc'``.
    """
    if variant == 'ling':
        frame_position = 1
    elif variant == 'jacc':
        frame_position = 0
    else:
        raise ValueError("Argument 'variant' must be one of: 'ling', 'jacc'.")

    df = Clustering_from_file(path, variant = 'dataframe')[frame_position]

    # band label -> number of p-values that fell into the band
    tally = {'p > 0.1': 0,
             '0.05 < p <= 0.1': 0,
             '0.01 < p <= 0.05': 0,
             '0.001 < p <= 0.01': 0,
             'p <= 0.001': 0}

    for position, column in enumerate(list(df.columns)):
        values = df[column].tolist()
        # distances to all *other* projects
        others = values[:position] + values[position+1:]

        if len(others) < 1:
            continue

        p = scp.stats.jarque_bera(others).pvalue

        # a NaN p-value matches none of the bands and is not counted
        if p <= 0.001:
            tally['p <= 0.001'] += 1
        elif p <= 0.01:
            tally['0.001 < p <= 0.01'] += 1
        elif p <= 0.05:
            tally['0.01 < p <= 0.05'] += 1
        elif p <= 0.1:
            tally['0.05 < p <= 0.1'] += 1
        elif p > 0.1:
            tally['p > 0.1'] += 1

    dict_for_table = {'Liczba projektów': len(list(df.columns))}
    dict_for_table.update(tally)

    dataframe = pd.DataFrame.from_dict(dict_for_table, orient = 'index')
    dataframe.columns = [Clustering_from_file(path, variant = 'desc')]

    return dataframe

In [None]:
def Hamming(A, B) -> int:
    """Return how many elements belong to exactly one of the sets A and B."""
    only_in_a = A - B
    only_in_b = B - A
    return len(only_in_a) + len(only_in_b)

def Jaccard(A, B) -> float:
    """Return the Jaccard distance of the sets A and B.

    The distance is the size of the symmetric difference divided by the size
    of the union; two empty sets are at distance 0.
    """
    union_size = len(A.union(B))
    if union_size == 0:
        return 0
    differing = len(A - B) + len(B - A)
    return differing/union_size

In [None]:
def Clustering_from_file(path, 
                         method = 'kmeans', 
                         N: int = 12, 
                         eps: float = 0.95, 
                         min_samples: int = 3, 
                         bandwidth: float = None, 
                         min_bin_freq: float = 1, 
                         show: bool = False,
                         variant = 'jacc'):
    """Read a Pabulib election file and cluster (or just describe) its projects.

    Parameters
    ----------
    path : str
        Path of the ``.pb`` data file.  For the variants ``'ling'`` and
        ``'dataframe'`` a CSV file with the linguistic distance matrix is read
        from the same path with ``.pb`` replaced by ``.csv``.
    method : str
        ``'kmeans'``, ``'dbscan'``, ``'meanshift'`` or ``'spectral'``.
    N : int
        Number of clusters (kmeans and spectral only).
    eps, min_samples
        DBSCAN parameters (dbscan only).
    bandwidth, min_bin_freq
        MeanShift parameters (meanshift only).
    show : bool
        Accepted for backward compatibility; not used by this function.
    variant : str
        ``'desc'``      -- return only the election description string;
        ``'dataframe'`` -- return ``(df, df_ling)``: the Jaccard distance frame
        computed from the votes and the linguistic distance frame from the CSV,
        both indexed by project id;
        ``'jacc'``      -- cluster the rows of the Jaccard frame;
        ``'ling'``      -- cluster the rows of the linguistic frame.

    Returns
    -------
    str, tuple or dict
        See ``variant``.  A clustering is a dict mapping cluster numbers
        ``0..K-1`` to lists of project names.  For DBSCAN, ``K`` counts the
        distinct labels including the noise label ``-1``, and noise projects
        are collected in the last cluster ``K-1``.

    Raises
    ------
    ValueError
        If ``variant`` or ``method`` is not one of the supported values.
    """
    if variant not in ['jacc', 'ling', 'dataframe', 'desc']:
        raise ValueError("Argument 'variant' must be one of: 'jacc', 'ling', 'dataframe', 'desc'.")
        
    if method not in ['kmeans', 'dbscan', 'meanshift', 'spectral']:
        raise ValueError("Argument 'method' must be one of: 'kmeans', 'dbscan', 'meanshift', 'spectral'.")
    
    # Step 1. Read the election file

    meta = {}
    projects = {}
    votes = {}
    section = ""
    header = []
    with open(path, 'r', newline='', encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if not row:
                # blank line -- nothing to parse
                continue
            tag = str(row[0]).strip().lower()
            if tag in ["meta", "projects", "votes"]:
                section = tag
                header = next(reader)
            elif section == "meta":
                meta[row[0]] = row[1].strip()
            elif section == "projects":
                projects[row[0]] = {key.strip(): row[it+1].strip() for it, key in enumerate(header[1:])}
            elif section == "votes":
                votes[row[0]] = {key.strip(): row[it+1].strip() for it, key in enumerate(header[1:])}

    if variant == 'desc':
        if 'subunit' in meta:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['subunit'] + ' ' + meta['instance']
        else:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['instance']
        return desc
    
    # Step 2. Prepare the data
    
    project_ids = list(projects.keys())

    # project id -> set of voters approving it; every ballot is split once
    projects_approval = {p: set() for p in project_ids}
    for voter, ballot in votes.items():
        for chosen in set(ballot['vote'].split(",")):
            if chosen in projects_approval:
                projects_approval[chosen].add(voter)

    # distance matrix between projects; the Jaccard distance is symmetric, so
    # only the upper triangle is computed and then mirrored
    n = len(project_ids)
    Dist_Array = np.zeros((n, n))
    for it1 in range(n):
        for it2 in range(it1 + 1, n):
            distance = Jaccard(projects_approval[project_ids[it1]], projects_approval[project_ids[it2]])
            Dist_Array[it1, it2] = distance
            Dist_Array[it2, it1] = distance

    # data frame built from the distance matrix
    df = pd.DataFrame(data=Dist_Array, index=project_ids, columns=project_ids)

    # the linguistic matrix is read only when the requested variant needs it
    df_ling = None
    if variant in ['ling', 'dataframe']:
        Dist_Array_ling = pd.read_csv(path.replace('.pb', '.csv'))
        Dist_Array_ling.index = Dist_Array_ling.columns
        df_ling = pd.DataFrame(data=Dist_Array_ling, index=project_ids, columns=project_ids)

    if variant == 'dataframe':
        # nothing is clustered in this variant: just return the distance frames
        return df, df_ling
        
    # Step 3. Cluster
    data = df if variant == 'jacc' else df_ling

    if method == 'kmeans':
        labels = KMeans(n_clusters=N).fit(data).labels_
        n_found = N

    elif method == 'spectral':
        labels = SpectralClustering(n_clusters = N, eigen_solver = 'amg').fit(data).labels_
        n_found = N

    elif method == 'meanshift':
        labels = MeanShift(bandwidth = bandwidth, min_bin_freq = min_bin_freq).fit(data).labels_
        n_found = len(set(labels))
            
    else:
        # method == 'dbscan'
        labels = DBSCAN(eps = eps, min_samples = min_samples).fit(data).labels_
        # counts the noise label -1 too, which leaves the last cluster free for noise
        n_found = len(set(labels))

    # Step 4. Return the clustering as {cluster number: [project names]}
    clusters = {k: [] for k in range(n_found)}
    for label, project in zip(labels, project_ids):
        label = int(label)
        if method == 'dbscan' and label == -1:
            # DBSCAN marks noise with -1; noise is gathered in the last cluster
            label = n_found - 1
        if label in clusters:
            clusters[label].append(projects[project]['name'])

    return clusters

In [None]:
def cut_eps(d: dict):
    """Return the first and the last key whose value is within 1 of the maximum.

    ``d`` is scanned in insertion order; a key qualifies when its value is
    strictly greater than ``max(d.values()) - 1``.  The pair
    ``(first qualifying key, last qualifying key)`` is returned.
    """
    threshold = max(d.values()) - 1
    near_best = [key for key in d.keys() if d[key] > threshold]
    return near_best[0], near_best[-1]

In [None]:
def choose_eps(path, min_eps: float = 0.5, max_eps: float = 1.5, print_iter: bool = False, depth: int = 3):
    """Search for the DBSCAN ``eps`` that yields the largest number of clusters.

    The interval ``[min_eps, max_eps]`` is scanned on a grid; after every pass
    the interval is narrowed with ``cut_eps`` to the range of values whose
    cluster count is within 1 of the best one, and the grid step is refined
    ten-fold for the next pass.

    Parameters
    ----------
    path : str
        Path of the Pabulib ``.pb`` file (clustered with the ``'jacc'`` variant).
    min_eps, max_eps : float
        Initial search interval.
    print_iter : bool
        Print progress information.
    depth : int
        Number of refinement passes (at least 1).

    Returns
    -------
    tuple
        ``(d, best_eps, minutes)``: the ``eps -> number of clusters`` mapping
        of the last pass, the ``eps`` with the most clusters in it, and the
        elapsed time in minutes rounded to 2 digits.

    Raises
    ------
    ValueError
        If ``depth`` is smaller than 1.
    """
    if depth < 1:
        raise ValueError("Argument 'depth' must be a positive integer.")

    start = time.time()

    # `depth` controls the number of passes (it used to be hard-coded to 3)
    for j in range(depth):
        if print_iter == True:
            print('Iteracja nr', j + 1)
        d = {}
        # grid step of 10**-(j+1): ten times finer on every pass
        num = int(((max_eps - min_eps)*10**(j+1))) + 1
        for i, eps in enumerate(np.linspace(start = min_eps, stop = max_eps, num = num)):
            if print_iter == True and i % 5 == 0:
                print('\t', i + 1, '/', num, ', eps =', eps)
            c = Clustering_from_file(path, method = 'dbscan', eps = eps, min_bin_freq = 2, variant = 'jacc')
            # number of clusters obtained for this eps
            d[eps] = len(c)

        min_eps, max_eps = cut_eps(d)

    stop = time.time()
    
    return d, max(d, key=d.get), round((stop - start)/60, 2)

In [None]:
def perform_random_clustering(projects, numbers = [], max_in_cluster = 8):
    """Split the project names into random clusters.

    The names (``projects[key]['name']``) are shuffled and then cut into
    consecutive chunks.

    Parameters
    ----------
    projects : dict
        Maps a project id to a dict that contains the key ``'name'``.
    numbers : list
        Sizes of the consecutive clusters.  When left empty, cluster sizes are
        drawn uniformly from ``2..max_in_cluster`` for as long as at least
        ``max_in_cluster`` names remain; the remaining names form the last
        cluster.
    max_in_cluster : int
        Upper bound of the drawn cluster sizes (used only without ``numbers``).

    Returns
    -------
    dict
        Maps the cluster number (0, 1, ...) to the list of names in it.
    """
    names = [projects[project_id]['name'] for project_id in list(projects.keys())]
    rnd.shuffle(names)

    random_clusters = {}
    # how many names have been handed out so far
    used = 0

    if numbers == []:
        cluster_no = 0
        while len(names[used:]) >= max_in_cluster:
            # size of the current cluster
            size = rnd.randint(2, max_in_cluster)
            random_clusters[cluster_no] = names[used:used+size]
            cluster_no += 1
            used += size

        # whatever is left becomes the last cluster
        random_clusters[cluster_no] = names[used:]
        return random_clusters

    for cluster_no, size in enumerate(numbers):
        random_clusters[cluster_no] = names[used:used+size]
        used += size

    return random_clusters

In [None]:
def final_assessment(path, 
                     n_iter: int = 20, 
                     n_random: int = 10, 
                     n_clusters: list = [5, 7, 9, 11, 13, 15, 17], 
                     stat = 'mean',
                     print_iter: bool = False):
    """Compare clustering methods on one election by within-cluster similarity.

    A DBSCAN clustering is computed once (with ``eps`` selected by
    ``choose_eps``).  Then, for every requested number of clusters ``k``, the
    following are repeated ``n_iter`` times and scored with
    ``assess_clustering``: k-means on the linguistic distances, k-means and
    spectral clustering on the Jaccard distances, and ``n_random`` random
    clusterings with the same cluster sizes as the k-means result.

    Parameters
    ----------
    path : str
        Path of the Pabulib ``.pb`` file.
    n_iter : int
        Repetitions per number of clusters.
    n_random : int
        Random clusterings drawn per repetition.
    n_clusters : list
        Numbers of clusters to evaluate; processing stops at the first value
        that is not smaller than the number of projects.
    stat : str
        ``'mean'``, ``'std dev'``, ``'min'`` or ``'max'``.
    print_iter : bool
        Print progress information.

    Returns
    -------
    dict
        Maps a number of clusters to a dict with the keys ``'linguistic'``,
        ``'kmeans'``, ``'spectral'``, ``'dbscan'`` and ``'random'``.  Each row
        holds the averages (rounded to 5 digits) of the runs made for that
        number of clusters only.  The DBSCAN score is stored in the row of the
        number of clusters DBSCAN produced; entries that do not apply are
        ``'NA'``.

    Raises
    ------
    ValueError
        If ``stat`` is not one of the supported values.
    """
    if stat not in ['mean', 'std dev', 'min', 'max']:
        raise ValueError("Parameter 'stat' must be one of: 'mean', 'std dev', 'min', 'max'")
    
    # only the PROJECTS section of the file is needed here
    projects = {}
    section = ""
    header = []
    with open(path, 'r', newline='', encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if not row:
                # blank line -- nothing to parse
                continue
            tag = str(row[0]).strip().lower()
            if tag in ["meta", "projects", "votes"]:
                section = tag
                header = next(reader)
            elif section == "projects":
                projects[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    projects[row[0]][key.strip()] = row[it+1].strip()

    final_summary = {}

    if print_iter == True:
        print('Klastrowanie algorytmem dbscan')
        t1 = time.time()
        
    dbscan = Clustering_from_file(path, method = 'dbscan', eps = choose_eps(path, print_iter = print_iter)[1], min_bin_freq = 2, show = False, variant = 'jacc')
    dbscan_score = assess_clustering(dbscan, stat = stat, path = path)
    n = len(dbscan.keys())
    
    if print_iter == True:
        print('Czas klastrowania dbscan:', round((time.time() - t1)/60, 2), 'minuty. Liczba klastrów:', n)
    
    for n_k, k in enumerate(n_clusters):
        
        if print_iter == True:
            print('Liczba klastrów:', k, ', zostało:', len(n_clusters[n_k:])-1)
            
        if k >= len(projects.keys()):
            if print_iter == True:
                print('Liczba projektów:', len(projects.keys()),
                     '\nNie można podzielić na', k, 'klastrów.')
            break

        # Scores are collected per value of k.  They used to accumulate across
        # all values of k, so a row also averaged the runs of every smaller k.
        summary = {'ling': [], 'kmeans': [], 'spectral': [], 'random': []}
        
        for i in range(n_iter):

            if print_iter == True:
                print('\tIteracja', i+1, '/', n_iter)

            ling = Clustering_from_file(path, method = 'kmeans', N = k, show = False, variant = 'ling')
            summary['ling'].append(assess_clustering(ling, stat = stat, path = path))
            
            kmeans = Clustering_from_file(path, method = 'kmeans', N = k, show = False, variant = 'jacc')
            summary['kmeans'].append(assess_clustering(kmeans, stat = stat, path = path))

            spectral = Clustering_from_file(path, method = 'spectral', N = k, show = False, variant = 'jacc')
            summary['spectral'].append(assess_clustering(spectral, stat = stat, path = path))

            # cluster sizes of the k-means result, reused for the random baseline
            nb = assess_clustering(kmeans, stat = 'numbers', path = path)
            
            # n_random random clusterings per k-means clustering; the baseline
            # of this iteration is the mean of their scores
            m = []
            for j in range(n_random):
                c_random = perform_random_clustering(projects, numbers = nb)
                m.append(assess_clustering(c_random, stat = stat, path = path))

            summary['random'].append(np.mean(m))

        final_summary[k] = {'linguistic': round(np.mean(summary['ling']), 5),
                            'kmeans': round(np.mean(summary['kmeans']), 5),
                            'spectral': round(np.mean(summary['spectral']), 5),
                            'dbscan': 'NA',
                            'random': round(np.mean(summary['random']), 5)}
        
    if n in final_summary:
        final_summary[n]['dbscan'] = round(dbscan_score, 5)
    else:
        # DBSCAN produced a number of clusters that was not evaluated above:
        # give it its own row with a matching random baseline
        nb1 = assess_clustering(dbscan, stat = 'numbers', path = path)  
        m1 = []
        for j in range(n_iter*n_random):
            c_random = perform_random_clustering(projects, numbers = nb1)
            m1.append(assess_clustering(c_random, stat = stat, path = path))

        final_summary[n] = {'linguistic': 'NA',
                            'kmeans': 'NA',
                            'spectral': 'NA',
                            'dbscan': round(dbscan_score, 5),
                            'random': round(np.mean(m1), 5)}
        
    return final_summary

In [None]:
def ba_summary(paths_list: list, 
               n_iter: int = 10, 
               n_random: int = 20, 
               n_clusters: list = [5, 10, 15],
               method = 'dict',
               stat = 'mean',
               print_desc: bool = False,
               print_iter: bool = False):
    """Run ``final_assessment`` for several elections and gather the results.

    Parameters
    ----------
    paths_list : list
        Paths of Pabulib ``.pb`` files.
    n_iter, n_random, n_clusters, stat, print_iter
        Forwarded unchanged to ``final_assessment``.
    method : str
        ``'dict'`` returns plain dictionaries, ``'dataframe'`` returns data
        frames built from them.
    print_desc : bool
        Print the description and timing of every processed election.

    Returns
    -------
    tuple
        ``(results, stopwatch)``.  ``results`` maps the election description to
        its table: a ``'Liczba projektów'`` row followed by the rows of
        ``final_assessment`` sorted by number of clusters.  ``stopwatch`` maps
        the description to the number of projects and the assessment time in
        minutes.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'dict'`` nor ``'dataframe'``.
    """
    print('Początek obliczeń o godzinie', time.strftime("%H:%M:%S", time.localtime()), '\n')
    before_loop = time.time()
    
    if method not in ['dict', 'dataframe']:
        raise ValueError("Argument 'method' must me one of 'dict', 'dataframe'.")
    
    final_dict = {}
    stopwatch = {}
    
    left = len(paths_list)
    for path in paths_list:
        left -= 1
        start = time.time()

        # read the META and PROJECTS sections of the file
        meta = {}
        projects = {}
        section = ""
        header = []
        with open(path, 'r', newline='', encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            for row in reader:
                if not row:
                    # blank line -- nothing to parse
                    continue
                tag = str(row[0]).strip().lower()
                if tag in ["meta", "projects", "votes"]:
                    section = tag
                    header = next(reader)
                elif section == "meta":
                    meta[row[0]] = row[1].strip()
                elif section == "projects":
                    projects[row[0]] = {}
                    for it, key in enumerate(header[1:]):
                        projects[row[0]][key.strip()] = row[it+1].strip()
                        
        after_file = time.time()
            
        if 'subunit' in meta:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['subunit'] + ' ' + meta['instance']
        else:
            desc = meta['country'] + ', ' + meta['unit'] + ', ' + meta['instance']            

        if print_desc == True:
            print('Wybory:', desc,
                  '\n\tOtwieranie pliku zajęło', 
                  round((after_file - start)*1000, 2), 
                  'milisekundy.',
                 '\n\tPozostało plików:', left)

        # The normality tables (normal_table for 'jacc' and 'ling') used to be
        # computed here, but nothing consumed them, so the two expensive calls
        # were dropped.
            
        y = {}
        y['Liczba projektów'] = {'info': len(list(projects.keys())), 
                                 'linguistic': '', 'kmeans': '', 'spectral': '', 'dbscan': '', 'random': ''}
        
        # n_random is forwarded as given (n_iter used to be passed in its place)
        x = final_assessment(path, 
                n_iter = n_iter, 
                n_random = n_random, 
                n_clusters = n_clusters,
                stat = stat,
                print_iter = print_iter)
        
        # rows ordered by number of clusters, each with an empty 'info' cell
        x = {key: x[key] for key in sorted(x.keys())}
        for key in x:
            x[key]['info'] = ''
        
        y.update(x)
        final_dict[desc] = y

        now = time.time()
        
        if print_desc == True:
            print('\tOcena klastrowania zajęła', 
                  round((now - after_file)/60, 2),
                  'minuty.\n\tOd początku minęło', 
                  round((now - before_loop)/60, 2), 
                  'minuty.\n')
            
        stopwatch[desc] = {'number of projects': len(projects.keys()), 'time': round((now - after_file)/60, 2)}
        
    if method == 'dict':
        return final_dict, stopwatch
    
    elif method == 'dataframe':
        return pd.DataFrame.from_dict({(i,j): final_dict[i][j] 
                                       for i in final_dict.keys() 
                                       for j in final_dict[i].keys()}, 
                                       orient = 'index'), pd.DataFrame.from_dict(stopwatch, orient = 'index')