In [1]:
import os
import sys
import pickle
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import numpy as np

sys.path.append('/home/illusionww/Documents/GitHub/pygraphs')
from pygraphs.measure import kernels

# kkmeans init experiments, results

In [2]:
# Directory with the cached experiment results; the cells below read files
# named '<column>_<kernel name>_results.pkl' from it.
CACHE_ROOT = './kkmeans_init_experiments2/by_column_and_kernel'

# Graph-generation setups, one tuple per result column. The rest of the
# notebook unpacks each tuple as (n, k, p_in, p_out).
# NOTE(review): presumably node count, cluster count and intra-/inter-cluster
# edge probabilities -- confirm against the code that generated the cache.
columns = [
    (100, 2, 0.2, 0.05),
    *((100, 2, 0.3, p_out) for p_out in (0.05, 0.1, 0.15)),
    (102, 3, 0.3, 0.1),
    *((100, 4, 0.3, p_out) for p_out in (0.1, 0.15)),
    *((200, 2, 0.3, p_out) for p_out in (0.05, 0.1, 0.15)),
    (201, 3, 0.3, 0.1),
    *((200, 4, 0.3, p_out) for p_out in (0.1, 0.15)),
]

In [3]:
# Load the cached results of one (setup, kernel) pair to look at their layout:
# the first setup from `columns` and the second kernel from `kernels`.
kernel = kernels[1]
n, k, p_in, p_out = columns[0]
column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
with open(f'{CACHE_ROOT}/{column_str}_{kernel.name}_results.pkl', 'rb') as f:
    data = pickle.load(f)

In [4]:
# Show one cached run record: first graph, parameter key 0.0, run index 5.
data[0][0.0][5]

{'labels': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'inertia': array(nan, dtype=float32),
 'init': 'one',
 'score_ari': 0.0,
 'score_nmi': -2.0}

## Experiment 1: инициализация, обгоняющая на большинстве параметров

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа, сколько побед у каждого типа инициализаций (one, all, k-means++), выбираем победителя для графа. После этого считаем статистику для всех графов, выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [5]:
def choose_init(param_results, choose_by_measure='score_ari', request_measure='score_ari'):
    """Pick the best run per initialization type and report one of its measures.

    Args:
        param_results: sequence of run records (dicts). Every record must
            provide the `choose_by_measure` and `request_measure` entries, and
            an 'init' entry naming the initialization that produced it.
        choose_by_measure: key used to rank runs. 'inertia' is minimized; any
            other key is maximized.
        request_measure: key whose value is reported for the winning run.

    Returns:
        dict with the keys 'one', 'all', 'k-means++' and 'any' ('any' ranks
        all runs regardless of their init). Each value is `request_measure`
        of the winning run, or NaN when no run beat the initial sentinel
        (e.g. no runs of that type, or only NaN ranking values). On ties the
        earliest run wins.
    """
    lower_is_better = choose_by_measure == 'inertia'

    def best_request_among(runs):
        # Start from the worst possible score so the first comparable run wins.
        incumbent = np.inf if lower_is_better else -np.inf
        reported = np.nan
        for run in runs:
            score, value = run[choose_by_measure], run[request_measure]
            # Strict comparison: NaN scores never win and ties keep the earlier run.
            improved = incumbent > score if lower_is_better else incumbent < score
            if improved:
                incumbent, reported = score, value
        return reported

    selected = {}
    for init_name in ('one', 'all', 'k-means++', 'any'):
        if init_name == 'any':
            pool = param_results
        else:
            pool = [run for run in param_results if run['init'] == init_name]
        selected[init_name] = best_request_among(pool)
    return selected

def exp1(column, kernel):
    """Count, over all cached graphs, which init type wins on the most parameters.

    For every parameter of every graph, `choose_init` selects the
    lowest-inertia run of each init type and reports its ARI. Init types whose
    ARI equals the best one for that parameter get a win (ties reward all of
    them). Within a graph, the init types with the most wins get one point.

    Args:
        column: graph setup tuple, unpacked as (n, k, p_in, p_out).
        kernel: object whose `name` attribute selects the cached results file.

    Returns:
        defaultdict mapping 'one', 'all', 'k-means++' and 'any' to the number
        of graphs on which that init type had the most wins.
    """
    choose_by_measure, request_measure = 'inertia', 'score_ari'
    init_names = ('one', 'all', 'k-means++', 'any')

    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    with open(f'{CACHE_ROOT}/{column_str}_{kernel.name}_results.pkl', 'rb') as f:
        data = pickle.load(f)

    # The requested measure is minimized only when it is inertia.
    reduce_best = np.nanmin if request_measure == 'inertia' else np.nanmax

    all_graphs_stat = defaultdict(lambda: 0)
    for graph_results in data:
        graph_stat = defaultdict(lambda: 0)
        for param_results in graph_results.values():
            results = choose_init(param_results, choose_by_measure, request_measure)
            best_val = reduce_best(list(results.values()))
            for init_name in init_names:
                graph_stat[init_name] += int(results[init_name] == best_val)
        top_wins = max(graph_stat.values())
        for init_name in init_names:
            all_graphs_stat[init_name] += int(graph_stat[init_name] == top_wins)
    return all_graphs_stat

In [71]:
# In-memory memo of experiment results, filled by `request_cache`.
cached = {}

def request_cache(column, kernel, func=exp1):
    """Return `func(column, kernel)`, computing it at most once per (func, column, kernel).

    Args:
        column: graph setup tuple, unpacked as (n, k, p_in, p_out).
        kernel: object with a `name` attribute.
        func: experiment function called as `func(column, kernel)` on a miss.

    Returns:
        The (possibly memoized) result of `func(column, kernel)`.
    """
    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    # `func` is part of the key: otherwise a result cached for one experiment
    # would be returned for a different experiment on the same column/kernel.
    key = (func, f'{column_str}_{kernel.name}')

    if key not in cached:
        cached[key] = func(column, kernel)

    return cached[key]

In [72]:
# Write the Experiment 1 win counts as a tab-separated table: one row per
# kernel, four cells (one / all / k-means++ / any) per graph setup.
with open('inertia.csv', 'w') as f:
    # Header row 1: the setup label, followed by padding so it spans four cells.
    f.write('Inertia\t')
    for column in columns:
        n, k, p_in, p_out = column
        column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
        f.write(f'{column_str}\t\t\t\t')
    f.write('\n\t')
    # Header row 2: the init names, repeated once per setup.
    f.write('one\tall\tk-means++\tany\t' * len(columns))
    f.write('\n')

    for kernel in tqdm(kernels):
        f.write(f'{kernel.name}\t')
        for column in columns:
            # Name the experiment explicitly so the table does not depend on
            # which `request_cache` definition (and default) is currently live.
            wins = request_cache(column, kernel, func=exp1)
            f.write(f"{wins['one']}\t{wins['all']}\t{wins['k-means++']}\t{wins['any']}\t")
        f.write('\n')
        # Flush per kernel so partial results are on disk during long runs.
        f.flush()

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))






## Experiment 2: инициализация, дающая наилучший результат

Для каждого графа и каждого типа инициализации (one, all, k-means++, а также any — любая из них) **для каждого параметра** выбираем лучший запуск по inertia и берём максимальный ARI по всем параметрам. Победителем для графа считается инициализация с наибольшим таким ARI. После этого считаем статистику для всех графов (число побед и средний ARI), выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [14]:
def choose_param(graph_results, init_name='one', choose_by_measure='inertia', request_measure='score_ari'):
    """Return the best `request_measure` over all parameters of one graph for one init type.

    For each parameter, `choose_init` selects the run of `init_name` that is
    best by `choose_by_measure` and reports its `request_measure`; the best of
    these per-parameter values is returned.

    Args:
        graph_results: mapping from parameter value to that parameter's run
            records (dicts with an 'init' entry and the two measure entries).
        init_name: 'one', 'all', 'k-means++' or 'any' (all runs).
        choose_by_measure: key used to pick the run within a parameter.
        request_measure: key of the reported value. 'inertia' is minimized
            across parameters, anything else is maximized.

    Returns:
        The best per-parameter value, ignoring NaNs; NaN if there are no
        parameters or every per-parameter value is NaN.
    """
    # `choose_init` already restricts the runs to `init_name`, so no
    # pre-filtering is needed here.
    per_param = [
        choose_init(param_results, choose_by_measure, request_measure)[init_name]
        for param_results in graph_results.values()
    ]
    values = np.asarray(per_param, dtype=float)
    # np.nanmax/np.nanmin raise on empty input and warn on all-NaN input.
    if values.size == 0 or np.all(np.isnan(values)):
        return np.nan
    # Lower is better only for inertia, matching how `choose_init` ranks runs.
    return np.nanmin(values) if request_measure == 'inertia' else np.nanmax(values)
        
def exp2(column, kernel):
    """Compare init types by the best ARI each one reaches on every cached graph.

    For every graph and init type, `choose_param` returns the best ARI over
    all parameters (runs are picked by lowest inertia within a parameter).
    The init types that reach the graph's best ARI get one vote each.

    Args:
        column: graph setup tuple, unpacked as (n, k, p_in, p_out).
        kernel: object whose `name` attribute selects the cached results file.

    Returns:
        (all_graphs_stat, all_graphs_avg): two defaultdicts keyed by 'one',
        'all', 'k-means++' and 'any'. The first holds the number of graphs
        won, the second the NaN-ignoring mean of the per-graph best ARI.
    """
    choose_by_measure, request_measure = 'inertia', 'score_ari'
    init_names = ('one', 'all', 'k-means++', 'any')

    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    with open(f'{CACHE_ROOT}/{column_str}_{kernel.name}_results.pkl', 'rb') as f:
        data = pickle.load(f)

    all_graphs_stat = defaultdict(lambda: 0)
    all_graphs_avg = defaultdict(lambda: [])
    for graph_results in data:
        graph_stat = {
            init_name: choose_param(graph_results, init_name=init_name,
                                    choose_by_measure=choose_by_measure,
                                    request_measure=request_measure)
            for init_name in init_names
        }
        for init_name in init_names:
            all_graphs_avg[init_name].append(graph_stat[init_name])

        # The built-in max() is order-dependent when NaN is present: a NaN in
        # the first position would be returned as the "maximum" and no init
        # type would get a vote. Use a NaN-aware maximum instead.
        scores = np.array([graph_stat[init_name] for init_name in init_names], dtype=float)
        best_score = np.nan if np.all(np.isnan(scores)) else np.nanmax(scores)
        for init_name in init_names:
            all_graphs_stat[init_name] += int(graph_stat[init_name] == best_score)

    for init_name in init_names:
        all_graphs_avg[init_name] = np.nanmean(all_graphs_avg[init_name])

    return all_graphs_stat, all_graphs_avg

In [15]:
# In-memory memo of experiment results, filled by `request_cache`.
cached = {}

def request_cache(column, kernel, func=exp2):
    """Return `func(column, kernel)`, computing it at most once per (func, column, kernel).

    Args:
        column: graph setup tuple, unpacked as (n, k, p_in, p_out).
        kernel: object with a `name` attribute.
        func: experiment function called as `func(column, kernel)` on a miss.

    Returns:
        The (possibly memoized) result of `func(column, kernel)`.
    """
    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    # `func` is part of the key: otherwise a result cached for one experiment
    # would be returned for a different experiment on the same column/kernel.
    key = (func, f'{column_str}_{kernel.name}')

    if key not in cached:
        cached[key] = func(column, kernel)

    return cached[key]

In [16]:
# Write both Experiment 2 tables (vote counts and mean ARI) as tab-separated
# files: one row per kernel, four cells (one / all / k-means++ / any) per
# graph setup. The two files share their layout and differ only in which half
# of the `exp2` result they print.
for out_name, table in (('exp2_vote.csv', 'vote'), ('exp2_ari.csv', 'ari')):
    with open(out_name, 'w') as f:
        # Header row 1: the setup label, followed by padding so it spans four cells.
        f.write('Inertia\t')
        for column in columns:
            n, k, p_in, p_out = column
            column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
            f.write(f'{column_str}\t\t\t\t')
        f.write('\n\t')
        # Header row 2: the init names, repeated once per setup.
        f.write('one\tall\tk-means++\tany\t' * len(columns))
        f.write('\n')

        for kernel in tqdm(kernels):
            f.write(f'{kernel.name}\t')
            for column in columns:
                # Name the experiment explicitly so the tables do not depend on
                # which `request_cache` definition (and default) is currently live.
                vote, ari = request_cache(column, kernel, func=exp2)
                stats = vote if table == 'vote' else ari
                f.write(f"{stats['one']}\t{stats['all']}\t{stats['k-means++']}\t{stats['any']}\t")
            f.write('\n')
            # Flush per kernel so partial results are on disk during long runs.
            f.flush()

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))




HBox(children=(IntProgress(value=0, max=21), HTML(value='')))


