In [1]:
import json
import os
import sys
from collections import defaultdict
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import scipy
import wandb

# Turn off the background grid for every matplotlib axes created afterwards.
plt.rcParams["axes.grid"] = False

# Add the parent directory to the module search path; presumably this is what
# makes the project-local `dependency_injection` package importable further down.
sys.path.append('..')

In [2]:
# Language of the runs to analyse; it is also part of the wandb run filter and
# of the heatmap titles / file names built further down.
language = 'german'

# Overrides the process command line, presumably so that the project's argument
# parsing (triggered when the container is created) sees these options.
# NOTE(review): "--device cuda" is ONE list element and sits at index 0. Parsers
# that follow the argparse convention treat sys.argv[0] as the program name and
# skip it, so this option is probably never parsed -- confirm before relying on it.
# NOTE(review): "..\\data" hard-codes a Windows path separator.
sys.argv = [
"--device cuda",
"--data-folder", "..\\data",
"--seed", "13",
"--configuration", "char-to-char",
"--language", language,
"--challenge", "named-entity-recognition"]

In [3]:
# Configure container:
# NOTE(review): this import lives below the main import cell presumably because
# it depends on the sys.path entry and the sys.argv override set up in the
# earlier cells -- confirm before hoisting it to the top.
from dependency_injection.ioc_container import IocContainer

container = IocContainer()

In [4]:
# Services resolved from the container. generate_heatmap uses plot_service to
# create and draw the heatmap, and file_service to obtain the save location.
plot_service = container.plot_service()
file_service = container.file_service()

In [5]:
# wandb entity and project names; get_wandb_runs joins them as
# '<entity>/<project>' to address the project through the wandb API.
entity = 'eval-historical-texts'
project = 'named-entity-recognition'

In [6]:
def get_wandb_runs(run_filters: dict):
    """Fetch the runs of the configured wandb project that match a filter.

    Args:
        run_filters: dictionary forwarded unchanged as the ``filters`` argument
            of ``wandb.Api().runs``.

    Returns:
        The object returned by ``wandb.Api().runs`` for the module-level
        ``entity`` / ``project`` pair.
    """
    client = wandb.Api()
    return client.runs(path=f'{entity}/{project}', filters=run_filters)

In [7]:
# Local 'wandb' directory one level above the working directory, and the names
# of its entries. generate_heatmap searches `dirnames` for an entry whose name
# contains the id of the selected run.
wandb_path = os.path.join('..', 'wandb')
dirnames = os.listdir(wandb_path)

In [8]:
def get_summary_value(run, key: str):
    """Look up ``key`` in ``run.summary``.

    Args:
        run: object exposing a mapping-like ``summary`` attribute.
        key: summary entry to read.

    Returns:
        ``run.summary[key]`` when the key is present, otherwise ``None``.
    """
    return run.summary[key] if key in run.summary.keys() else None

def get_average_score(run):
    """Average the 'Best' partial micro F1 summary scores that a run provides.

    Six summary entries are looked up through ``get_summary_value``; entries
    that are missing (``None``) are left out of the average.

    Args:
        run: object exposing a mapping-like ``summary`` attribute.

    Returns:
        The mean of the available scores, or ``nan`` when the run has none of
        them.
    """
    score_keys = (
        'Best - f1-score-micro-partial-all-component',
        'Best - f1-score-micro-partial-all-literal-coarse',
        'Best - f1-score-micro-partial-all-literal-fine',
        'Best - f1-score-micro-partial-all-metonymic-coarse',
        'Best - f1-score-micro-partial-all-metonymic-fine',
        'Best - f1-score-micro-partial-all-nested',
    )

    available_scores = []
    for score_key in score_keys:
        score = get_summary_value(run, score_key)
        if score is not None:
            available_scores.append(score)

    if not available_scores:
        # np.mean([]) also yields nan, but only after emitting a
        # "Mean of empty slice" RuntimeWarning; make the empty case explicit.
        return float('nan')

    return np.mean(available_scores)

def normalize(x):
    """Shift a 2-D array by the magnitude of its minimum and scale each row to sum to 1.

    ``abs(min(x))`` is added to every entry (for an array whose minimum is
    negative this moves the minimum to zero); each row is then divided by its
    own sum. Rows whose shifted sum is zero are returned as all zeros instead
    of producing NaN/inf through a division by zero.

    Args:
        x: 2-D array-like of numbers.

    Returns:
        A new float array with the same shape as ``x``.
    """
    values = np.asarray(x, dtype=float)
    shifted = values + abs(np.min(values))
    row_sums = shifted.sum(axis=1, keepdims=True)

    # Only divide where the row sum is non-zero; the remaining rows keep the
    # zeros that `out` was initialised with.
    normalized_array = np.divide(
        shifted,
        row_sums,
        out=np.zeros_like(shifted),
        where=row_sums != 0)
    return normalized_array

def _select_best_run(runs):
    """Return the run with the highest average score, or None when `runs` is empty."""
    best_run = None
    best_avg_score = None
    first_run = None

    for run in runs:
        if first_run is None:
            first_run = run

        avg_score = get_average_score(run)
        # NaN compares False against everything: without this guard a NaN seen
        # first would be kept as "best" and never replaced by a real score.
        if np.isnan(avg_score):
            continue

        if best_avg_score is None or avg_score > best_avg_score:
            best_run = run
            best_avg_score = avg_score

    # When no run has a usable score, fall back to the first run.
    return best_run if best_run is not None else first_run


def _build_transition_matrix(matrix_json):
    """Convert a loaded table (its 'data' rows) into a square matrix and a label -> index map."""
    entries = matrix_json['data']

    # The matrix side length is the number of distinct labels in the first column.
    # NOTE(review): special labels that are always skipped below are still
    # counted here, so the matrix can be larger than the number of labels
    # collected in `entity_keys` -- confirm this is intended.
    unique_keys_count = len({entry[0] for entry in entries})
    run_matrix = np.zeros((unique_keys_count, unique_keys_count))
    entity_keys = {}

    for entry in entries:
        first_label, second_label, matrix_value = entry[0], entry[1], entry[2]

        involves_padding = first_label == '[PAD]' or second_label == '[PAD]'
        is_sep_cls_pair = first_label == '[SEP]' and second_label == '[CLS]'
        if involves_padding or is_sep_cls_pair or second_label == '[SEP]' or first_label == '[CLS]':
            continue

        # Labels get consecutive indices in order of first appearance.
        entity_keys.setdefault(first_label, len(entity_keys))
        entity_keys.setdefault(second_label, len(entity_keys))

        # Row index comes from the second column, column index from the first.
        run_matrix[entity_keys[second_label], entity_keys[first_label]] = matrix_value

    return run_matrix, entity_keys


def generate_heatmap(runs, filename_unique_str: str):
    """Plot the literal-coarse CRF transition matrix of the best run as a heatmap.

    The run with the highest ``get_average_score`` is selected. Within that run,
    the history row with the highest
    'Validation - f1-score-micro-partial-all-literal-coarse' value determines
    which logged matrix file is read from the local wandb directory
    (module-level ``wandb_path`` / ``dirnames``). The matrix is row-normalized
    with ``normalize`` and drawn through ``plot_service``.

    Args:
        runs: iterable of wandb runs to choose from.
        filename_unique_str: used as the plot title and inside the output file name.

    Raises:
        ValueError: if ``runs`` is empty or the selected run has no history rows.
        Exception: if no local folder name contains the selected run's id.
    """
    run = _select_best_run(runs)
    if run is None:
        raise ValueError(f'No runs available for heatmap {filename_unique_str}')

    matrix_key = 'CRF Transition matrix - literal-coarse'
    score_key = 'Validation - f1-score-micro-partial-all-literal-coarse'
    run_history = run.history(keys=[matrix_key, score_key], pandas=False)

    # NOTE(review): depending on the wandb version, history(keys=..., pandas=False)
    # appears to return either the rows themselves or a one-element list wrapping
    # them; accept both shapes.
    history_rows = run_history
    if len(run_history) > 0 and isinstance(run_history[0], (list, tuple)):
        history_rows = run_history[0]

    if len(history_rows) == 0:
        raise ValueError(f'Run {run.id} has no history rows for {matrix_key}')

    f1_scores = [row[score_key] for row in history_rows]
    best_iteration = np.argmax(f1_scores)

    # The logged path uses '/' separators; normpath converts it to the local
    # platform's separator instead of assuming Windows.
    matrix_relative_path = os.path.normpath(history_rows[best_iteration][matrix_key]['path'])

    run_folders = [x for x in dirnames if run.id in x]
    if len(run_folders) == 0:
        raise Exception(f'No folder found for run {run.id}')

    full_path = os.path.join(wandb_path, run_folders[0], matrix_relative_path)
    with open(full_path, 'r', encoding='utf-8') as matrix_file:
        matrix_json = json.load(matrix_file)

    run_matrix, entity_keys = _build_transition_matrix(matrix_json)
    normalized_matrix = normalize(run_matrix)

    ax = plot_service.create_plot()
    plot_service.plot_heatmap(
        values=normalized_matrix,
        labels=entity_keys.keys(),
        vmin=0,
        vmax=1,
        x_title='To',
        y_title='From',
        ax=ax,
        save_path=file_service.get_experiments_path(),
        title=filename_unique_str,
        filename=f'heatmap-{filename_unique_str}-coarse')

In [9]:
# Base run filter shared by every query. The driver loop copies it and then
# applies one entry of `available_filters` on top.
# NOTE(review): the '$gt' operator and the 'YYYYMMDDhhmmss'-looking value
# presumably select runs created after 2020-07-10 -- confirm against the wandb
# filter syntax.
main_config = {
    'createdAt': {
        '$gt': '20200710000000'
    },
    'state': 'finished',

    'config.language': language,
    'config.hidden_dimension': 512,
    'config.embeddings_size': 64,
    'config.include_pretrained_model': False,
    'config.include_fasttext_model': False,
    'config.dropout': 0.8,
    'config.learn_new_embeddings': True,
    'config.fine_tune_pretrained': False,
    'config.split_type': 'multi-segment'
}

# Per-configuration overrides of `main_config`. The dictionary key names the
# configuration and ends up in the heatmap title and output file name.
available_filters = {
    'none': {},
    'fast-text': {
        'config.include_fasttext_model': True
    },
    'both': {
        'config.include_pretrained_model': True,
        'config.include_fasttext_model': True
    },
    'bert': {
        'config.hidden_dimension': 256,
        'config.include_pretrained_model': True
    },
    'bert-no-new': {
        'config.hidden_dimension': 256,
        'config.include_pretrained_model': True,
        'config.learn_new_embeddings': False
    },
    'both-finetune': {
        'config.include_pretrained_model': True,
        'config.include_fasttext_model': True,
        'config.fine_tune_pretrained': True
    },
    'bert-finetune': {
        'config.hidden_dimension': 256,
        'config.include_pretrained_model': True,
        'config.fine_tune_pretrained': True
    },
    'bert-no-new-finetune': {
        'config.hidden_dimension': 256,
        'config.include_pretrained_model': True,
        'config.learn_new_embeddings': False,
        'config.fine_tune_pretrained': True
    }
}

In [10]:
# One heatmap per configuration: the configuration-specific overrides are merged
# over a fresh copy of the base filter, the matching runs are fetched, and the
# heatmap is generated under the name '<language>-<configuration>'.
for unique_str, available_filter in available_filters.items():
    run_filter = {**deepcopy(main_config), **available_filter}
    runs = get_wandb_runs(run_filter)
    generate_heatmap(runs, f'{language}-{unique_str}')


FileNotFoundError: [Errno 2] No such file or directory: '..\\wandb\\run-20200712_110725-bkfbyjhy\\media\\table\\CRF Transition matrix - literal-coarse_1140_8a06f55c.table.json'

1.0000000500000001