# 🧪 Experiment: Setting Combination
This notebook conducts experiments on combinations of different settings, including different datasets, gazetteers, and models.

**Note**: Before conducting experiments, you need to install the `kaner` package first. Otherwise, this notebook will raise an *import error*.

```bash
cd ../
python setup.py install
```

In [None]:
import os
import pprint
from typing import List
from datetime import datetime
from collections import defaultdict

import matplotlib.pyplot as plt

from kaner.pipeline import train
from kaner.context import GlobalContext as gctx
from kaner.trainer import TrainerConfig
from kaner.tracker import NERTrackerRow, NERTracker
from kaner.common.func import query_time
from kaner.common import save_json

## 1 Define `trainall` Function

In [None]:
def trainall(labpath: str, cfgdir: str, m: List[str], d: List[str], g: List[str], n: int, tag: str, **kwargs) -> NERTracker:
    """
    Conduct experiments on every combination of the requested models, datasets, and gazetteers.

    Each (dataset, model, gazetteer) setting is trained until the tracker holds `n` matching
    rows for it under `tag`. Rows already stored in `labpath` count towards `n`, so settings
    that are already complete are skipped. The tracker is written back to `labpath` after
    every successful run.

    Args:
        labpath (str): The file path of recording experimental results.
        cfgdir (str): Configuration folder; `<model>.yml` is read from it for each model.
        m (List[str]): All specific models to be trained. Empty means every available model.
        d (List[str]): All specific datasets to be tested. Empty means every available dataset.
        g (List[str]): All specific gazetteers to be tested. Empty means every available gazetteer.
        n (int): The number of training repeating times.
        tag (str): Experimental tags.
        **kwargs: Extra keyword arguments forwarded to `TrainerConfig`.

    Returns:
        NERTracker: The tracker loaded from `labpath`, including rows inserted by this call.

    Raises:
        ValueError: If a requested model, dataset, or gazetteer name is not available.
    """

    def update_names(names: List[str], all_names: List[str], name_type: str) -> List[str]:
        """
        Check whether the names that the user inputs are correct.

        Args:
            names (List[str]): The names (dataset, model, gazetteer) that user inputs.
            all_names (List[str]): All names (dataset, model, gazetteer) that this library provides.
            name_type (str): The type of the name (Dataset, Model, Gazetteer).

        Returns:
            List[str]: `all_names` when `names` is empty (or None), otherwise `names` unchanged.

        Raises:
            ValueError: If any entry of `names` is not contained in `all_names`.
        """
        if not names:
            return all_names
        for name in names:
            if name not in all_names:
                # Raise instead of calling exit(0): exit() is an interactive helper, reports a
                # success status, and is not a reliable way to stop a cell in a notebook kernel.
                raise ValueError("[{0}] {1} is not in {2}".format(name_type, name, all_names))
        return names

    tracker = NERTracker.load(labpath)
    models = update_names(m, gctx.get_model_names(), "Model")
    datasets = update_names(d, gctx.get_dataset_names(), "Dataset")
    gazetteers = update_names(g, gctx.get_gazetteer_names(), "Gazetteer")

    print("--------------------- Experiment Configuration ---------------------")
    print("Models: {0}".format(models))
    print("Datasets: {0}".format(datasets))
    print("Gazetteers: {0}".format(gazetteers))
    print("--------------------------------------------------------------------")

    for dataset in datasets:
        for model in models:
            for gazetteer_model in gazetteers:
                # Avoid repeated computation: these models are always recorded under "gigaword",
                # so every further gazetteer iteration finds the quota already met and is skipped.
                if model in ["blcrf", "plmtg"]:
                    gazetteer_model = "gigaword"
                for _ in range(n):
                    finished = tracker.query(dataset=dataset, model=model, gazetteer_model=gazetteer_model, tag=tag)
                    if len(finished) >= n:
                        # Quota reached; the remaining iterations could only skip as well.
                        break
                    config = TrainerConfig(os.path.join(cfgdir, model + ".yml"), dataset=dataset, gazetteer_model=gazetteer_model, **kwargs)
                    start = str(datetime.now())
                    try:
                        results, trainer = train(config)
                    except RuntimeError as error:
                        # Best effort: report the failure and use up this attempt.
                        print(error)
                        continue
                    tracker.insert(
                        NERTrackerRow(
                            start, model, dataset, config.tokenizer_model, gazetteer_model, config.output_folder, query_time(trainer.train),
                            results["f1-score"], results["precision-score"], results["recall-score"], results["epoch_count"], results["test-loss"], tag
                        )
                    )
                    # Persist after every run so finished work survives an interruption.
                    tracker.save(labpath)
                    del trainer

    return tracker

## 2 Given models, datasets, gazetteers, train them
You can find all available models, datasets, and gazetteers with the following code block.

```python
models = gctx.get_model_names()
datasets = gctx.get_dataset_names()
gazetteers = gctx.get_gazetteer_names()
```

In [None]:
# Where results are recorded, and the folder holding one `<model>.yml` per model.
cfgdir = "../configs"
labpath = "../data/logs/experiments.csv"

# Experimental grid: every dataset is combined with every model and gazetteer.
datasets = ["weiboner", "resumener", "ecommerce", "msraner", "ontonotes"]
models = ["blcrf", "ses", "cgn", "mdgg"]
gazetteers = ["gigaword", "sgns"]

# Repetitions per setting, and the tag under which the runs are recorded.
n, tag = 5, "nil"

# Extra options forwarded to the trainer configuration.
kwargs = dict(data_folder="../data")

trainall(labpath, cfgdir, m=models, d=datasets, g=gazetteers, n=n, tag=tag, **kwargs)

## 3 Result Summary and Visualization

In [None]:
content_colors = ["#FDEDEC", "#F4ECF7", "#EBF5FB", "#E8F6F3", "#FEF9E7", "#EAECEE"]
edge_colors    = ["#EC7063", "#A569BD", "#5DADE2", "#45B39D", "#F4D03F", "#566573"]

# configuration
log_folder = "../data/logs/"
labpath = "../data/logs/experiments.csv"
tracker = NERTracker.load(labpath)
# NOTE(review): `summay` looks like a misspelling of `summary`; kept because it is the
# method name this notebook calls on the tracker - confirm against kaner.tracker.
results = tracker.summay()
save_json(results, os.path.dirname(labpath), "experimental_summary.json")

# Group the summary rows by dataset so that each dataset gets its own figure.
reports = defaultdict(list)
for result in results:
    reports[result["dataset"]].append(result)


def percent_label(value):
    """Format a score as a percentage string rounded to two decimals."""
    return str(round(value * 100, 2))


def plain_label(value):
    """Format a raw value as a string rounded to three decimals."""
    return str(round(value, 3))


# One panel per metric: (key in a summary row, title template, bar-label formatter).
panels = [
    ("f1_score", "F1 Score on {0}", percent_label),
    ("precision_score", "Precision Score on {0}", percent_label),
    ("recall_score", "Recall Score on {0}", percent_label),
    ("test_loss", "Test Loss on {0}", plain_label),
    ("time_per_epoch", "Time Per Epoch on {0}", plain_label),
]

for index, (dataset, report) in enumerate(reports.items()):
    print("\x1b[6;30;42m#{0} Dataset: {1}\x1b[0m".format(index, dataset))

    # Bars are labelled "<model>-<gazetteer>"; for blcrf/plmtg only the model part is shown.
    labels = ["{0}-{1}".format(row["model"], row["gazetteer_model"]) for row in report]
    labels = [
        label.split("-")[0] if label.startswith(("blcrf", "plmtg")) else label
        for label in labels
    ]

    fig, axs = plt.subplots(1, len(panels), sharey=True)
    fig.set_figwidth(25)

    for ax, (key, title, to_label) in zip(axs, panels):
        values = [row[key] for row in report]
        ax.barh(labels, values, color=content_colors, edgecolor=edge_colors)
        ax.set_title(title.format(dataset))
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)
        # Write each value just past the end of its bar.
        for position, value in enumerate(values):
            ax.text(value + 0.01, position - 0.05, to_label(value))

    # Lay out only after the final width is set and all content is drawn, so the margins
    # account for the real tick labels, titles, and value annotations.
    fig.tight_layout(pad=0.05)

    # Save before showing, then close so figures do not accumulate across datasets.
    fig.savefig(os.path.join(log_folder, "{0}_chart.pdf".format(dataset)))
    plt.show()
    plt.close(fig)