In [39]:
import pandas as pd
import os
import yaml
import torch
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib
# Notebook-wide plotting defaults: ggplot theme first (a style sheet may set its
# own figure size), then the overrides for wide figures and sans-serif text.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 6),
    'font.family': 'sans-serif',
})

In [40]:
def compute_sparsity(model_fn, threshold=10e-3):
    """Count the near-zero entries of every tensor stored in a checkpoint.

    Parameters
    ----------
    model_fn : str
        Path of a file written with ``torch.save`` that holds a mapping of
        names to tensors (for example a ``state_dict``).
    threshold : float
        An entry counts as "zero" when its absolute value is ``<= threshold``.
        Note that the default ``10e-3`` is ``0.01``, not ``0.001``.

    Returns
    -------
    tuple
        ``(is_zero, non_zero, ratio)`` where ``ratio`` is
        ``is_zero / (is_zero + non_zero)``. For a checkpoint without any
        entries the ratio is reported as ``0.0`` instead of dividing by zero.
    """
    # NOTE: torch.load unpickles the file, so only point this at trusted
    # checkpoints. map_location keeps CUDA-saved checkpoints loadable on
    # machines without a GPU.
    state = torch.load(model_fn, map_location='cpu')

    is_zero = 0
    total = 0
    for tensor in state.values():
        m = tensor.detach().cpu().numpy()
        # count_nonzero also handles 0-d tensors, which np.where()[0] does not.
        is_zero += int(np.count_nonzero(np.abs(m) <= threshold))
        total += m.size

    non_zero = total - is_zero
    ratio = is_zero / total if total else 0.0
    return is_zero, non_zero, ratio
    
    
def get_min_loss(row):
    """Annotate ``row`` with the losses at the point of lowest dev loss.

    Reads the sequences ``row['dev_loss']`` and ``row['train_loss']``, finds
    the first position where the dev loss is minimal, and stores the dev and
    train loss at that position under ``'min_dev_loss'`` and
    ``'min_train_loss'``. The row is modified in place and returned.
    """
    dev_losses = row['dev_loss']
    # min() keeps the earliest index on ties.
    best = min(range(len(dev_losses)), key=lambda i: dev_losses[i])
    best_dev = dev_losses[best]
    best_train = row['train_loss'][best]
    row['min_dev_loss'] = best_dev
    row['min_train_loss'] = best_train
    return row
    
    
def extract_language_name(field):
    """Strip the directory and the trailing hyphen-separated parts of a path.

    The file name is the text after the last ``/``. When that name contains
    the substring ``'dev'`` the last hyphen-separated part is dropped,
    otherwise the last two parts are dropped; the remaining parts are joined
    back with ``-``.
    """
    fn = field.rsplit('/', 1)[-1]
    pieces = fn.split('-')
    n_suffix = 1 if 'dev' in fn else 2
    return '-'.join(pieces[:-n_suffix])
    

def load_res_dir(basedir, include_sparsity=False, sparsity_threshold=10e-4):
    """Collect the experiments stored under ``basedir`` into a DataFrame.

    Every sub-directory of ``basedir`` is one experiment. A directory is used
    only if it contains ``result.yaml``; its ``config.yaml`` and
    ``result.yaml`` are merged into one record (result keys win on clashes).
    The float stored in ``dev.word_accuracy`` becomes ``dev_acc``; when that
    file is missing a message is printed and the run is dropped later on.

    Parameters
    ----------
    basedir : str
        Directory whose sub-directories are the experiments.
    include_sparsity : bool
        When true, run ``compute_sparsity`` on each experiment's ``model``
        file and add the ``sparsity`` tuple and ``sparsity_ratio`` columns.
    sparsity_threshold : float
        Threshold handed to ``compute_sparsity``. The default ``10e-4``
        (that is ``0.001``) is the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per experiment that has both ``dev_acc`` and ``dev_loss``,
        with the added columns ``language``, ``min_dev_loss``,
        ``min_train_loss`` and ``train_size``. Rows follow the sorted
        directory names. If no experiment is found the frame is empty.
    """
    # yaml.load() without an explicit Loader raises TypeError on PyYAML >= 6.
    # FullLoader is what PyYAML 5.x used implicitly; older releases only have
    # Loader. NOTE(review): these files are assumed to be trusted local
    # experiment output -- switch to yaml.safe_load if they are plain data.
    yaml_loader = getattr(yaml, "FullLoader", yaml.Loader)

    # Skip stray files, and sort so the row order does not depend on the
    # arbitrary order in which the OS lists the directory.
    subdirs = sorted(
        (entry for entry in os.scandir(basedir) if entry.is_dir()),
        key=lambda entry: entry.name,
    )

    records = []
    for subdir in subdirs:
        res_fn = os.path.join(subdir.path, "result.yaml")
        if not os.path.exists(res_fn):
            # No result file: skip before parsing anything.
            continue
        exp_d = {}
        with open(os.path.join(subdir.path, "config.yaml")) as f:
            # An empty YAML document loads as None.
            exp_d.update(yaml.load(f, Loader=yaml_loader) or {})
        with open(res_fn) as f:
            exp_d.update(yaml.load(f, Loader=yaml_loader) or {})
        dev_acc_path = os.path.join(subdir.path, "dev.word_accuracy")
        if os.path.exists(dev_acc_path):
            with open(dev_acc_path) as f:
                exp_d['dev_acc'] = float(f.read())
        else:
            print("Dev accuracy file does not exist in dir: {}".format(subdir.path))
            # Guarantee the column exists even if no run has an accuracy file.
            exp_d.setdefault('dev_acc', float('nan'))
        if include_sparsity:
            exp_d['sparsity'] = compute_sparsity(
                os.path.join(subdir.path, "model"), sparsity_threshold)
        records.append(exp_d)

    experiments = pd.DataFrame(records)
    if experiments.empty:
        return experiments

    if include_sparsity:
        experiments['sparsity_ratio'] = experiments['sparsity'].apply(lambda x: x[2])
    experiments['language'] = experiments['dev_file'].apply(extract_language_name)

    # Drop unusable rows *before* get_min_loss, which cannot iterate over a
    # missing (NaN) dev_loss.
    usable = experiments['dev_acc'].notnull() & experiments['dev_loss'].notnull()
    experiments = experiments[usable].copy()
    if not experiments.empty:
        experiments = experiments.apply(get_min_loss, axis=1)
    experiments['train_size'] = experiments['train_file'].apply(lambda fn: fn.split('-')[-1])
    return experiments

In [41]:
%%time
# Load every luong result directory and stack the frames into one.
luong = pd.concat([
    load_res_dir(exp_dir)
    for exp_dir in (
        "../exps/task1/luong/",
        "../exps/task1/reverse_luong/",
        "../exps/task1/luong_new/",
        "../exps/task1/reverse_luong_new/",
    )
])

CPU times: user 39.2 s, sys: 196 ms, total: 39.4 s
Wall time: 40.6 s


In [42]:
# (rows, columns) of the combined results frame.
luong.shape

(2157, 44)

In [43]:
def extract_dataset_name(row):
    """Add a ``'dataset'`` entry of the form ``<language>:<train_size>``.

    Reads ``row['language']`` and ``row['train_size']``, writes the combined
    label to ``row['dataset']`` in place and returns the same row.
    """
    language, train_size = row['language'], row['train_size']
    row['dataset'] = "{}:{}".format(language, train_size)
    return row

# Row-wise apply builds a new frame that carries the extra 'dataset' column;
# rebind `luong` to it.
luong = luong.apply(extract_dataset_name, axis=1)

In [44]:
# Flag runs by substring: 'reverse' in the training file path, '_new' in the
# experiment directory.
luong["reverse"] = luong["train_file"].str.contains("reverse")
luong["new"] = luong["experiment_dir"].str.contains("_new")

# Number of runs in each (reverse, new) combination.
luong.groupby(['reverse', 'new']).size()

reverse  new  
False    False    728
         True     585
True     False    582
         True     262
dtype: int64

# Is on-the-fly padding better than global padding?

On-the-fly padding: pad every sample to the length of the longest sample in the current batch.

Global padding: pad every sample to the length of the longest sample in the whole dataset.


On-the-fly padding is much better if the target is not reversed. It is slightly better if the target is reversed.

In [49]:
# Best dev accuracy per (language, reverse, new), summarised across languages.
# Select 'dev_acc' before aggregating: max() over every column is wasted work
# and can raise on object columns whose values are not orderable.
luong.groupby(['language', 'reverse', 'new'])['dev_acc'].max().unstack(['reverse', 'new']).describe()

reverse,False,False,True,True
new,False,True,False,True
count,103.0,103.0,103.0,93.0
mean,0.714058,0.763748,0.744194,0.748538
std,0.355039,0.348587,0.343788,0.354439
min,0.0,0.0,0.0,0.0
25%,0.587,0.8065,0.7735,0.78
50%,0.88,0.906,0.87,0.892
75%,0.964,0.9705,0.964,0.972
max,1.0,1.0,1.0,1.0


In [53]:
# Runs with reverse == False and new == True, then the best dev accuracy per
# language, highest first.
df = luong[luong['reverse'].eq(False) & luong['new'].eq(True)]
df.groupby('language')['dev_acc'].max().sort_values(ascending=False)

language
haida                 1.000
tatar                 1.000
pashto                1.000
occitan               1.000
uzbek                 1.000
crimean-tatar         1.000
neapolitan            1.000
swahili               1.000
kabardian             1.000
bashkir               0.999
venetian              0.998
khaling               0.997
urdu                  0.996
adyghe                0.995
middle-french         0.994
persian               0.994
friulian              0.990
albanian              0.990
basque                0.984
quechua               0.982
galician              0.982
yiddish               0.980
maltese               0.980
estonian              0.980
zulu                  0.980
hebrew                0.971
georgian              0.970
classical-syriac      0.970
lower-sorbian         0.969
italian               0.967
                      ...  
romanian              0.831
greek                 0.829
old-english           0.823
faroese               0.812
lithuanian 