In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from collections import defaultdict
import pickle
import torch

## Combine model outputs for vote ensemble

In [5]:
# Collect the per-model artifacts (features, results, tokenizer) used by the
# vote ensemble.
root_dir = './save/output'

# Keep only sub-directories: other cells of this notebook write pickle files
# directly into root_dir, and treating one of those as a model folder makes
# the config.json lookup below fail when this cell is re-run.
dirs = sorted(
    entry for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
)

print(dirs)


features_train = []
features_dev = []

all_results_train = []
all_results_dev = []

tokenizers_train = []
tokenizers_dev = []

# Destination lists for each run type recorded in config.json.
outputs_by_type = {
    'train': (features_train, all_results_train, tokenizers_train),
    'dev': (features_dev, all_results_dev, tokenizers_dev),
}

for model_dir in dirs:
    with open(os.path.join(root_dir, model_dir, 'config.json')) as f:
        config = json.load(f)
    model_name = config['model_name'].replace('-dev', '').replace('-train', '').replace('-01', '')

    # Train runs are skipped in this cell, so the *_train lists stay empty here.
    if config['type'] == 'train':
        continue
    print(model_name, config['type'])

    with open(os.path.join(root_dir, model_dir, 'features.pkl'), 'rb') as f:
        features = pickle.load(f)

    with open(os.path.join(root_dir, model_dir, 'all_results.pkl'), 'rb') as f:
        all_results = pickle.load(f)

    with open(os.path.join(root_dir, model_dir, 'tokenizer.pkl'), 'rb') as f:
        tokenizer = pickle.load(f)

    # Runs whose type is neither 'train' nor 'dev' are loaded but not kept.
    if config['type'] in outputs_by_type:
        feature_list, result_list, tokenizer_list = outputs_by_type[config['type']]
        feature_list.append(features)
        result_list.append(all_results)
        tokenizer_list.append(tokenizer)
        

['albert-large-v2-dev', 'albert-large-v2-train', 'albert-xxlarge-v1-dev', 'albert-xxlarge-v1-train', 'bert-large-dev', 'bert-large-train', 'roberta-large-dev', 'roberta-large-dev2', 'roberta-large-train', 'roberta-large-train2']
albert-large-v2 dev
albert-xxlarge-v1 dev
bert-large dev
roberta-large dev
roberta-large dev


In [7]:
# Bundle the three dev-split lists into a single pickle under root_dir.
dev_to_save = [features_dev, all_results_dev, tokenizers_dev]
dev_pickle_path = os.path.join(root_dir, 'saved_data_dev.pkl')
with open(dev_pickle_path, 'wb') as out_file:
    pickle.dump(dev_to_save, out_file)

In [3]:
def load_saved(ensemble_dir, evaluate):
    """Unpickle the saved ensemble inputs for one split.

    Args:
        ensemble_dir: Directory holding the ``saved_data_*.pkl`` files.
        evaluate: If truthy, read ``saved_data_dev.pkl``; otherwise read
            ``saved_data_train.pkl``.

    Returns:
        The object that was pickled into the selected file.
    """
    pickle_name = 'saved_data_dev.pkl' if evaluate else 'saved_data_train.pkl'
    with open(os.path.join(ensemble_dir, pickle_name), 'rb') as pickle_file:
        return pickle.load(pickle_file)
# Directory that holds the previously pickled ensemble inputs.
ensemble_dir = './output/saved_data_3'
# evaluate=False selects the train pickle, evaluate=True the dev pickle.
(features_train,
 all_results_train,
 tokenizers_train) = load_saved(ensemble_dir, evaluate=False)
(features_dev,
 all_results_dev,
 tokenizers_dev) = load_saved(ensemble_dir, evaluate=True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
# Beware of OOM issues when pickling the train split.
# The list is built inline rather than bound to a separate name first.
train_pickle_path = os.path.join(root_dir, 'saved_data_train.pkl')
with open(train_pickle_path, 'wb') as out_file:
    pickle.dump([features_train, all_results_train, tokenizers_train], out_file)

## Combine features for training

In [3]:
# Gather features, results and tokenizers of the runs whose features are
# combined for training.
root_dir = './save/output'

# Run folders to load, in the order they are appended to the lists below.
# (The folder list is fixed here, so root_dir is not scanned.)
dirs = [
    'albert-xxlarge-v1-dev', 'albert-xxlarge-v1-train',
    'albert-large-v2-dev', 'albert-large-v2-train',
]
print(dirs)

features_train = []
features_dev = []

all_results_train = []
all_results_dev = []

tokenizers_train = []
tokenizers_dev = []

for model_dir in dirs:
    with open(os.path.join(root_dir, model_dir, 'config.json')) as f:
        config = json.load(f)
    model_name = config['model_name'].replace('-dev', '').replace('-train', '').replace('-01', '')

    print(model_name, config['type'])

    with open(os.path.join(root_dir, model_dir, 'features.pkl'), 'rb') as f:
        features = pickle.load(f)

    with open(os.path.join(root_dir, model_dir, 'all_results.pkl'), 'rb') as f:
        all_results = pickle.load(f)

    with open(os.path.join(root_dir, model_dir, 'tokenizer.pkl'), 'rb') as f:
        tokenizer = pickle.load(f)

    # Route the loaded artifacts by the run type stored in config.json;
    # any other type is loaded but not kept.
    if config['type'] == 'train':
        features_train.append(features)
        all_results_train.append(all_results)
        tokenizers_train.append(tokenizer)
    elif config['type'] == 'dev':
        features_dev.append(features)
        all_results_dev.append(all_results)
        tokenizers_dev.append(tokenizer)
        

['albert-xxlarge-v1-dev', 'albert-xxlarge-v1-train', 'albert-large-v2-dev', 'albert-large-v2-train']
albert-xxlarge-v1 dev


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


albert-xxlarge-v1 train
albert-large-v2 dev
albert-large-v2 train


In [5]:
# Write one tokenizer pickle per split into root_dir.
tokenizer_dumps = (
    ('combined_tokenizers_train.pkl', tokenizers_train),
    ('combined_tokenizers_dev.pkl', tokenizers_dev),
)
for pickle_name, split_tokenizers in tokenizer_dumps:
    with open(os.path.join(root_dir, pickle_name), 'wb') as out_file:
        pickle.dump(split_tokenizers, out_file)

In [4]:
from collections import OrderedDict
def align_features_and_results(features_train, all_results_train, target_model_index=0, rng=None):
    """Align every model's features/results with the target model's features.

    The per-model feature lists can differ in length (one example may yield a
    different number of features per model).  The target model's features are
    kept as they are; for every other model, each target feature is paired
    with a feature of the same ``example_index`` drawn at random (with
    replacement) from that model's features.

    Args:
        features_train: One sequence of features per model; each feature must
            expose an ``example_index`` attribute.
        all_results_train: One sequence of results per model, indexed in
            parallel with that model's features.
        target_model_index: Index of the model whose feature order defines
            the output order.
        rng: Optional object with a ``choice(a, size)`` method (for example
            ``np.random.RandomState(seed)``) used for the random draws so a
            run can be reproduced.  Defaults to the global ``np.random``.

    Returns:
        ``(final_features, final_results)``: two lists with one entry per
        model, each entry holding as many items as the target model has
        features.

    Raises:
        KeyError: If a non-target model has no feature for an example index
            that is present in the target model.
    """
    sampler = np.random if rng is None else rng

    # Per model: example_index -> positions of that example's features,
    # keyed in order of first appearance.
    positions_by_example = [OrderedDict() for _ in features_train]
    for model_idx, features in enumerate(features_train):
        for position, feat in enumerate(features):
            positions_by_example[model_idx].setdefault(feat.example_index, []).append(position)

    target_positions = positions_by_example[target_model_index]
    n_target = len(features_train[target_model_index])

    # mapping[m][i] is the position, within model m's own lists, of the item
    # paired with the target model's i-th feature.
    mapping = [[None] * n_target for _ in features_train]
    # The target model maps onto itself: 0, 1, ..., n_target - 1.
    mapping[target_model_index] = np.arange(n_target)

    for example_index, slots in target_positions.items():
        n_feat = len(slots)  # > 1 if the example was split into several features
        for model_idx in range(len(features_train)):
            if model_idx == target_model_index:
                continue
            try:
                candidates = positions_by_example[model_idx][example_index]
            except KeyError:
                raise KeyError(
                    'model %d has no feature for example_index %r'
                    % (model_idx, example_index)
                ) from None
            picks = sampler.choice(candidates, n_feat)
            # Store each pick at the slot of the target feature it pairs with,
            # so the alignment also holds when a target example's features
            # are not stored contiguously.
            for slot, pick in zip(slots, picks):
                mapping[model_idx][slot] = pick

    final_train_features = [
        [features_train[model_idx][index] for index in mapping[model_idx]]
        for model_idx in range(len(features_train))
    ]
    final_all_results = [
        [all_results_train[model_idx][index] for index in mapping[model_idx]]
        for model_idx in range(len(features_train))
    ]

    len_feats = np.array([len(items) for items in final_train_features])
    len_results = np.array([len(items) for items in final_all_results])
    # Plain string messages: `assert cond, print(...)` would attach None to
    # the AssertionError because print() returns None.
    assert np.all(len_feats == len_feats[0]), 'final_train_features error'
    assert np.all(len_results == len_results[0]), 'final_all_results error'
    return final_train_features, final_all_results

# Align both splits against the default target model (index 0).
# Picks for the other models are drawn with np.random.choice, so the result
# varies between runs unless the global NumPy seed is fixed beforehand.
final_features_train, final_all_results_train = align_features_and_results(features_train, all_results_train)
final_features_dev, final_all_results_dev = align_features_and_results(features_dev, all_results_dev)

In [5]:
# Peek at the first ten aligned features of the first model.
features1 = final_features_train[0]
preview = features1[:10]

for attr_name in ('start_position', 'end_position', 'example_index'):
    print([getattr(feat, attr_name) for feat in preview])

[77, 70, 146, 60, 80, 97, 138, 104, 79, 87]
[81, 72, 150, 62, 82, 100, 140, 106, 81, 88]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [14]:
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
# Read the training examples from data/train-v2.0.json.
processor = SquadV2Processor()
examples = processor.get_train_examples(None, filename='data/train-v2.0.json')

100%|██████████| 442/442 [00:33<00:00, 13.08it/s]


In [7]:
# Start and end positions of the first ten examples.
first_examples = examples[:10]
print([example.start_position for example in first_examples])
print([example.end_position for example in first_examples])

NameError: name 'examples' is not defined

In [6]:
# First ten aligned features of the second model: start positions and
# example indices.
features2 = final_features_train[1]
preview2 = features2[:10]
print([feat.start_position for feat in preview2])
print([feat.example_index for feat in preview2])


[77, 70, 146, 60, 80, 97, 138, 104, 79, 87]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [17]:
# Persist the aligned train features and results as two pickles in root_dir.
train_dumps = (
    ('combined_features_train.pkl', final_features_train),
    ('combined_all_results_train.pkl', final_all_results_train),
)
for pickle_name, payload in train_dumps:
    with open(os.path.join(root_dir, pickle_name), 'wb') as out_file:
        pickle.dump(payload, out_file)


In [None]:
# Persist the aligned dev features and results.
# Dev artifacts get their own *_dev.pkl names so they never overwrite the
# train pickles, and the results file receives the results (not a second
# copy of the features).
with open(os.path.join(root_dir, 'combined_features_dev.pkl'), 'wb') as f:
    pickle.dump(final_features_dev, f)
with open(os.path.join(root_dir, 'combined_all_results_dev.pkl'), 'wb') as f:
    pickle.dump(final_all_results_dev, f)


145816

131944

In [24]:
# Inspect the first model's entries in features_train / all_results_train.
features, all_result = features_train[0], all_results_train[0]
print(len(all_result))

131944


In [10]:
# Build the index tensor 0..N-1, where N is the size of all_input_ids along
# its first dimension.
# NOTE(review): all_input_ids is not defined in any cell visible here —
# confirm where it comes from before relying on this cell.
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

In [11]:
# Display the index tensor.
all_example_index

tensor([     0,      1,      2,  ..., 131941, 131942, 131943])

In [25]:
# Inspect the second model's entries in features_train / all_results_train.
features2, all_result2 = features_train[1], all_results_train[1]
print(len(all_result2))
# One example_index per feature, shown as a long tensor.
torch.tensor([feat.example_index for feat in features2], dtype=torch.long)

163591


tensor([     0,      1,      2,  ..., 130316, 130317, 130318])

In [20]:
# Number of entries in features2 (assigned in the cell above).
len(features2)

163591

## Output for Pure Start End Scores

In [3]:

# Read every file in ./runs into a DataFrame; `names` and `dfs` are parallel.
# The path lives in `runs_dir` so the builtin `dir` is not shadowed.
runs_dir = './runs'

# Sorted so the run order (and with it the order in which the runs are
# plotted) does not depend on the arbitrary order returned by os.listdir.
names = sorted(os.listdir(runs_dir))
dfs = [pd.read_csv(os.path.join(runs_dir, filename)) for filename in names]


In [None]:
# Draw the 'Value' column of every run against its 'Step' column on one
# shared set of axes.
fig, ax = plt.subplots()
fig.set_size_inches(14, 6)
ax.set(xlabel='Steps', ylabel='Eval F1', title='Eval F1 vs. steps')
ax.grid()

for run_name, run_df in zip(names, dfs):
    # Legend label: the text before the first '-0', minus its first four characters.
    label = run_name.split('-0')[0][4:]
    plt.plot(run_df['Step'], run_df['Value'], label=label)
ax.legend()
axes = plt.gca()
plt.yticks(np.arange(64, 92, 2))
# To keep the figure on disk: fig.savefig("eval_f1_runs.png")

In [2]:
# Load every run's model_output.json into d[model_name][run_type].
root_dir = './save/output'

# Inner mappings are plain dicts: a missing run type should raise KeyError
# rather than be created silently.
d = defaultdict(dict)

# NOTE(review): os.listdir returns entries in arbitrary order, and that order
# decides the order of the models in `d`. It is left as-is on purpose so the
# model order of anything built from `d` is not changed silently.
for model_dir in os.listdir(root_dir):
    # Skip plain files: other cells of this notebook write pickle / HDF5
    # files directly into root_dir.
    if not os.path.isdir(os.path.join(root_dir, model_dir)):
        continue
    with open(os.path.join(root_dir, model_dir, 'model_output.json')) as f:
        x = json.load(f)
    model_name = x['model_name'].replace('-dev', '').replace('-train', '').replace('-01', '')
    d[model_name][x['type']] = np.array(x['output'])
    # Drop the parsed JSON before the next iteration loads another file.
    del x
    



In [3]:
# Split the per-model outputs into two parallel lists, one per run type.
train, dev = [], []
for model_key, outputs_by_type in d.items():
    train.append(outputs_by_type['train'])
    dev.append(outputs_by_type['dev'])
    print(model_key)
    
    


albert-xxlarge-v1
albert-large-v2
roberta-large
bert-large


In [4]:
# Stack the per-model train arrays, then swap the first two axes so the
# model axis comes second.
np_train = np.transpose(np.array(train), (1, 0, 2, 3))
np_train.shape

(130319, 4, 2, 256)

In [5]:
# Stack the per-model dev arrays, then swap the first two axes so the
# model axis comes second.
np_dev = np.transpose(np.array(dev), (1, 0, 2, 3))
np_dev.shape

(6078, 4, 2, 256)

In [8]:
import h5py
# Store both stacked arrays as datasets of one HDF5 file in root_dir.
h5_path = os.path.join(root_dir, 'model_output.h5')
with h5py.File(h5_path, 'w') as h5_file:
    for dataset_name, array in (("model_output_train", np_train), ("model_output_dev", np_dev)):
        h5_file.create_dataset(dataset_name, data=array)

In [10]:
# Read both datasets back into memory and report their shapes.
h5_path = os.path.join(root_dir, 'model_output.h5')
with h5py.File(h5_path, 'r') as h5_file:
    data_train = h5_file['model_output_train'][:]
    data_dev = h5_file['model_output_dev'][:]
    print(data_train.shape, data_dev.shape)

(130319, 4, 2, 256) (6078, 4, 2, 256)
