In [1]:
# TODO: convert to .py script 

import pickle as pkl
import json
import os
import numpy as np
from bert import extract_features_manifolds as extract_features

## Parameters

In [7]:
# Paths are assembled with os.path.join so the notebook resolves the same
# directories on Windows and on POSIX systems (a literal backslash is only a
# path separator on Windows).
model_dir = os.path.join("models", "bert-prune-30-squad")
experiment_dir = os.path.join("experiments", "bert_experiment_test")
input_pkl_filename = "train_questions_final.pkl"
relevant_pos_filename = "relevant_pos.txt"

# Upper bound on the number of feature vectors kept per POS tag when the
# feature arrays are stored.
max_manifold_size = 50

# Model directory must contain the following files:
#     - bert_config.json
#     - vocab.txt
#     - model.ckpt.data-00000-of-00001
#     - model.ckpt.index
#     - model.ckpt.meta
#     - checkpoint
# 
# "checkpoint" file contains the following line:
#     model_checkpoint_path: "model.ckpt"

## Load and Restructure Inputs

In [8]:
# Sub-directories of the experiment directory, one per pipeline stage
inputs_dir, intermediate_dir, features_dir, mftma_dir = (
    os.path.join(experiment_dir, stage_name)
    for stage_name in ("input", "intermediate", "features", "mftma-analysis")
)

# Create every stage directory that is missing (existing ones are left alone)
for stage_dir in (inputs_dir, intermediate_dir, features_dir, mftma_dir):
    os.makedirs(stage_dir, exist_ok=True)

In [9]:
# Load the data
input_pkl_file = os.path.join(inputs_dir, input_pkl_filename)
relevant_pos_file = os.path.join(inputs_dir, relevant_pos_filename)

# Context managers close both files as soon as they have been read.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open(input_pkl_file, 'rb') as pkl_fp:
    tagged_data_obj = pkl.load(pkl_fp)

# Each line of the file becomes one entry of relevant_pos
with open(relevant_pos_file, 'r') as pos_fp:
    relevant_pos = pos_fp.read().splitlines()

In [4]:
# Three parallel lists, one entry per example:
#   INPUT_examples  -> {"words": [...], "tags": [...]}
#   INPUT_sentences -> the words joined by single spaces
#   INPUT_key_tag   -> "<ex[0]>^<ex[1]>"
INPUT_examples, INPUT_sentences, INPUT_key_tag = [], [], []

for record in tagged_data_obj:
    # Keep only entries that are proper (word, tag) pairs; some entries are
    # untagged (looks like only empty strings) and are dropped
    tagged_pairs = [pair for pair in record[2] if len(pair) == 2]
    token_words = [pair[0] for pair in tagged_pairs]
    token_tags = [pair[1] for pair in tagged_pairs]
    joined_sentence = " ".join(token_words)

    INPUT_examples.append({"words": token_words, "tags": token_tags})
    INPUT_sentences.append(joined_sentence)
    INPUT_key_tag.append(f"{record[0]}^{record[1]}")

# Reported before the development limit below is applied
print(f"Found {len(INPUT_examples)} examples")


# (Limit during development to save time)
limit_examples = 400
INPUT_examples, INPUT_sentences, INPUT_key_tag = (
    INPUT_examples[:limit_examples],
    INPUT_sentences[:limit_examples],
    INPUT_key_tag[:limit_examples],
)

Found 1079 examples


In [5]:
# Text file that receives the input sentences
sentences_file = os.path.join(intermediate_dir, "bert_input_sentences.txt")

# One sentence per line, UTF-8 encoded
with open(sentences_file, "w", encoding="utf-8") as sentences_fp:
    sentences_fp.writelines(f"{text}\n" for text in INPUT_sentences)

## Extract Features

In [6]:
# Locations of the model files inside the model directory
bert_config_file, vocab_file = (
    os.path.join(model_dir, model_filename)
    for model_filename in ("bert_config.json", "vocab.txt")
)
# The checkpoint is given as the model directory itself
init_checkpoint = model_dir

# Comma-separated indices of all layers: "0,1,...,num_layers-1"
num_layers = 12
layers = range(num_layers)
layers_str = ",".join(map(str, layers))

# File the extracted features are written to
features_file = os.path.join(intermediate_dir, "bert_features.json")

In [None]:
# Run the feature extraction on the sentences stored in sentences_file and
#   record the result in features_file
extraction_args = {
    "input_file": sentences_file,
    "output_file": features_file,
    "bert_config_file": bert_config_file,
    "init_checkpoint": init_checkpoint,
    "vocab_file": vocab_file,
    "layers": layers_str,
}
extract_features.extract(**extraction_args)

## Restructure Features

In [21]:
# Read the saved features back: every line of the file is one JSON document
with open(features_file, 'r') as features_fp:
    features_jsons = [json.loads(json_line) for json_line in features_fp]

# How to reach a feature vector inside features_jsons:
#   features_jsons[i]['features'][j]['layers'][k]['values']
#   i = example
#   j = token
#   k = layer

In [22]:
# TODO: Limit the number of feature vectors per pos tag

# tag -> relevant_pos index mapping
tag_idx_map = {tag: i for i, tag in enumerate(relevant_pos)}


# Generate data structure
# -> list for each layer
# -> list for each pos tag per layer
# -> list of feature vectors per pos tag per layer
# features_structure[layer][pos_tag] = [feature vector, ...]
features_structure = [[[] for _ in relevant_pos] for _ in range(num_layers)]


# Walk over the extracted features together with the matching input example
# and file every relevant token's feature vector under its layer and tag.
# NOTE(review): tokens and tags are paired purely by position via zip, and zip
#   stops silently at the shorter sequence. This assumes the extractor emits
#   exactly one feature entry per input word, in order - TODO confirm.
for features_dict, example in zip(features_jsons, INPUT_examples):
    for token_features, tag in zip(features_dict['features'], example['tags']):
        # A single dict lookup both tests relevance and yields the tag index
        tag_idx = tag_idx_map.get(tag)
        if tag_idx is None:
            continue
        for layer_number, layer_features in enumerate(token_features['layers']):
            features_structure[layer_number][tag_idx].append(layer_features['values'])

                

## Store Feature Arrays

In [24]:
# Convert the inner lists to numpy arrays of the same length.
# Every tag keeps its first arr_size feature vectors, where arr_size is the
# smallest per-tag count, capped at max_manifold_size.

# Count the number of feature vectors per pos tag for the first layer
# This is the same for all layers
arr_size = min(
    min(len(tag_examples) for tag_examples in features_structure[0]),
    max_manifold_size,
)


# One pickle file per layer; file names are numbered starting at 1
for layer_number, tag_features in enumerate(features_structure, 1):
    # Transposed so that each kept feature vector becomes a column
    layer_tag_arrays = [np.array(tag_vectors[:arr_size]).T for tag_vectors in tag_features]
    # Record to file; the context manager flushes and closes the file
    layer_file = os.path.join(features_dir, f"{layer_number}.pkl")
    with open(layer_file, 'wb') as layer_fp:
        pkl.dump(layer_tag_arrays, layer_fp)

In [None]:
# Command to generate mftma files
# python mftma_analysis.py --feature_dir experiments/bert_experiment_test/features --mftma_analysis_dir experiments/bert_experiment_test/mftma-analysis  