In [80]:
# TODO: convert to .py script 

import json
import os
import pickle as pkl
import random
from collections import defaultdict

import numpy as np
import pandas as pd

from bert import extract_features_manifolds as extract_features
from bert import tokenization

## Parameters

In [81]:
# Experiment selection: which pruned checkpoint to analyse and which
# experiment number (it picks the POS-tag list and the sample file below).
prune_percent = 0
experiment_num = 1

# Paths are assembled with os.path.join rather than hard-coded "\\" separators:
# on Windows the resulting strings are identical, on other platforms they now
# name real nested directories instead of one directory containing backslashes.
# NOTE(review): the evaluation cell at the end of the notebook reads from
# "../models/..." while this uses ".models" - confirm which location is intended.
model_dir = os.path.join(".models", f"bert-prune-{prune_percent}-squad")
experiment_dir = os.path.join("experiments", f"prune_{prune_percent}_exp_{experiment_num}")

# File names looked up inside the experiment's "input" directory
input_pkl_filename = "FINAL_Q.pkl"
relevant_pos_filename = f"relevant_pos_tags_{experiment_num}.txt"
sample_filename = f"sample_seed_{experiment_num}.pkl"

# Number of layers features are extracted for (layers 0..num_hidden_layers-1)
num_hidden_layers = 12

# Model directory must contain the following files:
#     - bert_config.json
#     - vocab.txt
#     - model.ckpt.data-00000-of-00001
#     - model.ckpt.index
#     - model.ckpt.meta
#     - checkpoint
# 
# "checkpoint" file contains the following line:
#     model_checkpoint_path: "model.ckpt"

## Load and Restructure Inputs

In [82]:
# One sub-folder of the experiment directory per pipeline stage
inputs_dir, intermediate_dir, features_dir, mftma_dir = (
    os.path.join(experiment_dir, stage_name)
    for stage_name in ("input", "intermediate", "features", "mftma-analysis")
)

# Create whichever of them are still missing (existing ones are left alone)
for stage_dir in (inputs_dir, intermediate_dir, features_dir, mftma_dir):
    os.makedirs(stage_dir, exist_ok=True)

In [83]:
# Load the data
input_pkl_file = os.path.join(inputs_dir, input_pkl_filename)
sample_pkl_file = os.path.join(inputs_dir, sample_filename)

# Every file is opened in a `with` block so the handle is closed right after
# reading (the previous bare open(...) calls left all three handles open).
#
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by this project / a trusted source.

# Tagged examples; downstream code reads ex[0], ex[1] (joined into a key) and
# ex[2] (a sequence of (word, tag) pairs) from each entry.
with open(input_pkl_file, 'rb') as input_fp:
    tagged_data_obj = pkl.load(input_fp)

# One POS tag per line
with open(os.path.join(inputs_dir, relevant_pos_filename), 'r') as pos_fp:
    relevant_pos = pos_fp.read().splitlines()

# Sample selection; downstream code indexes it as
# line_word_tag_map[line_idx][word_idx] -> tag
with open(sample_pkl_file, 'rb') as sample_fp:
    line_word_tag_map = pkl.load(sample_fp)

In [84]:
# Three parallel lists, one entry per tagged example:
#   INPUT_examples  - {"words": [...], "tags": [...]} (both lower-cased)
#   INPUT_sentences - the words joined by single spaces
#   INPUT_key_tag   - "<ex[0]>^<ex[1]>"
INPUT_examples = []
INPUT_sentences = []
INPUT_key_tag = []

for example in tagged_data_obj:
    # Keep only proper (word, tag) pairs; untagged entries (apparently just
    # empty strings) do not have length 2 and are dropped.
    tagged_pairs = [pair for pair in example[2] if len(pair) == 2]
    lowered_words = [pair[0].lower() for pair in tagged_pairs]
    lowered_tags = [pair[1].lower() for pair in tagged_pairs]

    INPUT_examples.append({"words": lowered_words, "tags": lowered_tags})
    INPUT_sentences.append(" ".join(lowered_words))
    INPUT_key_tag.append(f"{example[0]}^{example[1]}")

print(f"Found {len(INPUT_examples)} examples")

# (Limit during development to save time)
#limit_examples = 400 
#INPUT_examples = INPUT_examples[:limit_examples]
#INPUT_sentences = INPUT_sentences[:limit_examples]
#INPUT_key_tag = INPUT_key_tag[:limit_examples]

Found 398 examples


In [85]:
# Dump the input sentences, one per line, into the intermediate directory
sentences_file = os.path.join(intermediate_dir, "bert_input_sentences.txt")
with open(sentences_file, "w", encoding="utf-8") as sentences_fp:
    sentences_fp.writelines(f"{text}\n" for text in INPUT_sentences)

## Extract Features

In [86]:
# The model directory doubles as the checkpoint location handed to the extractor
init_checkpoint = model_dir
vocab_file = os.path.join(model_dir, "vocab.txt")
bert_config_file = os.path.join(model_dir, "bert_config.json")

# 0-based layer indices, also rendered as a comma-separated string
layers = range(num_hidden_layers)
layers_str = ",".join(map(str, layers))

# Destination of the extracted features
features_file = os.path.join(intermediate_dir, "bert_features.json")

In [87]:
# Run the extractor on the sentences stored in sentences_file; the resulting
# features are recorded in features_file.
extraction_args = dict(
    input_file=sentences_file,
    output_file=features_file,
    bert_config_file=bert_config_file,
    init_checkpoint=init_checkpoint,
    vocab_file=vocab_file,
    layers=layers_str,
)
extract_features.extract(**extraction_args)

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] which mountain range influenced the split of the regions ? [SEP]
INFO:tensorflow:input_ids: 101 2029 3137 2846 5105 1996 3975 1997 1996 4655 1029 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## Restructure Features

In [88]:
# Read the saved features back: the file holds one JSON document per line,
# and each line becomes one entry of features_jsons.
# NOTE(review): the file is read with the platform default encoding while the
# sentences file is written as UTF-8 - confirm the extractor emits ASCII-safe JSON.
with open(features_file, 'r') as features_fp:
    features_jsons = [json.loads(json_line) for json_line in features_fp]
    
# Getting feature vectors out of features_jsons:
# features_jsons[i]['features'][j]['layers'][k]['values']
# i = example
# j = token
# k = layer    

In [89]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

# Layers are numbered 1..num_hidden_layers here (mftma encoding); layer L maps
# to position L-1 of each token's 'layers' list in the extracted features.
layer_ids = range(1, num_hidden_layers + 1)

# manifold_vectors[layer][tag] ends up as a 2-D array with one column per
# sampled word carrying that tag, or stays None if no such word was sampled.
manifold_vectors = defaultdict(dict)
for tag in relevant_pos:
    tag = tag.strip().lower()
    for layer in layer_ids:
        manifold_vectors[layer][tag] = None

# Column vectors are gathered in plain lists and stacked once at the end;
# calling np.hstack inside the loop re-copies the whole accumulated array on
# every append (quadratic in the number of samples).
vector_columns = defaultdict(lambda: defaultdict(list))

# Relying on the feature extraction above running on ALL lines -> indexing preserved
for line_idx, (features_dict, example) in enumerate(zip(features_jsons, INPUT_examples)):
    if line_idx not in line_word_tag_map:
        continue
    sampled_words = line_word_tag_map[line_idx]

    # Map every wordpiece position to the index of the word it came from.
    # -1 in the first and last position stands for the CLS and SEP tokens.
    # This depends only on the sentence, so it is computed once per line
    # instead of once per sampled word.
    # NOTE(review): assumes the extractor did not truncate the sentence;
    # otherwise positions near the end would not line up - confirm.
    split_word_idx = [-1]
    for split_id, split_word in enumerate(example['words']):
        split_word_idx.extend([split_id] * len(tokenizer.tokenize(split_word)))
    split_word_idx.append(-1)
    split_word_idx = np.array(split_word_idx)

    for word_idx in sampled_words:
        tag = sampled_words[word_idx]

        vector_idcs = np.flatnonzero(split_word_idx == word_idx)
        if vector_idcs.size == 0:
            # No wordpiece belongs to this word: averaging nothing would yield
            # a (1, 1) NaN "vector" that corrupts or breaks the stacking below.
            print(f"Skipping word {word_idx} of line {line_idx}: no wordpiece tokens found")
            continue
        tokens_features = [features_dict['features'][i]['layers'] for i in vector_idcs]

        for layer in layer_ids:
            tokens_layer_features = [token_features[layer - 1]['values']
                                     for token_features in tokens_features]
            # Take the mean over the wordpieces of the same word so the word is
            # represented by a single column vector
            token_vector = np.mean(tokens_layer_features, axis=0).reshape(-1, 1)
            vector_columns[layer][tag].append(token_vector)

# Stack the gathered columns; tags are visited in first-seen order, so tags
# missing from relevant_pos are appended to manifold_vectors[layer] in the
# same order as before.
for layer, columns_by_tag in vector_columns.items():
    for tag, columns in columns_by_tag.items():
        manifold_vectors[layer][tag] = np.hstack(columns)
                    

## Store Feature Arrays

In [90]:
# Write one pickle per layer ("<layer>.pkl" in features_dir). Each file holds a
# list with one entry per tag, in the insertion order of manifold_vectors[layer];
# an entry is None for a tag that received no vectors.
for layer in range(1, num_hidden_layers + 1):
    layer_pkl_file = os.path.join(features_dir, str(layer) + '.pkl')
    # `with` guarantees the file is flushed and closed; the previous inline
    # open(...) handed to pkl.dump was never closed explicitly.
    with open(layer_pkl_file, 'wb') as layer_fp:
        pkl.dump(list(manifold_vectors[layer].values()), layer_fp)

In [91]:
# Shell command that produces the mftma files for this experiment; it is only
# displayed here (as the cell's value), not executed.
n_vectors = 200
n_reps = 1
mftma_command = f"python mftma_analysis.py --feature_dir {features_dir} --mftma_analysis_dir {mftma_dir} --n_t {n_vectors} --n_reps {n_reps}"
mftma_command

'python mftma_analysis.py --feature_dir experiments\\prune_70_exp_5\\features --mftma_analysis_dir experiments\\prune_70_exp_5\\mftma-analysis --n_t 200 --n_reps 1'

In [92]:
# Shell command that renders the plots from the mftma output; displayed only,
# not executed.
plot_command = f"python generate_plot.py --mftma_analysis_dir {mftma_dir}"
plot_command

# python generate_plot.py --mftma_analysis_dir experiments\\prune_60_exp_3\\mftma-analysis
# python generate_plot.py --mftma_analysis_dir experiments\\prune_70_exp_3\\mftma-analysis
# python generate_plot.py --mftma_analysis_dir experiments\\prune_60_exp_4\\mftma-analysis


'python generate_plot.py --mftma_analysis_dir experiments\\prune_70_exp_5\\mftma-analysis'

In [1]:
# NOTE(review): `possible_answers_df` is only assigned inside the evaluation
# loop of the following cell, so on a fresh kernel this cell raises NameError
# (see the recorded output). Run it after that cell, or move it below it.
possible_answers_df

NameError: name 'possible_answers_df' is not defined

In [3]:
# Exact-match accuracy of each pruned model on the answerable questions.
# NOTE(review): `answers_df` is not defined anywhere in this notebook; it is
# presumably a frame indexed by question id with `is_impossible` and
# `answers_lists` columns - confirm and load it in an earlier cell.
prune_percs = [30,40,50]
for perc in prune_percs:
    with open(f'../models/bert-prune-{perc}-squad/predictions.json', 'r') as preds_file:
        predictions_json = json.load(preds_file)

    # Keys of the JSON become the index; the predicted text lands in column 0
    predictions_df = pd.DataFrame.from_dict(predictions_json, orient='index')
    ans_pred_df = answers_df.join(predictions_df, how='left')

    # All entries with is_impossible = False. `.copy()` makes this an
    # independent frame, so adding a column below neither triggers
    # SettingWithCopyWarning nor depends on view/copy semantics.
    possible_answers_df = ans_pred_df[~ans_pred_df['is_impossible']].copy()

    # Check if the prediction is in the list of possible answers.
    # `.loc[0]` selects the prediction column by label explicitly instead of
    # relying on how `row[0]` resolves an integer key.
    possible_answers_df['correct'] = possible_answers_df.apply(
        lambda row: row.loc[0] in row.answers_lists, axis=1)

    # Kept for inspection: counts of correct / incorrect predictions
    results = possible_answers_df['correct'].value_counts()

    # Accuracy is the share of True values. The previous
    # results[0]/(results[0]+results[1]) indexed a boolean-labelled Series with
    # integers, which (depending on the pandas version) yields the majority
    # class share or the error rate, and fails when only one class is present.
    accuracy = possible_answers_df['correct'].mean()
    print(f"{perc} pruned squad v2 test performance: {accuracy}")
    

FileNotFoundError: [Errno 2] No such file or directory: '../models/bert-prune-30-squad/predictions.json'