In [2]:
from collections import defaultdict
import itertools
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from multiprocessing import Pool

from stats_count import *
from grab_weights import grab_attention_weights, text_preprocessing

import os
import shutil
import timeit
import ripser_count

import localfuncs

In [3]:
!python --version

Python 3.8.8


In [4]:
!nvcc --version

/bin/bash: nvcc: command not found


In [5]:
import warnings

warnings.filterwarnings('ignore')

In [6]:
!env | grep CUDA_VISIBLE

## Parameters

In [7]:
np.random.seed(42) # For reproducibility.

In [8]:
max_tokens_amount  = 128 # The number of tokens to which the tokenized text is truncated / padded.
stats_cap          = 500 # Max value that the feature can take. Is NOT applicable to Betty numbers.
    
layers_of_interest = [i for i in range(12)]  # Layers for which attention matrices and features on them are 
                                             # calculated. For calculating features on all layers, leave it be
                                             # [i for i in range(12)].
stats_name = "s_e_v_c_b0b1" # The set of topological features that will be count (see explanation below)

thresholds_array = [0.025, 0.05, 0.1, 0.25, 0.5, 0.75] # The set of thresholds
thrs = len(thresholds_array)                           # ("t" in the paper)

model_path = tokenizer_path = "DeepPavlov/rubert-base-cased"  

# You can use either standard or fine-tuned BERT. If you want to use fine-tuned BERT to your current task, save the
# model and the tokenizer with the commands tokenizer.save_pretrained(output_dir); 
# bert_classifier.save_pretrained(output_dir) into the same directory and insert the path to it here.

### Explanation of stats_name parameter

Currently, we implemented calculation of the following graphs features:
* "s"    - amount of strongly connected components
* "w"    - amount of weakly connected components
* "e"    - amount of edges
* "v"    - average vertex degree
* "c"    - amount of (directed) simple cycles
* "b0b1" - Betti numbers

The variable stats_name contains a string with the names of the features, which you want to calculate. The format of the string is the following:

"stat_name + "_" + stat_name + "_" + stat_name + ..."

**For example**:

`stats_name == "s_w"` means that the number of strongly and weakly connected components will be calculated

`stats_name == "b0b1"` means that only the Betti numbers will be calculated

`stats_name == "b0b1_c"` means that Betti numbers and the number of simple cycles will be calculated

e.t.c.

## Filenames

In [9]:
subset = "train_ru"           # .csv file with the texts, for which we count topological features
input_dir = "small_gpt_web/"  # Name of the directory with .csv file
output_dir = "small_gpt_web/" # Name of the directory with calculations results

prefix = output_dir + subset

r_file     = output_dir + 'attentions/' + subset  + "_all_heads_" + str(len(layers_of_interest)) + "_layers_MAX_LEN_" + \
             str(max_tokens_amount) + "_" + model_path.split("/")[-1]
# Name of the file for attention matrices weights

stats_file = output_dir + 'features/' + subset + "_all_heads_" + str(len(layers_of_interest)) + "_layers_" + stats_name \
             + "_lists_array_" + str(thrs) + "_thrs_MAX_LEN_" + str(max_tokens_amount) + \
             "_" + model_path.split("/")[-1] + '.npy'

In [10]:
stats_file

'small_gpt_web/features/train_ru_all_heads_12_layers_s_e_v_c_b0b1_lists_array_6_thrs_MAX_LEN_128_bert-base-uncased.npy'

In [11]:
r_file

'small_gpt_web/attentions/train_ru_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

.csv file must contain the column with the name **sentence** with the texts. It can also contain the column **labels**, which will be needed for testing. Any other arbitrary columns will be ignored.

In [12]:
try:
    data = pd.read_csv(input_dir + subset + ".csv").reset_index(drop=True)
except:
    #data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t")
    data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t", header=None)
    data.columns = ["0", "labels", "2", "sentence"]

In [13]:
len(data)

129066

In [14]:
from math import ceil

batch_size = 10 # batch size
number_of_batches = ceil(len(data['Text']) / batch_size)
DUMP_SIZE = 100 # number of batches to be dumped
number_of_files = ceil(number_of_batches / DUMP_SIZE)

In [15]:
sentences = data['Text']
print("Average amount of words in example:", \
      np.mean(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['Text'])))))
print("Max. amount of words in example:", \
      np.max(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['Text'])))))
print("Min. amount of words in example:", \
      np.min(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['Text'])))))

Average amount of words in example: 222.06571831466073
Max. amount of words in example: 3484
Min. amount of words in example: 5


In [16]:
MAX_LEN = max_tokens_amount

import pickle
# tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
# Our computing cluster does not allow internet connections from jupyter. Hence tokenizer is loaded from login shell and serialized.
f = open('/home/amshtareva/tokenizer_obj','rb')
tokenizer = pickle.load(f)
#tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)

In [17]:
def get_token_length(batch_texts):
    inputs = tokenizer.batch_encode_plus(batch_texts,
       return_tensors='pt',
       add_special_tokens=True,
       max_length=MAX_LEN,             # Max length to truncate/pad
       pad_to_max_length=True,         # Pad sentence to max length
       truncation=True
    )
    inputs = inputs['input_ids'].numpy()
    n_tokens = []
    indexes = np.argwhere(inputs == tokenizer.pad_token_id)
    for i in range(inputs.shape[0]):
        ids = indexes[(indexes == i)[:, 0]]
        if not len(ids):
            n_tokens.append(MAX_LEN)
        else:
            n_tokens.append(ids[0, 1])
    return n_tokens

In [None]:
data['tokenizer_length'] = get_token_length(data['Text'].values)

In [None]:
data

In [None]:
ntokens_array = data['tokenizer_length'].values

In [None]:
pickle.dump(ntokens_array, open('ntokens.obj', 'wb'))

## Attention extraction

Loading **BERT** and tokenizers using **transformers** library.

In [None]:
batched_sentences = np.array_split(data['Text'].values, number_of_batches)
adj_matricies = []
adj_filenames = []
batch_size = 10 # batch size
number_of_batches = ceil(len(data['Text']) / batch_size)
DUMP_SIZE = 100 # number of batches to be dumped
number_of_files = ceil(number_of_batches / DUMP_SIZE)
number_of_batches_single = ceil(len(data['Text']) / batch_size)
single_set = ceil(number_of_batches_single / DUMP_SIZE)
assert number_of_batches == len(batched_sentences) # sanity check

In [None]:
import torch
torch.cuda.is_available()

In [None]:
device='cuda:0'
#model = BertForSequenceClassification.from_pretrained(model_path, output_attentions=True)
g = open('/home/amshtareva/model_obj','rb')
model = pickle.load(g)
model = model.to(device)

In [None]:
template_features_array = pickle.load(open('template_features_array.obj', 'rb'))
stats_tuple_lists_array = pickle.load(open('stats_tuple_lists_array.obj', 'rb'))

In [None]:
mod = 2000
number_of_batches_single = ceil(mod / batch_size)
single_set = ceil(number_of_batches_single / DUMP_SIZE)

In [None]:
feature_list = ['self', 'beginning', 'prev', 'next', 'comma', 'dot']

In [None]:
#attention_dir = 'small_gpt_web/attentions/'
#attention_name = subset + '_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

#texts_name = 'small_gpt_web/' + subset + '.csv'

MAX_LEN = 128
component = 64
iterv = number_of_batches - number_of_batches_single*component

In [None]:
while iterv > 0:
#for component in range(4):
    for i in tqdm(range(min(number_of_batches_single, iterv)), desc="Weights calc"): 
        attention_w = grab_attention_weights(model, tokenizer, batched_sentences[i+component*number_of_batches_single], max_tokens_amount, device)
        # sample X layer X head X n_token X n_token
        adj_matricies.append(attention_w)
        if (i+1) % DUMP_SIZE == 0: # dumping
            print(f'Saving: shape {adj_matricies[0].shape}')
            adj_matricies = np.concatenate(adj_matricies, axis=1)
            print("Concatenated")
            adj_matricies = np.swapaxes(adj_matricies, axis1=0, axis2=1) # sample X layer X head X n_token X n_token
            # Carefully with boundaries
            filename = r_file + "_part" + str(ceil(i/DUMP_SIZE)+component*single_set) + "of" + str(number_of_files) + '.npy'
            print(f"Saving weights to : {filename}")
            adj_filenames.append(filename)
            np.save(filename, adj_matricies)
            adj_matricies = []

    if len(adj_matricies):
        print("Alert!")
        filename = r_file + "_part" + str(ceil(i/DUMP_SIZE)+component*single_set) + "of" + str(number_of_files) + '.npy'
        print(f'Saving: shape {adj_matricies[0].shape}')
        adj_matricies = np.concatenate(adj_matricies, axis=1)
        print("Concatenated")
        adj_matricies = np.swapaxes(adj_matricies, axis1=0, axis2=1) # sample X layer X head X n_token X n_token
        print(f"Saving weights to : {filename}")
        np.save(filename, adj_matricies)
        adj_matricies = []

    print("Results saved.")
    
    adj_filenames = [
        output_dir + 'attentions/' + filename 
        for filename in os.listdir(output_dir + 'attentions/') if r_file in (output_dir + 'attentions/' + filename)
    ]
    # sorted by part number
    adj_filenames = sorted(adj_filenames, key = lambda x: int(x.split('_')[-1].split('of')[0][4:].strip())) 
    print(adj_filenames)
    
    for i, filename in enumerate(tqdm(adj_filenames, desc='Вычисление признаков')):
        adj_matricies = np.load(filename, allow_pickle=True)
        print((i+component*single_set)*batch_size*DUMP_SIZE)
        print((i+component*single_set+1)*batch_size*DUMP_SIZE)
        ntokens = ntokens_array[(i+component*single_set)*batch_size*DUMP_SIZE : (i+component*single_set+1)*batch_size*DUMP_SIZE]
        splitted = split_matricies_and_lengths(adj_matricies, ntokens, num_of_workers) # 2 or 20.
        args = [(m, thresholds_array, ntokens, stats_name.split("_"), stats_cap) for m, ntokens in splitted]
        stats_tuple_lists_array_part = pool.starmap(
            count_top_stats, args
        )
        stats_tuple_lists_array.append(np.concatenate([_ for _ in stats_tuple_lists_array_part], axis=3))
    
    for i, filename in tqdm(list(enumerate(adj_filenames)), desc='Features calc'):
        adj_matricies = np.load(filename, allow_pickle=True)
        ntokens = ntokens_array[(i+component*single_set)*batch_size*DUMP_SIZE : (i+component*single_set+1)*batch_size*DUMP_SIZE]
        splitted = split_matricies_and_lengths(adj_matricies, ntokens, num_of_workers)
        
        args = [(m, feature_list, list_of_ids) for m, list_of_ids in splitted]

        template_features_array_part = pool.starmap(
            calculate_features_t, args
        )
        template_features_array.append(np.concatenate([_ for _ in template_features_array_part], axis=3))

    iterv = iterv - number_of_batches_single
    component = component + 1
    
    for filename in os.listdir(output_dir + 'attentions/'):
        filepath = os.path.join(output_dir + 'attentions/', filename)
        try:
            shutil.rmtree(filepath)
        except OSError:
            os.remove(filepath)
            
    pickle.dump(stats_tuple_lists_array, open('stats_tuple_lists_array.obj', 'wb'))
    pickle.dump(template_features_array, open('template_features_array.obj', 'wb'))

## Calculating topological features

In [None]:
stats_tuple_lists_array = np.concatenate(stats_tuple_lists_array, axis=3)

In [None]:
stats_tuple_lists_array.shape

In [None]:
from numpy import inf

np.sum(stats_tuple_lists_array[stats_tuple_lists_array == -inf]) + \
np.sum(stats_tuple_lists_array[stats_tuple_lists_array == inf])

In [None]:
stats_file

In [None]:
np.save(stats_file, stats_tuple_lists_array)

##### Checking the size of features matrices:

Layers amount **Х** Heads amount **Х** Features amount **X** Examples amount **Х** Thresholds amount

**For example**:

`stats_name == "s_w"` => `Features amount == 2`

`stats_name == "b0b1"` => `Features amount == 2`

`stats_name == "b0b1_c"` => `Features amount == 3`

e.t.c.

`thresholds_array == [0.025, 0.05, 0.1, 0.25, 0.5, 0.75]` => `Thresholds amount == 6`

In [None]:
template_features_array = np.concatenate(template_features_array, axis=3)

In [None]:
"small_gpt_web/features/" + attention_name + "_template.npy"

In [None]:
np.save("small_gpt_web/features/" + attention_name + "_template.npy", template_features_array)