In [None]:
!pip install transformers==4.5.1

In [2]:
import transformers
import os
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import time
import pickle
import statistics 
from scipy import stats
import math
import csv

In [3]:
# Folders on the mounted Google Drive. Both values end with '/' because file names are
# appended to them by plain string concatenation.
datasetPath = '/content/drive/My Drive/MTP/Datasets/'  # input: <language>-sentences-1000.txt files
weightsPath = '/content/drive/My Drive/MTP/Weights/'   # output folder named by the (commented-out) pickle dumps

In [4]:
# Colab-only step: mount Google Drive at /content/drive, the prefix used by weightsPath and datasetPath.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Delimiter heads statistical test in pretrained mBERT**

In [6]:
# This cell runs the statistical test for the delimiter roles (CLS, SEP) for the specified
# languages and functional roles.

languages=['english']
functions=['cls','sep']

model='bert-base-multilingual-cased'
MAX_LEN=64
BATCH_SIZE=8

# The tokenizer depends only on the checkpoint name, so it is loaded once here instead of
# once per (language, role) pair.
tokenizer=transformers.BertTokenizer.from_pretrained(model)

for lang in languages:
  for func in functions:
    print(lang+' '+func)
    sentences_file=datasetPath + lang + '-sentences-1000.txt'

    # Read all sentences (one per line, trailing newline kept); the context manager
    # guarantees the file handle is closed even if reading fails.
    with open(sentences_file,'r',encoding='utf-8') as in_file:
      lines=in_file.readlines()

    class SentenceDataset(Dataset):
      """Map-style dataset that encodes one raw sentence per item.

      Every item is encoded on access with the tokenizer passed to the constructor and is
      padded or truncated to exactly `max_len` positions (special tokens included).
      """

      def __init__(self, sentences,tokenizer,max_len):
        """Keep the inputs needed to encode sentences lazily.

        sentences: indexable collection of sentences (each one is passed through str()).
        tokenizer: object exposing `encode_plus` (a transformers tokenizer in this notebook).
        max_len:   fixed length every encoded sentence is padded/truncated to.
        """
        self.sentences=sentences
        self.tokenizer=tokenizer
        self.max_len=max_len

      def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

      def __getitem__(self,item):
        """Encode sentence number `item`.

        Returns a dict holding the raw 'sentence' together with the 'input_ids' and
        'attention_mask' entries produced by `encode_plus` (requested as PyTorch tensors).
        """
        sentence=str(self.sentences[item])

        # Encode with the tokenizer this dataset was constructed with, not with whatever
        # module-level variable happens to be named `tokenizer`.
        encoding=self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',  # non-deprecated spelling of pad_to_max_length=True
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
            return_tensors='pt'
        )

        return{
            'sentence':sentence,
            'input_ids':encoding['input_ids'],
            'attention_mask':encoding['attention_mask']
        }

    # Number of sentences read from the file.
    n = len(lines)

    def create_data_loader(sentences,tokenizer,max_len,batch_size):
      """Build a DataLoader that yields encoded sentences in batches of `batch_size`.

      The sentences are wrapped in a SentenceDataset (encoded with `tokenizer`, fixed length
      `max_len`). No shuffle argument is passed, and the scoring loop in this cell maps
      batch positions back to `lines` by index.
      """
      dataset = SentenceDataset(sentences, tokenizer, max_len)
      loader = DataLoader(dataset, batch_size=batch_size)
      return loader


    data_loader=create_data_loader(lines,tokenizer,MAX_LEN,BATCH_SIZE)

    # Attention column of the delimiter under test: 0 is the first position (CLS); -1 is the
    # last column of the un-padded sentence slice taken below (SEP).
    index=0 if func=='cls' else -1

    # headSentScores[h][s] = score of head h on sentence s. Heads are numbered from 1
    # (layer*12 + head + 1), so row 0 is never written. The column count follows the number
    # of sentences actually read, so a longer file cannot overflow the matrix and a shorter
    # one leaves no zero-filled columns behind.
    headSentScores=[[0 for i in range(len(lines))] for j in range(145)]

    # NOTE(review): the model does not depend on lang/func; loading it once above the loops
    # would avoid re-instantiating it on every iteration.
    bert_model = transformers.BertModel.from_pretrained(model, output_attentions=True)

    for count, data in enumerate(data_loader):
        # Drop only the singleton dimension each dataset item carries (dim 1 after batching).
        # A bare squeeze() would also drop the batch dimension of a final one-sentence batch.
        input_ids=data['input_ids'].squeeze(1)
        attention_mask=data['attention_mask'].squeeze(1)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            last_hidden_state,pooler_output,attentions = bert_model(input_ids=input_ids,
                                                                      attention_mask=attention_mask,
                                                                      return_dict=False)

        for element in range(input_ids.shape[0]):
            sent_idx = count * BATCH_SIZE + element
            # Tokenize once per sentence; only the un-padded length is needed:
            # CLS + at most MAX_LEN-2 word pieces + SEP.
            tokens = tokenizer.tokenize(lines[sent_idx])
            sent_len = min(len(tokens), MAX_LEN - 2) + 2

            for attention_id, attention in enumerate(attentions):
                for head in range(attention.shape[1]):
                    number = attention_id * 12 + (head + 1)
                    # Attention restricted to the real (non-padding) tokens of this sentence.
                    array = attention[element][head][:sent_len, :sent_len].numpy()
                    # Per token: weight given to the delimiter relative to the mean weight
                    # of that token's row; the sentence score is the mean over all tokens.
                    rowMeans = array.sum(axis=1) / sent_len
                    sentScore = (array[:, index] / rowMeans).mean()
                    headSentScores[number][sent_idx]=sentScore

        print(count)

    # with open(weightsPath + lang +'-matrix-' + func + '-1000.pl', 'wb') as f:
    #   pickle.dump(headSentScores, f)

**Local heads statistical test in pretrained mBERT**

In [8]:
# This cell runs the statistical tests for the local role for the specified languages.

languages=['english']
functions=['local']

model='bert-base-multilingual-cased'
bert_model = transformers.BertModel.from_pretrained(model, output_attentions=True)
# The tokenizer depends only on the checkpoint name, so it is loaded once here instead of
# once per (language, role) pair.
tokenizer=transformers.BertTokenizer.from_pretrained(model)
# NOTE(review): `threshold` is not read anywhere in this cell; the scoring loop hard-codes
# a window of two positions on each side. Confirm whether the two are meant to be linked.
threshold=3
MAX_LEN=64
BATCH_SIZE=8

for lang in languages:
  for func in functions:
    print(lang+' '+func)
    sentences_file=datasetPath + lang + '-sentences-1000.txt'

    # Read all sentences (one per line, trailing newline kept); the context manager
    # guarantees the file handle is closed even if reading fails.
    with open(sentences_file,'r',encoding='utf-8') as in_file:
      lines=in_file.readlines()

    class SentenceDataset(Dataset):
      """Map-style dataset that encodes one raw sentence per item.

      Every item is encoded on access with the tokenizer passed to the constructor and is
      padded or truncated to exactly `max_len` positions (special tokens included).
      """

      def __init__(self, sentences,tokenizer,max_len):
        """Keep the inputs needed to encode sentences lazily.

        sentences: indexable collection of sentences (each one is passed through str()).
        tokenizer: object exposing `encode_plus` (a transformers tokenizer in this notebook).
        max_len:   fixed length every encoded sentence is padded/truncated to.
        """
        self.sentences=sentences
        self.tokenizer=tokenizer
        self.max_len=max_len

      def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

      def __getitem__(self,item):
        """Encode sentence number `item`.

        Returns a dict holding the raw 'sentence' together with the 'input_ids' and
        'attention_mask' entries produced by `encode_plus` (requested as PyTorch tensors).
        """
        sentence=str(self.sentences[item])

        # Encode with the tokenizer this dataset was constructed with, not with whatever
        # module-level variable happens to be named `tokenizer`.
        encoding=self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',  # non-deprecated spelling of pad_to_max_length=True
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
            return_tensors='pt'
        )

        return{
            'sentence':sentence,
            'input_ids':encoding['input_ids'],
            'attention_mask':encoding['attention_mask']
        }

    # Number of sentences read from the file.
    n = len(lines)

    def create_data_loader(sentences,tokenizer,max_len,batch_size):
      """Build a DataLoader that yields encoded sentences in batches of `batch_size`.

      The sentences are wrapped in a SentenceDataset (encoded with `tokenizer`, fixed length
      `max_len`). No shuffle argument is passed, and the scoring loop in this cell maps
      batch positions back to `lines` by index.
      """
      dataset = SentenceDataset(sentences, tokenizer, max_len)
      loader = DataLoader(dataset, batch_size=batch_size)
      return loader


    data_loader=create_data_loader(lines,tokenizer,MAX_LEN,BATCH_SIZE)

    # headSentScores[h][s] = score of head h on sentence s. Heads are numbered from 1
    # (layer*12 + head + 1), so row 0 is never written. The column count follows the number
    # of sentences actually read, so a longer file cannot overflow the matrix and a shorter
    # one leaves no zero-filled columns behind.
    headSentScores=[[0 for i in range(len(lines))] for j in range(145)]

    for count, data in enumerate(data_loader):
        # Drop only the singleton dimension each dataset item carries (dim 1 after batching).
        # A bare squeeze() would also drop the batch dimension of a final one-sentence batch.
        input_ids=data['input_ids'].squeeze(1)
        attention_mask=data['attention_mask'].squeeze(1)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            last_hidden_state,pooler_output,attentions = bert_model(input_ids=input_ids,
                                                                      attention_mask=attention_mask,
                                                                      return_dict=False)

        for element in range(input_ids.shape[0]):
            sent_idx = count * BATCH_SIZE + element
            # Tokenize once per sentence; only the un-padded length is needed:
            # CLS + at most MAX_LEN-2 word pieces + SEP.
            tokens = tokenizer.tokenize(lines[sent_idx])
            sent_len = min(len(tokens), MAX_LEN - 2) + 2

            for attention_id, attention in enumerate(attentions):
                for head in range(attention.shape[1]):
                    number = attention_id * 12 + (head + 1)
                    # Attention restricted to the real (non-padding) tokens of this sentence.
                    array = attention[element][head][:sent_len, :sent_len].numpy()
                    sentScore=0
                    counti=0
                    # Only tokens with two neighbours on each side contribute.
                    for i in range(2, sent_len-2):
                        counti+=1
                        # Mean weight over the 5-token window centred on i, relative to the
                        # mean weight of the whole row.
                        windowMean=array[i,i-2:i+3].sum()/5
                        sentScore+=windowMean/(array[i,:].sum()/sent_len)
                    # Sentences shorter than five tokens have no such token; their score stays 0.
                    if(counti!=0):
                        sentScore/=counti
                    headSentScores[number][sent_idx]=sentScore

        print(count)

    # with open(weightsPath + lang +'-matrix-' + func + '-1000.pl', 'wb') as f:
    #   pickle.dump(headSentScores, f)