# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [68]:
#python libraries
import stanza
from stanza_batch import batch

import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import os
import re
import csv
from tqdm import tqdm
import time
from collections import Counter, defaultdict


# self written modules
import preprocessing
import candidate_processing as cand_prep
import candidate_extraction as cand_ex

"""import candidate_extraction as cand_ex
from ekphrasis.classes.segmenter import Segmenter
seg = Segmenter() """

from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 


## 1. We import the data

In [69]:
#data_url = r"CBS - Copenhagen Business School\Kick-Ass Master Thesis - General\Data\moria-data/moria_no_duplicates.csv"
beirut_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_beirut.csv" # for Beirut
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_tigray.csv" # for Tigray
channel_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_channel.csv" # for Channel
moria_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_moria.csv" # for Moria
all_url = r"Dropbox (CBS)/Master thesis data/df_tweets.csv" # for all


def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
moria_df = read_event_df(moria_url)
beirut_df = read_event_df(beirut_url)

#channel_df = read_event_df(channel_url)

loaded 92806 tweets!
loaded 24511 tweets!


## 3. We instantiate stanza english language module

In [67]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#stanza.install_corenlp()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# loading the pipeline
en_nlp = stanza.Pipeline("en", tokenize_pretokenized=True, ner_batch_size=4096)

2021-04-11 18:56:42 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-04-11 18:56:42 INFO: Use device: cpu
2021-04-11 18:56:42 INFO: Loading: tokenize
2021-04-11 18:56:42 INFO: Loading: pos
2021-04-11 18:56:44 INFO: Loading: lemma
2021-04-11 18:56:46 INFO: Loading: depparse
2021-04-11 18:56:47 INFO: Loading: sentiment
2021-04-11 18:56:49 INFO: Loading: ner
2021-04-11 18:56:51 INFO: Done loading processors!


In [4]:
import pickle

def pickle_files(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)

In [70]:

tqdm.pandas()
moria_tweets = event_df['text'].progress_apply(preprocessing.preprocess_tweets)



100%|███████████████████████████████████████████████████████████████████████████| 92806/92806 [01:56<00:00, 798.48it/s]

Canada's immigrant population is 20%, USA is 13%. We're not of fire. We have systematic racism for sure, the difference is not electing leadership stoking the fire  and  getting elected on " I hate the same people you hate ".
Hi @EUHomeAffairs @Place Beauvau @BMI Bund @ukhomeoffice @Justitiedep @ministerieJenV thousands of refugees are at risk of covid 19 on Greek islands due to crowded unsanitary conditions. Will you act now to leave no one behind and save lives?
greece Dozens of Asylum seekers, who face the risk of becoming homeless due to the end of accommodation support, gathered in protest outside UNHCR in athens'ESTIA'is an accommodation programme funded by the EU  and  implemented by UNHCR with Greek NGOs
Hmmm? Maybe not, Spain is the COVID - 19 hot spot now and Greece still has immigrant camps don't they. They might not want to risk more infection.
Greece to evict over 10,000 refugees from shelters
Refugees can find information on their asylum procedures on these boards. Today 




In [86]:
i=240
for tweet in moria_tweets[i:i+10]:
    print('\n',tweet)



 Greek government forces thousands of refugees out of their residences

 A Sunday afternoon fire in Rainier displaced one person but cause no injuries, according to Columbia River Fire  and  Rescue.

 . @PrimeMinisterGR: I know you are committed to keeping all kids in greece healthy  and  safe through covid 19. Hundreds of migrant kids are locked up for no reason. You can save their childhoods. free the kids, put them in child - friendly housing now!

 Yeesh. Speaking as someone of Greek Cypriot origin, I'm not really comfortable with the idea of a game about the illegal Turkish invasion of Cyprus. The Turks killed and displaced a lot of people and are still occupying our land. That's not in good taste IMO. @compassgamesllc

 Sex, Drugs  and  Refugees. Syrian teenagers in Athens resort to prostitution ...  via @YouTube

 greece will purchase equipment worth of millions of euros for the Polices riot force, including large quantities of tear gas, in anticipation of a fresh push by illeg

In [None]:
event_crf_list  = load_pickle('moria_crf_list')
event_crf_list

In [None]:
event_np_list,event_crf_list,event_tagged_tweets = load_event_data('moria')

event_crf_list[:10]

In [None]:
def prep_candlist_for_batching(candidate):
    #change noun_phrase_list to batching compatible format    
    candidate = ['candidate_to_be_removed'] if len(candidate) == 0 else candidate     
    return ' \n '.join(candidate)

#print(f'Tagging {event_name} noun phrase candidates...')
#tag all tweets and save them in a list    
#batched_np_list = cand_prep.prep_candlist_for_batching(event_np_list)
# remove NP candidates longer than threshold and remove all child NPs of parent NPs
event_np_list = cand_prep.remove_long_nps(event_np_list[0:100])
event_np_list = cand_prep.remove_child_nps(event_np_list)
tqdm.pandas()

event_np_list = remove_weird_chars(event_np_list)
event_np_series = pd.Series(event_np_list)
#event_np_series = event_np_series.progress_apply(lowercase_except_first)
#print(event_np_series)
#batched_np_series = event_np_series.progress_apply(prep_candlist_for_batching)
#batched_np_list = list(event_np_series.progress_apply(prep_candlist_for_batching))
#print(batched_np_list)

#which if these two will be faster should be used 
#tagged_np_cands = batched_np_series.progress_apply(en_nlp)
tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]

#fast but is breaking the text
#tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]
#tagged_np_cands = en_nlp('\n\n'.join(event_np_list))
#tagged_np_cands = en_nlp('\n\n'.join(['. '.join(cand) for cand in event_np_list]))

#print(tagged_np_cands[0].sentences[1])
np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
#print(np_cand_heads)


# get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 


np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)


nps_cands = [cand for cands in np_and_cand_list for cand in cands]
nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences]



In [None]:
tagged_np_cands = en_nlp('\n\n'.join(['. '.join(cand) for cand in event_np_list]))
tagged_np_cands.text



In [None]:
#print(tagged_np_cands[30].sentences[0].text)
for i in range(len(event_np_list)):
    print(f'at index {i}: {event_np_list[i]}')
    #print(f'at index {i}: {event_tagged_tweets[i].text}')

In [None]:
i = 217
print(f'at index {i}: {nps_cands[i][0]}')
print(f'at index {i}: {nps_tagged[i].text}')

In [None]:
print(len(nps_cands),len(nps_tagged))

for i in range(len(nps_cands)):
    if nps_tagged[i].text != nps_cands[i][0]:
        #print(f'index {i} doesnt match for {len(nps_tagged[i].text)} and {len(nps_cands[i][0])}')
        print(f'index {i} doesnt match for {nps_tagged[i].text} and {nps_cands[i][0]}')
        


cand_df = pd.DataFrame(
    {'candidates': nps_cands,
     'cand_tags': nps_tagged
    })

cand_df

In [5]:
def load_event_data(event_name):
    assert event_name in ['moria','tigray','channel','all','beirut'], f"Oh no! We do not analyze {event_name} event"
    
    print(f'Loading {event_name} data...')
    try:
        #sample = 2000
        event_np_list = load_pickle(event_name + '_np_list')#[1000:sample]
        event_crf_list = load_pickle(event_name + '_crf_list')#[1000:sample]
        event_tagged_tweets = load_pickle(event_name + '_tagged_tweets')#[1000:sample]
        
        return event_np_list,event_crf_list,event_tagged_tweets
    except:
        print(f'The {event_name} files not found! Run candidate_extraction.py file on the {eventname}_df')
        return None
    
    
def remove_weird_chars(event_cand_list):
    weird_chars = ['@','>','<','\xa0','  -  ','.']
    for char in weird_chars:
        event_cand_list = cand_prep.remove_char(event_cand_list,char)
    return event_cand_list

def lowercase_except_first(tweet_cands):
    #print(tweet_cands)
    cand_lowercased = [" ".join([word[0] + word[1:].lower() for word in cand.split()]) for cand in tweet_cands ]
    #print(cand_lowercased)
    return cand_lowercased

In [None]:
def pipeline(event_name):
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 1. LOAD THE DATA ~~~~~~~~~~~~~~~~~~~~~
    event_np_list,event_crf_list,event_tagged_tweets = load_event_data(event_name)
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 2. GET POS AND NER TAGS ~~~~~~~~~~~~~~~~~~~~~
    # get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
    tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 3. PREPROCESS CANDIDATES ~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ processing of noun phrases ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} noun phrase candidates...')
    
    tqdm.pandas()
    # remove NP candidates longer than threshold and remove all child NPs of parent NPs
    event_np_list = cand_prep.remove_long_nps(event_np_list)
    event_np_list = cand_prep.remove_child_nps(event_np_list) 
    #event_np_list = remove_weird_chars(event_np_list)
    event_np_list = cand_prep.remove_char(event_np_list,'@')

    event_np_list = [['no_candidate'] if len(noun_ps)==0 or noun_ps ==' ' else noun_ps for noun_ps in event_np_list ]
    
    #print(event_np_list)
    print(f'Tagging {event_name} noun phrase candidates...')
    #tag all tweets and save them in a list    

    #tagged_np_cands = batched_np_list.progress_apply(en_nlp)
    tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]
    #tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]

    np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
    #print(np_cand_heads)
    
    np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)
    #print(event_np_list)
          
    # ~~~~~~~~~~~~ processing of coref candidates ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} coreference candidates...')    
    
    #extract only the representative mentions as representative phrases of candidates
    event_crf_list = [[coref_group[0][coref_group[1]] for coref_group in tweet_corefs] for tweet_corefs in event_crf_list]
    
    #event_crf_list = remove_weird_chars(event_crf_list)
    event_crf_list = cand_prep.remove_char(event_crf_list,'@')

    event_crf_list = [['no_candidate'] if len(crf_ps)==0 else crf_ps for crf_ps in event_crf_list ]
    
    print(f'Tagging {event_name} coreference candidates...')       
    #tag all tweets and save them in a list    
    #batched_coref_list = cand_prep.prep_candlist_for_batching(event_crf_list)
    #print(batched_coref_list)
    tagged_coref_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_crf_list)]
    #tagged_coref_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_coref_list, en_nlp, batch_size=6000))] 
    #print(tagged_coref_cands)
        
    coref_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_coref_cands]
          
    coref_and_cand_list = cand_prep.get_cand_type(event_crf_list, coref_cand_heads, tweet_tags)
    print(len(coref_and_cand_list))
    print(len(event_np_list))
          
    # ~~~~~~~~~~~~~~~~~~~~ combining candidate lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #concatenate corefs and noun phrase lists
    nps_cands = [cand for cands in np_and_cand_list for cand in cands]
    crf_cands = [cand for cands in coref_and_cand_list for cand in cands]
    #candidate_list = coref_and_cand_list + np_and_cand_list
    #print(f'Len = {len(candidate_list)} should be 2x amount of tweets')
    #print(len(nps_cands), len(crf_cands))
    #unpack list of lists into one list
    candidate_list = nps_cands + crf_cands
    print(f'The amount of all candidates is {len(candidate_list)} -  nps: {len(nps_cands)}, crfs:{len(crf_cands)}')
          
    nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]
    crf_tagged = [sent for tagged_cand in tagged_coref_cands for sent in tagged_cand.sentences ]
    print(len(nps_tagged), len(crf_tagged))
    all_cands_tagged = nps_tagged + crf_tagged

        
    #print(len(candidate_list),'vs', len(all_cands_tagged))
    cand_df = pd.DataFrame(
        {'candidates': candidate_list,
         'cand_tags': all_cands_tagged
        })

    cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
    cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))


    count_cands = Counter(cand_df['cand_text'])
    cand_df['cand_freq'] = cand_df["cand_text"].map(count_cands)
    
    #count_cands[cand_df['cand_text']]
    #count_sorted = sorted(count_cands.items(),key=lambda x: x[1],reverse=True)
    cand_df.columns = cand_df.columns.str.strip()
    
          
    # we sort the candidates by their length
    cand_df.sort_values('cand_freq', ascending=False,inplace=True)

    #cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

    cand_df.reset_index(drop=True, inplace = True)
    #remove dummy candidates that were used to avoid errors

    print(len(cand_df))
    cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
    cand_df = cand_df[cand_df.cand_text != 'no_candidate']
    len(cand_df)
    cand_df.reset_index(drop=True,inplace=True)
          
    return cand_df
          
          
moria_cands = pipeline('moria')

pickle_files('moria_cands_df', moria_cands)

In [None]:
event_np_list[1]

In [None]:
beirut_cands['cand_text'] = beirut_cands['cand_text'].apply(lambda x: x.lower())
beirut_cands

In [None]:
#event_np_list = load_pickle('beirut_np_list')

event_df['text'][37]

## 4. We apply stanza module on the tweets to get NER and POS tags. We do it in batches to speed things up.

## 5. As initial WCL candidates, we extract noun phrases (NPs) and coreference chains.

## We do so using CoreNLPClient wrapper

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media) - any new spotted should be added in preprocessing file's '__remove_tweet_signatures__' function
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE? - check
* play around with candidate types
* optimize code and make it neater



## 6. We keep only NPs shorter than 20 words and remove children of parent NPs 

## 7. We get the heads of noun phrases (in batches)

## 8. We define candidate types 

## 9. We assign candidate types to noun phrase candidates

## 10. We get coreference chains candidates from the tweet corpus

## 11. We determine candidate's type for representative mentions of coref candidates (in batches)

## 12. We combine the candidate lists for candidate merging

We organize candidates in a list sorted by their number of phrases

In [6]:
moria_cands = load_pickle('moria_cands_df')

### First merging step

In [7]:
#
# THIS IS THE FIRST MERGING STEP
#

        
def merging_step1(candidate_list):
    """
    In the first merging step, we merge two candidates if the head of each of their representative phrase 
     is identical by string comparison.
    """
    #print(list(candidate_list))
    indices_to_remove = set()
    for up_cand_id in tqdm(range(len(candidate_list))):   
        ##if index is already marked to remove then skip
        if up_cand_id in indices_to_remove:
            continue
        up_cand = candidate_list[up_cand_id]    
            
        for low_cand_id in range(up_cand_id+1,len(candidate_list)):
            low_cand = candidate_list[low_cand_id]
            #print(f'comparing {longer_cand} with {cand}')
            #print(f'for index {candidate_list[longer_cand][1]} checking the index {candidate_list[cand][1]}')
            #print(type(candidate_list[longer_cand]))
            #mark for merging if the head and its head's cand type is the same for 2 candidates
            #print(candidate_list[longer_cand][1],candidate_list[longer_cand][3])
            if up_cand[1].lower() == low_cand[1].lower():# and upper_cand[3] == lower_cand[3]:
                #print(f'matching "{longer_cand}" with "{cand}"')
                #print(f'{candidate_list[longer_cand][1]} ===== {candidate_list[cand][1]}')
                indices_to_remove.add(low_cand_id)
                #what_merged[candidate_list[longer_cand][0]].append(candidate_list[cand][0])
                
    return indices_to_remove


def merge_indices(cand_df,indices_to_remove):                

    print(f'Initial amount of candidates: {len(cand_df)}')                
    #print(len(sorted(indices_to_remove)))

    #for index in reversed(sorted(indices_to_remove)):
    cand_df.drop(indices_to_remove,inplace=True)
        
    cand_df.reset_index(drop=True,inplace=True)
    print(f'Amount of candidates: {len(cand_df)}, after removing {len(sorted(indices_to_remove))} indices') 
    return cand_df



In [None]:
event_cands_merged

In [None]:
for cand in cand_df['candidates']:
    print(cand[1], cand[3])

### Second merging step

We merge 2 candidates if their sets of phrases heads are semantically similar

In [8]:
import gensim

#load the GoogleNews 300dim model (fix path)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import numpy as np

def merging_step2(candidate_list):
    
    indices_to_remove = set()
    
    for upper_cand_id in tqdm(range(len(candidate_list))):     
        upper_cand = candidate_list[upper_cand_id]
        
        up_cand_mean_vec = phrase_heads_avg_vector(upper_cand[2])
        
        for lower_cand_id in range(upper_cand_id+1,len(candidate_list)): 
            lower_cand = candidate_list[lower_cand_id]
            #print(f'for index {candidate_list.index(longer_cand)} checking the index {candidate_list.index(cand)}')
            #if candidate_list[longer_cand][1] == candidate_list[cand][1]:
                #print(f'matching "{longer_cand}" with "{cand}"')
            low_cand_mean_vec = phrase_heads_avg_vector(lower_cand[2])

            if upper_cand[3] == lower_cand[3]:
                #try:
                    #print(1-cosine(long_cand_mean_vec,cand_mean_vec))
                    #print(long_cand_mean_vec.reshape(-1,1).shape, cand_mean_vec.reshape(1,-1).shape)
                    if 1-cosine(up_cand_mean_vec,low_cand_mean_vec) >= 0.7:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(lower_cand_id)
                        what_merged2[upper_cand[0].lower()].append(lower_cand[0].lower())
                        
                #except AttributeError:
                    #pass

            else:

                if 1-cosine(up_cand_mean_vec,low_cand_mean_vec) >= 0.8:
                    #print(f'matching "{longer_cand}" with "{cand}"') 
                    indices_to_remove.add(lower_cand_id)
                    what_merged2[upper_cand[0].lower()].append(lower_cand[0].lower())



    return indices_to_remove

def phrase_heads_avg_vector(phrase_set):
    phrase_head_vectors = []
    for phrase_head in phrase_set:    
        try:
            phrase_head_vectors.append(model[phrase_head])
        except KeyError:
            phrase_head_vectors.append(np.NaN)
    #phrase_head_vectors = [model[phrase_head] for phrase_head in phrase_set]
    if len(phrase_head_vectors) != 0:
        return np.mean(phrase_head_vectors,axis=0)
    else: 
        return np.NaN

        




In [None]:
list(event_cands_merged['cand_text'])


In [None]:
for cand in cand_df['candidates']:
    print(cand[1], cand[3])

## Third merging step representative labeling

currently working on average cosine similarity of each phrase in the candidate - maybe not optimal, maybe it will be better with a different threshold

In [10]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics.pairwise import cosine_similarity

def merging_step3(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:  
        #the head of noun phrase is marked with value 0 for the word.head
        cand_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_labeling_phrases = []
        for word, head, pos in cand_heads_pos:
            #head-1 because the pointer to head does not use 0 index
            if (pos == 'JJ' or pos=='VBN') and 'NN' in cand_heads_pos[head-1][2]:
                cand_labeling_phrases.append(f'{word}_{cand_heads_pos[head-1][0]}')
        phrases.append(cand_labeling_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for up_cand_id in range(len(candidate_list)):     
        up_cand = candidate_list[up_cand_id]
        up_cand_vectors = phrases_vectors(phrases[up_cand_id])
        if len(up_cand_vectors)==0:
            pass
        else:
            for low_cand_id in range(up_cand_id+1,len(candidate_list)): 
                low_cand = candidate_list[low_cand_id]
                low_cand_vectors = phrases_vectors(phrases[low_cand_id])
                if len(low_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(up_cand_vectors),len(low_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(up_cand_vectors)):
                        for j in range(len(low_cand_vectors)):

                            sim_matrix[i][j] = 1-cosine(up_cand_vectors[i],low_cand_vectors[j])

                    # can we compute matrix mean like this? 
                    #print(sim_matrix)
                    if np.mean(sim_matrix) > 0.6:
                        #print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(low_cand_id)
                        what_merged3[up_cand[0].lower()].append(low_cand[0].lower())
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove
                


def phrases_vectors(cand_phrases):
    
#for cand_phrases in phrases:
    #print(cand_phrases)
    cand_phrase_vectors = []
    for phrase in cand_phrases:
        try:
            cand_phrase_vectors.append(model[phrase])
            #print(f'for existing phrase "{phrase}" the vector is {model[phrase][0]}')
        except KeyError:
            phrase_words = phrase.split('_')
            #print(model[phrase_words[1]])
            try:
                phrase_vectors = [model[phrase_word] for phrase_word in phrase_words]
                #print(f'for phrase "{phrase}" avg vector is "{sum(phrase_vectors)/len(phrase_vectors)}') 
                cand_phrase_vectors.append(sum(phrase_vectors)/len(phrase_vectors))
            except KeyError:
                cand_phrase_vectors.append(np.NaN)
    #print(len(cand_phrase_vectors))
    return cand_phrase_vectors
    
 
#event_cands_merged = merge_indices(event_cands_merged, merging_step3(event_cands_merged))
#print(indices_to_remove)

In [None]:
what_merged3
    

### Merging step 4

In [11]:
# missing the second method - we check for the lexical identity of specific stems in multiple candidates.

def merging_step4(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:

        #the head of noun phrase is marked with value 0 for the word.head
        cand_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]

        #print(np_heads_pos)
        cand_compound_phrases = []
        for word, head, pos in cand_heads_pos:
            #i = np_heads_pos.index((word, head, pos))
            #print(np_heads_pos)
            #print(np_heads_pos[i])
            #print(np_heads_pos[head-1])
            #'NN' in np_heads_pos[head-1][2] and
            try:
                #if 'NN' in pos and 'NN' in cand_heads_pos[i+1][2] : 
                    #cand_compound_phrases.append(f'{word}_{cand_heads_pos[i+1][0]}')
                if 'NN' in pos and 'NN' in cand_heads_pos[head-1][2]:
                    cand_compound_phrases.append(f'{word}_{cand_heads_pos[head-1][0]}')
            except IndexError:
                pass
        phrases.append(cand_compound_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for up_cand_id in range(len(candidate_list)):     
        up_cand = candidate_list[up_cand_id]
        up_cand_vectors = phrases_vectors(phrases[up_cand_id])
        if len(up_cand_vectors)==0:
            pass
        else:
            for low_cand_id in range(up_cand_id+1,len(candidate_list)):
                low_cand = candidate_list[low_cand_id]
                low_cand_vectors = phrases_vectors(phrases[low_cand_id])
                if len(low_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(up_cand_vectors),len(low_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(up_cand_vectors)):
                        for j in range(len(low_cand_vectors)):
                            #print(cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)))
                            sim_matrix[i][j] = 1-cosine(up_cand_vectors[i],low_cand_vectors[j])
                            """if cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.4:                
                                sim_matrix[i][j] = 2
                            elif cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.2:
                                sim_matrix[i][j] = 1
                            else:
                                sim_matrix[i][j] = 0"""

                    #print(sim_matrix, up_cand,low_cand)            
                    if np.mean(sim_matrix) > 0.6:
                        print(f'{up_cand_id} and {low_cand_id} are {np.mean(sim_matrix)} similar' )
                        indices_to_remove.add(low_cand_id)
                        what_merged4[up_cand[0].lower()].append(low_cand[0].lower())
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove


#event_cands_merged = merge_indices(event_cands_merged, merging_step4(event_cands_merged))
#print(merging_step4(candidate_list))

In [None]:
what_merged

### Merging step 5


In [13]:
what_merged1,what_merged2,what_merged3,what_merged4 = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
event_cands = moria_cands

event_cands_merged = merge_indices(event_cands, merging_step1(event_cands['candidates']))




100%|████████████████████████████████████████████████████████████████████████| 682391/682391 [5:36:29<00:00, 33.80it/s]


Initial amount of candidates: 682391
Amount of candidates: 31503, after removing 650888 indices


In [64]:
event_cands_merged = merge_indices(event_cands_merged, merging_step2(event_cands_merged['candidates']))

what_merged2

 53%|█████████████████████████████████████▊                                  | 16571/31503 [8:33:51<7:43:01,  1.86s/it]


KeyboardInterrupt: 

In [None]:
event_cands_merged = merge_indices(event_cands_merged, merging_step3(event_cands_merged))

what_merged3

In [None]:
event_cands_merged = merge_indices(event_cands_merged, merging_step4(event_cands_merged))
what_merged4

In [None]:
pickle_files('moria_cands_merged',event_cands_merged)
pickle_files('moria_whatmerged2',what_merged2)
pickle_files('moria_whatmerged3',what_merged3)
pickle_files('moria_whatmerged4',what_merged4)

## Frame identification

In [34]:
"""frame_properties = {'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']}""" #from Hamborg's paper

# from paper Shifting the refugee narratives? by Greussing & Boomgaarden (2015)
frame_properties = {'settlement':['settlement','accomodation','permanent','temporary','barracks','accommodated','tent','camp', 'shelter'],
                   'reception':['quota', 'distribution', 'limit', 'selection','reception','together','asylum','receive'],
                    'security':['security', 'border','crossing','fence','control','flow'],
                    'criminality':['officer','terror','suspicion','crime','offense','police','trafficking','suspect'],
                    'economisation':['euro','economic','million','thousand','cost','money'],
                    'humanitarian':['humane','voluntary','help','support','aid','care','solidarity'],
                    'victimization':['fight','victim','war','dead','rescued','state'],
                    'integration': ['labour','employed','unemployed','integration','positive'],
                    
                    #from hamborg
                    'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']
                   
                   }



In [31]:
what_merged2 = load_pickle('beirut_whatmerged3')
what_merged2

defaultdict(list,
            {'lebanese government': ['lebanese ministry of foreign affairs',
              'lebanese migrants',
              'lebanese immigrants',
              'lebanese phalangist militias and the israeli military',
              'lebanese families',
              '16 , 1982, lebanese christian militiamen',
              'plight of syrian refugees in lebanon',
              'lebanese citizens',
              'lebanese christian',
              'a lebanese accent',
              'lebanese authorities',
              'israeli forces',
              'a syrian refugee kid 13 year',
              'lebanese nationals',
              'members of the camp and a local lebanese family',
              'marwa, a syrian refugee in lebanon',
              'the lebanese army',
              'this syrian refugee family n',
              'syrian refugees battle for survival',
              'syrian refugee misery',
              'lebanese labor market',
              'this palestin

NameError: name 'manual_cands' is not defined

In [35]:
#import conceptnet_lite as cn
import gensim
import gensim.downloader as api

#model = gensim.models.KeyedVectors.load_word2vec_format(r"C:/Users/niol19ac/Dropbox (CBS)/Master thesis data/GoogleNews-vectors-negative300.bin.gz", binary=True)

manual_cands = ['refugee','migrant','greece','turkey','syria','beirut','immigrant','aoun']


# to run on the server we should use larger model according to the paper - "conceptnet-numberbatch-17-06-300"
model = api.load("glove-twitter-200")


In [40]:
from nltk.corpus import stopwords
from collections import defaultdict
import nltk

nltk.download('stopwords')

stop_words = list(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

print('preprocessing tweets...')
tqdm.pandas()
tweets_corpus = list(event_df['text'].progress_apply(preprocessing.preprocess_tweets))


print('assigning frame properties to words from tweets...')
word_properties = defaultdict(dict)
for i in tqdm(tweets_corpus):
    tweet_words = [word.lower() for word in i.split() if word not in stop_words and len(word)>1]
    for word in tweet_words:
        #print(word)
        word = lemma.lemmatize(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(f'sim of {word}, {prop} is {model.similarity(word, prop)}')
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
                #print(weights)
                if max(weights)>0.4:
                    word_properties[word][prop] = max(weights)
            except KeyError:
                pass
            



print(word_properties)
        

            

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  0%|                                                                            | 105/92806 [00:00<01:29, 1039.46it/s]

preprocessing tweets...


100%|██████████████████████████████████████████████████████████████████████████| 92806/92806 [01:04<00:00, 1433.55it/s]
  0%|                                                                              | 2/92806 [00:00<1:41:19, 15.27it/s]

assigning frame properties to words from tweets...


100%|████████████████████████████████████████████████████████████████████████████| 92806/92806 [52:11<00:00, 29.64it/s]




In [43]:
word_properties_beirut = word_properties.copy()


In [44]:
beirut_tagged_tweets = load_pickle('beirut_tagged_tweets')
moria_tagged_tweets = load_pickle('moria_tagged_tweets')
#coref_chains[2]
word_properties_moria

defaultdict(dict,
            {'immigrant': {'integration': 0.51407313},
             'population': {'reception': 0.40921068,
              'economisation': 0.44051868},
             'usa': {'settlement': 0.45015204,
              'economisation': 0.4147114,
              'victimization': 0.5107105},
             'we': {'settlement': 0.43890932,
              'reception': 0.71172595,
              'security': 0.4454641,
              'criminality': 0.41856155,
              'economisation': 0.587628,
              'humanitarian': 0.6216242,
              'victimization': 0.5621486,
              'integration': 0.40430248,
              'affection': 0.691247,
              'reason': 0.60125166,
              'difficulty': 0.5067118,
              'honor': 0.42500293},
             'racism': {'criminality': 0.4122551,
              'economisation': 0.4090782,
              'victimization': 0.41302788,
              'integration': 0.42162287,
              'no trustworthiness': 0.52987504

In [None]:
for tweet_id in range(len(tagged_tweets))[:10]:
    print(coref_chains[tweet_id])
    print(tagged_tweets[tweet_id].text)

In [46]:
from collections import defaultdict
# import these modules 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus.reader.wordnet import NOUN
import numpy as np
  
lemma = WordNetLemmatizer() 

beirut_df[['date','time']] = beirut_df['created_at'].str.split(' ',expand=True)
moria_df[['date','time']] = moria_df['created_at'].str.split(' ',expand=True)

def get_frames(event_df,tagged_tweets,word_properties):
    cand_frames = defaultdict(list)
    for tweet_id in tqdm(range(len(tagged_tweets))):
        #print(tweet)
        cand_words = [[word.id, word.text,word.head] for sent in tagged_tweets[tweet_id].sentences for word in sent.words]
        #print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')
        #print(len(cand_df['candidates']))
        for cand in manual_cands:#event_merged_cands['candidates']:
                # lemmatize representative head of candidate 
                rep_head = cand #lemma.lemmatize(cand[1].lower(),pos=NOUN)

                if rep_head in tagged_tweets[tweet_id].text.lower() and len(rep_head)>1:
                    #find all dependencies of the phrase head
                    for related in range(len(cand_words)):
                        cand_word_lemma = lemma.lemmatize(cand_words[related][1].lower())
                        #print(f'Yes it is, related = {lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)}')
                        #lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)
                        if rep_head == cand_word_lemma:
                            related_word = lemma.lemmatize(cand_words[cand_words[related][2]-1][1].lower())
                            cand_frames['word'].append(rep_head)
                            cand_frames['date'].append(event_df['date'][tweet_id])
                            #cand_frames['word'].append(phrase_head)
                            for frame_property in list(frame_properties.keys()):
                                #print(frame_property)

                                try:
                                    #print(word_properties[phrase_head][frame_property])
                                    cand_frames[frame_property].append(word_properties[related_word][frame_property])

                                except KeyError:
                                    #print('Error')
                                    #cand_frames[frame_property].append(word_properties['tent'][frame_property])
                                    cand_frames[frame_property].append(np.NaN)
                                
    return cand_frames
                    #print(len(cand_frames[frame_property]))
                    
                #print('\n')
beirut_cand_frames = get_frames(beirut_df, beirut_tagged_tweets,word_properties_beirut)               
moria_cand_frames = get_frames(moria_df, moria_tagged_tweets,word_properties_moria)                   
"""for frame_property in list(frame_properties.keys()):
                    for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    #print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                            #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                        except KeyError:
                            pass"""
                        #[cand_frames[seed_word][cand].append(model.similarity(print(f'{cand} is related to {np_heads[np_heads[related][2]+1][1]}') if cand == np_heads[related][1] else print('nej') for related in range(len(np_heads))]
            #print(get_head(cand))
            #print(np_heads[19][1])
            #[f(x) if condition else g(x) for x in sequence]
            #[print(np_heads[np_heads[related][2]-1]) if get_head(cand)==np_heads[related][1] else print('hi') for related in range(len(np_heads))]

            
#became ___ (vb and vbx)
#(VP sit/VB (PP on/IN (NP the/DT mat/NN))))) 

#common phrases = migrant camp, covid case, covid test

#cand_frames

100%|███████████████████████████████████████████████████████████████████████████| 24511/24511 [00:45<00:00, 539.76it/s]
100%|██████████████████████████████████████████████████████████████████████████| 92806/92806 [01:08<00:00, 1360.97it/s]


"for frame_property in list(frame_properties.keys()):\n                    for seed_word in frame_properties[frame_property]:\n                        try:\n                            for related in range(len(np_heads)):\n                                #print(np_heads[related])\n                                #if cand[1] == np_heads[related][1]:\n                                if phrase_head == np_heads[related][1]:\n                                    #print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')\n                                    cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))\n                            #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]\n                        except KeyError:\n                            pass"

In [50]:

def aggr_frames(cand_frames,the_word):
    framed_words = pd.DataFrame.from_dict(cand_frames)

    merged_frames = framed_words.copy()

    #framed_words[framed_words['date']=='2020-09-04']

    merged_frames['no trustworthiness'] = - merged_frames['no trustworthiness']
    merged_frames['refusal'] = - merged_frames['refusal']
    merged_frames['irrationality'] = -merged_frames['irrationality']
    merged_frames['difficulty'] = -merged_frames['difficulty'] 
    merged_frames['dishonor'] = -merged_frames['dishonor']


    trust = ['trustworthiness', 'no trustworthiness']
    honor = ['honor', 'dishonor']
    affection = ['affection','refusal']
    reason = ['reason','irrationality']
    easiness = ['easiness','difficulty']

    merged_frames = pd.lreshape(merged_frames,
                            {'reason':reason, 'honor':honor,'affection':affection,'trust':trust,'easiness':easiness},
                           dropna=False)


    merged_frames = merged_frames[merged_frames['word'] == the_word]

    aggr_frames = merged_frames.groupby(['word','date'],as_index=False).median()
    frame_size = merged_frames.groupby(['word','date'],as_index=False).size()

    return aggr_frames,frame_size


In [53]:
the_word = 'refugee'
moria_aggr_frames,_ = aggr_frames(moria_cand_frames,the_word)
beirut_aggr_frames,_= aggr_frames(beirut_cand_frames,the_word)

In [63]:
import plotly.express as px
from plotly.subplots import make_subplots

#fig = make_subplots(specs=[[{"secondary_y": True}]])


frame_dim = 'honor'

fig = px.line(moria_aggr_frames, x="date", y=[moria_aggr_frames[frame_dim],beirut_aggr_frames[frame_dim]],
             title=f'Frame bias towards "{the_word}" within frame {frame_dim}')

#ax2 = px.line(beirut_aggr_frames, x="date", y=[frame_dim],render_mode='webgl')

#ax2.update_traces(yaxis='y2')
#fig.add_traces(ax1.data + ax2.data)

#fig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

fig.show()

In [None]:
fig = px.line(aggr_frames, x="date", y=['difficulty','easiness'], title=f'Frame bias towards {the_word}')
fig.show()

# TESTING:

In [None]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus)[:50] 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

In [None]:


for tweet in tqdm(range(len(tweets_corpus))):
    print(tweets_corpus[tweet])
    np_heads = [[word.id, word.text,word.head,word.deprel] for sent in tagged_tweets[tweet].sentences for word in sent.words]
    print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet].sentences for word in sent.words], sep='\n')
    #print(np_heads)
    ph_ids = set([np_heads[i][2] for i in range(len(np_heads))])
    ph_words = [np_heads[i-1][1] for i in ph_ids]

    word_pairs = [(np_heads[word][1], np_heads[np_heads[word][2]-1][1]) for word in range(len(np_heads)) if np_heads[word][2] != 0]
    #print(word_pairs)
    
    compounds = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if 'compound' in np_heads[i][3]]
    print(compounds)
    
    advmods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='advmod']
    print(advmods)
    
    amods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='amod']
    print(amods)
    for pair in word_pairs:
        phrase = pair[0]+'_'+pair[1]

    #print(model.most_similar('illegal_immigrant'))

    
    """#print(len(cand_df['candidates']))
    candidate_list = cand_df['candidates']
    for cand in cand_df['candidates']:
        #print(cand[2])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:
            #print(phrase_head)
            #if str(cand[1]) in str(tweet):
            if str(phrase_head) in str(tweet) and len(phrase_head)>2:
                #print(phrase_head)
                ph_words = [np_heads[i-1][1] for i in phrase_heads]
                #print(ph_words)
                for related in range(len(np_heads)):
                    if phrase_head == np_heads[related][1]:
                        pass
                        #print(f'checking {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                  for frame_property in list(frame_properties.keys()):
                        for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    #cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                                #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                                except KeyError:
                                pass"""