# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [1]:
#python libraries
import stanza
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import os
import re
import csv
from tqdm import tqdm
import time

# self written modules
import preprocessing
import candidate_processing as cand_prep
import candidate_extraction as cand_ex


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. We import the data and split them based on the event date

In [3]:
#data_url = r"CBS - Copenhagen Business School\Kick-Ass Master Thesis - General\Data\moria-data/moria_no_duplicates.csv"
beirut_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_beirut.csv" # for Beirut
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_tigray.csv" # for Tigray
channel_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_channel.csv" # for Channel
moria_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_moria.csv" # for Moria
all_url = r"Dropbox (CBS)/Master thesis data/df_tweets.csv" # for all

def read_dataframe(data_url):
    directory_path = os.getcwd() + "/../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print('total tweets: ', event_df.shape[0])
    return event_df

#beirut_df = read_dataframe(beirut_url)
tigray_df = read_dataframe(tigray_url)
channel_df = read_dataframe(channel_url)
moria_df = read_dataframe(moria_url)
#all_df = read_dataframe(all_url)



total tweets:  180779
total tweets:  604536
total tweets:  92806


## 2. We preprocess the data using the function from self-written preprocessing module

In [None]:
# see the description of the method in the preprocessing module
beirut_tweets = preprocessing.preprocess_tweets(beirut_df['text'])
tigray_tweets = preprocessing.preprocess_tweets(tigray_df['text'])
channel_tweets = preprocessing.preprocess_tweets(channel_df['text'])
moria_tweets = preprocessing.preprocess_tweets(moria_df['text'])
#all_tweets = preprocessing.preprocess_tweets(all_df['text'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|███████████████████████████████████████████████████████████████████████████| 24511/24511 [02:13<00:00, 184.07it/s]
 52%|███████████████████████████████████████                                    | 94204/180779 [37:38<27:19, 52.80it/s]

## 3. We instantiate stanza english language module

In [4]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#directory = '../../export CORENLP_HOME=' ##ADD DIRECTORY HERE
#stanza.install_corenlp()

#import os
#os.environ["CORENLP_HOME"] = directory

In [4]:
en_nlp = stanza.Pipeline("en", ner_batch_size=8192)

2021-03-26 08:26:47 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-26 08:26:47 INFO: Use device: cpu
2021-03-26 08:26:47 INFO: Loading: tokenize
2021-03-26 08:26:47 INFO: Loading: pos
2021-03-26 08:26:47 INFO: Loading: lemma
2021-03-26 08:26:47 INFO: Loading: depparse
2021-03-26 08:26:48 INFO: Loading: sentiment
2021-03-26 08:26:48 INFO: Loading: ner
2021-03-26 08:26:49 INFO: Done loading processors!


In [5]:

import pickle
import os

def pickle_files(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)

In [None]:

# extract noun_phrase and coreference candidates, as well as tag tweets using cand_ex.candidate_identification function

# to show progress of apply function
tqdm.pandas()

moria_tweets = moria_df['text'].progress_apply(preprocessing.preprocess_tweets)
moria_np_list, moria_tagged_tweets = cand_ex.candidate_identification(moria_tweets,en_nlp,10000)

pickle_files('moria_np_list',moria_np_list)
pickle_files('moria_tagged_tweets',moria_tagged_tweets)

tigray_tweets = tigray_df['text'].progress_apply(preprocessing.preprocess_tweets)
tigray_np_list, tigray_tagged_tweets = cand_ex.candidate_identification(tigray_tweets,en_nlp,10000)

pickle_files('tigray_np_list',tigray_np_list)
pickle_files('tigray_tagged_tweets',tigray_tagged_tweets)
                                                                    
                                                                    
channel_tweets = channel_df['text'].progress_apply(preprocessing.preprocess_tweets)
channel_np_list, channel_tagged_tweets = cand_ex.candidate_identification(channel_tweets,en_nlp,10000)

pickle_files('channel_np_list',channel_np_list)
pickle_files('channel_tagged_tweets',channel_tagged_tweets)



  from pandas import Panel
100%|██████████████████████████████████████████████████████████████████████████| 92806/92806 [00:50<00:00, 1848.79it/s]
0it [00:00, ?it/s]

annotating the tweet corpus...


## 4. We apply stanza module on the tweets to get NER and POS tags. We do it in batches to speed things up.

In [276]:
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

#~~~ check how much faster it is with "\n\n"
# the sampled_df series should be converted to list so we avoid slowing things down with iteration
all_tweets_list = list(sampled_df) 

tagged_tweets = [tweet for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=3000))] # Default batch size is 32

tagged_tweets[0].text

200it [00:44,  4.53it/s]


'I read all your books professor, and even waited on you when you came to Ilili, this tweet gave so much confidence because we did this . And we also did Jerusalem and Beirut next week in a non a7zab related kitchen skin inthe game'

In [277]:
# get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
tweet_tags = cand_prep.get_tweet_tags(tagged_tweets) 
#print(tweet_tags)

#get a set of all NER tags existing in corpus - check which one are found in corpus

tweet_tags_set = set()
for tweet in tweet_tags:
    tweet_ner_tags = set(tweet[1].values())
    tweet_tags_set.update(tweet_ner_tags)

print(tweet_tags_set)

100%|█████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 19994.78it/s]

{'FAC', 'CARDINAL', 'QUANTITY', 'ORG', 'TIME', 'ORDINAL', 'LANGUAGE', 'NORP', 'LOC', 'LAW', 'GPE', 'EVENT', 'DATE', 'WORK_OF_ART', 'PERSON'}





## 5. As initial WCL candidates, we extract noun phrases (NPs) and coreference chains.

## We do so using CoreNLPClient wrapper

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media) - any new spotted should be added in preprocessing file's '__remove_tweet_signatures__' function
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE? - check
* play around with candidate types
* optimize code and make it neater



In [4]:
import candidate_extraction as cand_ex

noun_phrase_list, corefs_list = cand_ex.extract_candidates(sampled_df)

2021-03-22 21:40:04 INFO: Writing properties to tmp file: corenlp_server-dac4812596094d5d.props
2021-03-22 21:40:04 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-dac4812596094d5d.props -annotators tokenize,ssplit,pos,lemma,parse,coref,ner,udfeats,depparse -preload -outputFormat serialized
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

extracting noun phrases...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

extracting coreference chains...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.03s/it]


In [279]:
#Store the noun phrases in the pickle file
import pickle

with open('beirut_nps', 'wb') as fp:
    pickle.dump(noun_phrase_list, fp)

with open('beirut_crfs','wb') as fp:
    pickle.dump(corefs_list, fp)

In [59]:
#removing the spaces around dashes, slashes and apostrophes should increase stanza's ability to parse sentence correctly
noun_phrase_list = [[np.replace(' - ','-') for np in nps] for nps in noun_phrase_list]
noun_phrase_list = [[np.replace(' / ','/') for np in nps] for nps in noun_phrase_list]
noun_phrase_list = [[np.replace(" ' ","'") for np in nps] for nps in noun_phrase_list]


In [286]:
# Load NPs and corefs from pickle file
import pickle

with open(r"beirut_nps", "rb") as input_file:
    noun_phrase_list = pickle.load(input_file)

with open(r"beirut_nps", "rb") as input_file:
    corefs_list = pickle.load(input_file)


## 6. We keep only NPs shorter than 20 words and remove children of parent NPs 

In [12]:
def get_cand_len(cand_list):
    # calculates number of candidates in the corpus
    sum_len = 0
    for tweet_cands in cand_list:
        sum_len += len(tweet_cands)
    return sum_len

print(get_cand_len(noun_phrase_list))

def remove_long_nps(noun_phrase_list):
    for tweet_id in range(len(noun_phrase_list)):
        #print(noun_phrase_list[tweet_id])
        #reverse the list of tweets nps so we avoid moving indexes and leaving out some phrases 
        for noun_p in reversed(noun_phrase_list[tweet_id]):
            i = noun_phrase_list[tweet_id].index(noun_p)
            np_split = noun_p.split()
            if len(np_split) > 19:
                noun_phrase_list[tweet_id].remove(noun_phrase_list[tweet_id][i])
    return noun_phrase_list


noun_phrase_list = remove_long_nps(noun_phrase_list)
print(get_cand_len(noun_phrase_list))

132
132


In [15]:
def remove_child_nps(noun_phrase_list):
    # remove the child NPs and keep only parents, run until the sum_len stops decreasing
    after_removal_len = 0
    while after_removal_len != get_cand_len(noun_phrase_list):
        after_removal_len = get_cand_len(noun_phrase_list)
        for tweet_nps in noun_phrase_list:
            for noun_p in range(len(tweet_nps)):
                try:
                    #if the subsequent noun_p (child np) is contained in the current one, remove the child np
                    if tweet_nps[noun_p].find(tweet_nps[noun_p+1]) != -1:
                        tweet_nps.remove(tweet_nps[noun_p+1])
                        
                #ignore the error caused with end of the list
                except IndexError:
                    pass
    return noun_phrase_list

def remove_mention(noun_phrase_list):
    return [[np.replace('@','') for np in nps ] for nps in noun_phrase_list ]    

print(get_cand_len(noun_phrase_list))

noun_phrase_list = remove_child_nps(noun_phrase_list)
noun_phrase_list = remove_mention(noun_phrase_list)

print(get_cand_len(noun_phrase_list))

73
73


## 7. We get the heads of noun phrases (in batches)

In [283]:

#tag all tweets and save them in a list    
batched_np_list = cand_prep.prep_candlist_for_batching(noun_phrase_list)
#print(batched_np_list)


tagged_np_cands = [tagged_cand for tagged_cand in batch(batched_np_list, en_nlp, batch_size=6000)]# Default batch size is 32
tagged_np_cands


['i\\n\\nall your books\\n\\nprofessor\\n\\nyou\\n\\nilili\\n\\nthis tweet\\n\\nso much confidence\\n\\nwe\\n\\nthis\\n\\nwe\\n\\njerusalem and beirut\\n\\na non a7zab related kitchen skin inthe game', 'my grandfather\\n\\na lebanese immigrant\\n\\nthe second wave\\n\\nnormandy\\n\\nim\\n\\nhis blood pumping under my skin', 'migrant _ workers _ lives _ matter lets not forget lebanese racism\\n\\nkafala', 'more than a dozen refugees in eastern lebanon\\n\\nthe coronavirus, which shows the vulnerability of refugees in the country, many of whom live in crowded, poor conditions', 'a lovely way\\n\\nthe week\\n\\ninspiring art\\n\\nthe beautiful creations\\n\\nyoung people\\n\\nlebanon who are proving in their own way that everyone counts in the fight against covid 19, including refugees\\n\\nyou\\n\\nyours', "we\\n\\nthat people\\n\\npeople, just as modern - day lebanese\\n\\nphoenicians and modern - day iranians\\n\\npersians\\n\\nthe only difference being we\\n\\nour land for generations

In [284]:
tagged_np_cands


[[
   [
     {
       "id": 1,
       "text": "i\\n\\nall",
       "lemma": "i\\n\\nall",
       "upos": "VERB",
       "xpos": "VB",
       "feats": "Mood=Imp|VerbForm=Fin",
       "head": 0,
       "deprel": "root",
       "misc": "start_char=0|end_char=8",
       "ner": "O"
     },
     {
       "id": 2,
       "text": "your",
       "lemma": "you",
       "upos": "PRON",
       "xpos": "PRP$",
       "feats": "Person=2|Poss=Yes|PronType=Prs",
       "head": 3,
       "deprel": "nmod:poss",
       "misc": "start_char=9|end_char=13",
       "ner": "O"
     },
     {
       "id": 3,
       "text": "books\\n\\nprofessor\\n\\nyou\\n\\nilili\\n\\nthis",
       "lemma": "books\\n\\nprofessor\\n\\nyou\\n\\nilili\\n\\nthis",
       "upos": "NOUN",
       "xpos": "NN",
       "feats": "Number=Sing",
       "head": 1,
       "deprel": "obj",
       "misc": "start_char=14|end_char=56",
       "ner": "O"
     }
   ],
   [
     {
       "id": 1,
       "text": "tweet\\n\\nso",
       "lemma": "t

In [285]:

def get_cand_heads(tagged_cands):
    # each candidate will be stored as [(set_of_phrases_heads), cand_rep_head] 
    return [[set([cand.words[word.head-1].text for word in cand.words]), 
             [word.text for word in cand.words if word.head == 0]] #the root of NP has value 0 
             for cand in tagged_cands.sentences]


np_cand_heads = [get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
print(np_cand_heads)

#[print(tagged_np_cand.text) for tagged_np_cand in tagged_np_cands]     

[[[{'i\\n\\nall', 'books\\n\\nprofessor\\n\\nyou\\n\\nilili\\n\\nthis'}, ['i\\n\\nall']], [{'tweet\\n\\nso'}, ['tweet\\n\\nso']], [{'skin', 'game', 'non', 'a7zab', 'confidence\\n\\nwe\\n\\nthis\\n\\nwe\\n\\njerusalem'}, ['confidence\\n\\nwe\\n\\nthis\\n\\nwe\\n\\njerusalem']]], [[{'blood', 'pumping', 'skin'}, ['blood']]], [[{'racism\\n\\nkafala', 'forget', 'workers'}, ['workers']]], [[{'shows', 'live', 'vulnerability', 'country', 'whom', 'many', 'conditions', 'refugees', 'coronavirus'}, ['live']]], [[{'covid', 'way', 'fight', 'creations', 'refugees\\n\\nyou\\n\\nyours', 'proving', 'counts', 'week\\n\\ninspiring', 'people\\n\\nlebanon', 'way\\n\\nthe'}, ['way\\n\\nthe']]], [[{'generations\\n\\nthat', 'land', 'people\\n\\npeople', 'iota', 'iranians\\n\\npersians', 'day', 'difference', 'we\\n\\nour', 'lebanese\\n\\nphoenicians', 'one', 'doesn'}, ['doesn']]], [[{'war', 'person', 'lebanon\\n\\nthe', 'surprised', '0', 'prescence', 'migrant', 'australian', 'country', 'comment\\n\\n', 'reporte

## 8. We define candidate types 

In [77]:
#dictionary to assign candidate types based on named entities and part of speech tags
#the key tuple consists of (isNE, lexicographer type, plural)
cand_types_dict = {(True,'PERSON',None):'person-ne',
              (True,'NORP',None):'person-ne',
              (True,'PERSON','plural'):'person-nes',
              (True,'NORP','plural'):'person-nes',
              (False,'PERSON',None):'person-nn',
              (False,'PERSON','plural'):'person-nns',
              (True,'ORG',None):'group-ne',
              (True,'FAC',None):'group-ne',
              (False,'ORG',None):'group',
              (True,'LOC',None):'loc-ne',
              (True,'GPE',None):'loc-ne',
              (False, 'LOC',None):'loc'
            }


In [78]:
from collections import Counter

#test syntactic categories on all heads of noun phrases
list_of_cand_types = [cand_prep.get_synt_category(cand[1][0]) for tweet_cands in np_cand_heads for cand in tweet_cands]
    
count_types = Counter(list_of_cand_types)
print(count_types)

Counter({None: 1064, 'PERSON': 162, 'ORG': 134, 'LOC': 74})


## 9. We assign candidate types to noun phrase candidates

In [79]:
# label the noun phrases with the candidate types
print(noun_phrase_list)

np_and_cand_list = cand_prep.get_cand_type(noun_phrase_list,np_cand_heads, tweet_tags, cand_types_dict)
print(np_and_cand_list)

 

  8%|██████                                                                          | 15/200 [00:00<00:01, 141.51it/s]

[['it', 'asylum in greece, mac, ser or cro or bul', '3 eu members', 'hu', 'they', 'help', 'the first safe country as asylum seeker', 'the 5', 'no law', 'you', 'it'], ['isis refuge .', "wouldn' t", 'turkey', 'i', "you' re likely to get your head", 'the beaten track', 'christian greece', 'a similar climate', 'islamic turkey'], ['greece', 'refugee overcrowding', 'un'], ['1 / 3', 'law', 'the idle stuff', 'all rights of the turkish minority in greece', 'the law', 'refugees', 'the eu', 'the law', 'western states', 'the states', 'the middle east and africa', 'underground resources', 'the law'], ['the asylum seekers', 'hu borders', 'they', 'a long journey', 'greece', 'they', 'help', 'they', 'multiple more countries'], ['i', 'photo gr apy', 'aesthetic', 'news', 'it', 'a potential embellishments of hard facts and harsh reality', 'i', 'raw realism', 'i', 'this boundary using filmicpro first light this week in lesbos infamous moria migrants camp', 'a big question4me'], ['manfredweber', 'it', 'euro

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 126.18it/s]

[[('it', 'it', {'it'}, 'misc'), ('asylum in greece, mac, ser or cro or bul', 'asylum', {'mac', 'asylum', 'greece', 'bul', 'cro', 'ser'}, 'group'), ('3 eu members', 'members', {'members'}, 'person-nes'), ('hu', 'hu', {'hu'}, 'misc'), ('they', 'they', {'they'}, 'misc'), ('help', 'help', {'help'}, 'person-nn'), ('the first safe country as asylum seeker', 'country', {'country', 'seeker'}, 'group-ne'), ('the 5', '5', {'5'}, 'misc'), ('no law', 'law', {'law'}, 'misc'), ('you', 'you', {'you'}, 'misc'), ('it', 'it', {'it'}, 'misc')], [('isis refuge .', 'refuge', {'refuge', '.'}, 'loc'), ("wouldn' t", 't', {'t'}, 'misc'), ('turkey', 'turkey', {'turkey'}, 'misc'), ('i', 'i', {'i'}, 'misc'), ("you' re likely to get your head", 'likely', {'likely', 'get', 'head'}, 'misc'), ('the beaten track', 'track', {'track'}, 'loc'), ('christian greece', 'greece', {'greece'}, 'misc'), ('a similar climate', 'climate', {'climate'}, 'misc'), ('islamic turkey', 'turkey', {'turkey'}, 'misc')], [('greece', 'greece',




In [80]:
from collections import Counter

# couunt occurence of each candidate type
cand_type_count = [np[3] for nps in np_and_cand_list if nps != None for np in nps ]
    
counts = Counter(cand_type_count)
print(counts)
len(cand_type_count)

Counter({'misc': 1055, 'group': 127, 'person-nns': 89, 'loc': 67, 'person-nn': 60, 'person-nes': 16, 'loc-ne': 10, 'group-ne': 8, 'person-ne': 2})


1434

## 10. We get coreference chains candidates from the tweet corpus

In [22]:
from stanza.server import CoreNLPClient
dict_of_tweet_corefs = {}
#corefs = []
with CoreNLPClient(properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}, memory='16G') as client:
    for tweet_index in tqdm(range(len(sampled_df))):
        tweet_corefs=[]
        #print(f'Coreferences for the tweet {list(event_df["Tweet Raw"]).index(tweet)} are:')
        for chain in cand_prep.get_coref_chain(sampled_df[tweet_index],client):
            tweet_corefs.append(chain)
            #print(' <-> '.join(chain),'\n')
        #corefs.append(tweet_corefs)
        dict_of_tweet_corefs[tweet_index] = tweet_corefs

dict_of_tweet_corefs

2021-03-21 15:36:21 INFO: Writing properties to tmp file: corenlp_server-f5a2a01a25ca4ce7.props
2021-03-21 15:36:21 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-f5a2a01a25ca4ce7.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:32<00:00,  1.32it/s]


{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['I', 'I', 'I'], 0), (['aesthetic', 'it'], 0)],
 6: [],
 7: [],
 8: [(['Birmingham', 'birmingham'], 0)],
 9: [(['you', 'you'], 0), (['Turkey', 'it', 'it', 'Turkey'], 3)],
 10: [(['The Bessudos', 'Isaac Bessudos'], 1),
  (['Raquel Bessudo', 'Raquel'], 0)],
 11: [(['them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['UN Refugee Agency', 'Anadolu Agency'], 0)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'we'], 0)],
 24: [],
 25: [],
 26: [],
 27: [(['they', 'Enterprise Greece', 'their', 'they', 't

In [23]:
with open('sample_crfs', 'wb') as fp:
    pickle.dump(dict_of_tweet_corefs, fp)

dict_of_tweet_corefs

{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['I', 'I', 'I'], 0), (['aesthetic', 'it'], 0)],
 6: [],
 7: [],
 8: [(['Birmingham', 'birmingham'], 0)],
 9: [(['you', 'you'], 0), (['Turkey', 'it', 'it', 'Turkey'], 3)],
 10: [(['The Bessudos', 'Isaac Bessudos'], 1),
  (['Raquel Bessudo', 'Raquel'], 0)],
 11: [(['them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['UN Refugee Agency', 'Anadolu Agency'], 0)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'we'], 0)],
 24: [],
 25: [],
 26: [],
 27: [(['they', 'Enterprise Greece', 'their', 'they', 't

## 11. We determine candidate's type for representative mentions of coref candidates (in batches)

In [81]:
# Load NPs from pickle file
import pickle

with open(r"sample_crfs", "rb") as input_file:
    dict_of_tweet_corefs = pickle.load(input_file)



In [82]:
corefs_list = []

#pick out only the representative mention as the candidate's rep. phrase
for tweet_corefs in dict_of_tweet_corefs:
        tw_corefs = [coref[0][coref[1]] for coref in dict_of_tweet_corefs[tweet_corefs]] 
        # empty list would cause problems in the following steps, that is why we append 'no_candidate' to empty lists
        corefs_list.append(tw_corefs) if len(tw_corefs) != 0 else corefs_list.append(['no_candidate'])

corefs_list  = [[crf.replace('@','').lower() for crf in crfs ] for crfs in corefs_list ]     
corefs_list

[['no law'],
 ['you'],
 ['no_candidate'],
 ['the law'],
 ['the asylum seekers'],
 ['i', 'aesthetic'],
 ['no_candidate'],
 ['no_candidate'],
 ['birmingham'],
 ['you', 'turkey'],
 ['isaac bessudos', 'raquel bessudo'],
 ['them'],
 ['you', 'it', 'they'],
 ['no_candidate'],
 ['greece'],
 ['no_candidate'],
 ['no_candidate'],
 ['both men'],
 ['10 refugees , including three children with disabilities', 'greece'],
 ['un refugee agency'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['we'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['enterprise greece'],
 ['no_candidate'],
 ['they'],
 ['he'],
 ['erdoan'],
 ['uk', 'uk size'],
 ['no_candidate'],
 ['the infected person , a 40 year old man', 'the camp', 'moria', "didn '"],
 ['no_candidate'],
 ['greece', 'protection & operations , gilliantriggs & raoufmazou'],
 ['greece', 'i', 'you'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['you'],
 ['turkey'],
 ['i'],
 ['no_candida

In [83]:
#removing the spaces around dashes, slashes and apostrophes should increase stanza's ability to parse sentence correctly
corefs_list = [[crf.replace(' - ','-') for crf in corefs] for corefs in corefs_list]
corefs_list = [[crf.replace(' / ','/') for crf in corefs] for corefs in corefs_list]
corefs_list = [[crf.replace(" ' ","'") for crf in corefs] for corefs in corefs_list]

In [84]:
#tag all tweets and save them in a list    
tagged_coref_cands = [] 
batched_coref_list = prep_candlist_for_batching(corefs_list)
#print(batched_coref_list)


tagged_coref_cands = [tagged_cand for tagged_cand in batch(batched_coref_list, en_nlp, batch_size=6000)] # Default batch size is 32
#print(tagged_coref_cands)
        
coref_cand_heads = [get_cand_heads(tweet_cands) for tweet_cands in tagged_coref_cands]
coref_cand_heads

[[[{'law'}, ['law']]],
 [[{'you'}, ['you']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'law'}, ['law']]],
 [[{'seekers'}, ['seekers']]],
 [[{'i'}, ['i']], [{'aesthetic'}, ['aesthetic']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'birmingham'}, ['birmingham']]],
 [[{'you'}, ['you']], [{'turkey'}, ['turkey']]],
 [[{'bessudos', 'isaac'}, ['isaac']], [{'bessudo', 'raquel'}, ['raquel']]],
 [[{'them'}, ['them']]],
 [[{'you'}, ['you']], [{'it'}, ['it']], [{'they'}, ['they']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'greece'}, ['greece']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'men'}, ['men']]],
 [[{'children', 'disabilities', 'refugees'}, ['refugees']],
  [{'greece'}, ['greece']]],
 [[{'agency'}, ['agency']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'we'}, ['we']]],
 [[{'no_candidate'}, ['no_candidate']]]

In [85]:
coref_and_cand_list = cand_prep.get_cand_type(corefs_list, coref_cand_heads, tweet_tags, cand_types_dict, corefs=True)


print(coref_and_cand_list) 

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 917.46it/s]

[[('no law', 'law', {'law'}, 'misc')], [('you', 'you', {'you'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('the law', 'law', {'law'}, 'misc')], [('the asylum seekers', 'seekers', {'seekers'}, 'person-nns')], [('i', 'i', {'i'}, 'person-nn'), ('aesthetic', 'aesthetic', {'aesthetic'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('birmingham', 'birmingham', {'birmingham'}, 'misc')], [('you', 'you', {'you'}, 'misc'), ('turkey', 'turkey', {'turkey'}, 'misc')], [('isaac bessudos', 'isaac', {'bessudos', 'isaac'}, 'misc'), ('raquel bessudo', 'raquel', {'bessudo', 'raquel'}, 'misc')], [('them', 'them', {'them'}, 'misc')], [('you', 'you', {'you'}, 'misc'), ('it', 'it', {'it'}, 'misc'), ('they', 'they', {'they'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('greece', 'greece', {'greece'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'},




## 12. We combine the candidate lists for candidate merging

We organize candidates in a list sorted by their number of phrases

In [86]:
#concatenate corefs and noun phrase lists
nps_cands = [cand for cands in np_and_cand_list for cand in cands]
crf_cands = [cand for cands in coref_and_cand_list for cand in cands]
#candidate_list = coref_and_cand_list + np_and_cand_list
#print(f'Len = {len(candidate_list)} should be 2x amount of tweets')
print(len(nps_cands), len(crf_cands))
#unpack list of lists into one list
candidate_list = nps_cands + crf_cands
print(f'The amount of all candidates is {len(candidate_list)}')


1434 288
The amount of all candidates is 1722


In [87]:
nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]
crf_tagged = [sent for tagged_cand in tagged_coref_cands for sent in tagged_cand.sentences ]
print(len(nps_tagged), len(crf_tagged))
all_cands_tagged = nps_tagged + crf_tagged


1434 288


In [88]:
print(len(candidate_list))
print(len(all_cands_tagged))

1722
1722


In [89]:
cand_df = pd.DataFrame(
    {'candidates': candidate_list,
     'cand_tags': all_cands_tagged
    })

cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))
cand_df.columns = cand_df.columns.str.strip()
cand_df

Unnamed: 0,candidates,cand_tags,cand_text,cand_len
0,"(it, it, {it}, misc)","[\n {\n ""id"": 1,\n ""text"": ""it"",\n ""...",it,1
1,"(asylum in greece, mac, ser or cro or bul, asy...","[\n {\n ""id"": 1,\n ""text"": ""asylum"",\n ...","asylum in greece, mac, ser or cro or bul",9
2,"(3 eu members, members, {members}, person-nes)","[\n {\n ""id"": 1,\n ""text"": ""3"",\n ""l...",3 eu members,3
3,"(hu, hu, {hu}, misc)","[\n {\n ""id"": 1,\n ""text"": ""hu"",\n ""...",hu,1
4,"(they, they, {they}, misc)","[\n {\n ""id"": 1,\n ""text"": ""they"",\n ...",they,1
...,...,...,...,...
1717,"(no_candidate, no_candidate, {no_candidate}, m...","[\n {\n ""id"": 1,\n ""text"": ""no_candidat...",no_candidate,1
1718,"(omfg, omfg, {omfg}, misc)","[\n {\n ""id"": 1,\n ""text"": ""omfg"",\n ...",omfg,1
1719,"(italian immigrants in greece, immigrants, {gr...","[\n {\n ""id"": 1,\n ""text"": ""italian"",\n...",italian immigrants in greece,4
1720,"(no_candidate, no_candidate, {no_candidate}, m...","[\n {\n ""id"": 1,\n ""text"": ""no_candidat...",no_candidate,1


In [92]:
# we sort the candidates by their length

cand_df.sort_values('cand_len', ascending=False,inplace=True)

#cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

cand_df.reset_index(drop=True, inplace = True)
cand_df
#all_cands_tagged.sort(reverse=True,key=get_cand_len(candidate_list))

Unnamed: 0,candidates,cand_tags,cand_text,cand_len
0,(conditions + reduce overcrowding at the recep...,"[\n {\n ""id"": 1,\n ""text"": ""conditions""...",conditions + reduce overcrowding at the recept...,19
1,"(access to territory & asylum, living conditio...","[\n {\n ""id"": 1,\n ""text"": ""access"",\n ...","access to territory & asylum, living condition...",19
2,"(population & children for 39%, of whom more t...","[\n {\n ""id"": 1,\n ""text"": ""population""...","population & children for 39%, of whom more th...",19
3,(economic migrants who cross developed nations...,"[\n {\n ""id"": 1,\n ""text"": ""economic"",\...",economic migrants who cross developed nations ...,19
4,(morocco tunisia libya greece turkey each one ...,"[\n {\n ""id"": 1,\n ""text"": ""morocco"",\n...",morocco tunisia libya greece turkey each one o...,19
...,...,...,...,...
1717,"(who, who, {who}, group)","[\n {\n ""id"": 1,\n ""text"": ""who"",\n ...",who,1
1718,"(asylum, asylum, {asylum}, group)","[\n {\n ""id"": 1,\n ""text"": ""asylum"",\n ...",asylum,1
1719,"(you, you, {you}, misc)","[\n {\n ""id"": 1,\n ""text"": ""you"",\n ...",you,1
1720,"(you, you, {you}, misc)","[\n {\n ""id"": 1,\n ""text"": ""you"",\n ...",you,1


In [93]:
#remove dummy candidates that were used to avoid errors

print(len(cand_df))
cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
cand_df = cand_df[cand_df.cand_text != 'no_candidate']
len(cand_df)
cand_df.reset_index(drop=True,inplace=True)


1722


### First merging step

In [95]:
#
# THIS IS THE FIRST MERGING STEP
#
        
def merging_step1(candidate_list):
    """
    In the first merging step, we merge two candidates if the head of each of their representative phrase 
     is identical by string comparison.
    """
    indices_to_remove = set()
    for longer_cand in tqdm(range(len(candidate_list))):     
        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list[longer_cand][1]} checking the index {candidate_list[cand][1]}')

            #performing merging only for NE candidates of the same type
            if 'ne' in candidate_list[longer_cand][3]:
                #mark for merging if the head and its head's cand type is the same for 2 candidates

                if candidate_list[longer_cand][1] == candidate_list[cand][1] and candidate_list[longer_cand][3] == candidate_list[cand][3]:
                    print(f'matching "{longer_cand}" with "{cand}"')
                    #print(f'{candidate_list[longer_cand][1]} ===== {candidate_list[cand][1]}')
                    indices_to_remove.add(cand)
    return indices_to_remove

def merge_indices(cand_df,indices_to_remove):                

    print(f'Initial amount of candidates: {len(cand_df)}')                
    #print(len(sorted(indices_to_remove)))

    for index in reversed(sorted(indices_to_remove)):
        cand_df.drop([index],inplace=True)
        
    cand_df.reset_index(drop=True,inplace=True)
    print(f'Amount of candidates: {len(cand_df)}, after removing {len(sorted(indices_to_remove))} indices') 
    return cand_df


cand_df = merge_indices(cand_df, merging_step1(cand_df['candidates']))

100%|█████████████████████████████████████████████████████████████████████████████| 1613/1613 [00:04<00:00, 328.16it/s]

Initial amount of candidates: 1613
Amount of candidates: 1613, after removing 0 indices





In [485]:
cand_df['candidates'][250][0]

'13,000 migrants & asylum seekers'

### Second merging step

We merge 2 candidates if their sets of phrases heads are semantically similar

In [96]:
import gensim

#load the GoogleNews 300dim model (fix path)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [97]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import numpy as np

#adjust for sets of phrases in the candidate
def merging_step2(candidate_list):
    
    indices_to_remove = set()
    for longer_cand in tqdm(range(len(candidate_list))):     
        i = candidate_list[longer_cand]
        long_cand_mean_vec = phrase_heads_avg_vector(candidate_list[longer_cand][2])
        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list.index(longer_cand)} checking the index {candidate_list.index(cand)}')
            #if candidate_list[longer_cand][1] == candidate_list[cand][1]:
                #print(f'matching "{longer_cand}" with "{cand}"')
            cand_mean_vec = phrase_heads_avg_vector(candidate_list[cand][2])

            if candidate_list[longer_cand][3] == candidate_list[cand][3]:
                try:
                    #print(1-cosine(long_cand_mean_vec,cand_mean_vec))
                    #print(long_cand_mean_vec.reshape(-1,1).shape, cand_mean_vec.reshape(1,-1).shape)
                    if 1-cosine(long_cand_mean_vec.reshape(-1,1),cand_mean_vec.reshape(-1,1)) >= 0.5:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(cand)
                except AttributeError:
                    pass

            else:
                try:
                    if 1-cosine(long_cand_mean_vec.reshape(-1,1),cand_mean_vec.reshape(-1,1)) >= 0.7:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(cand)
                        
                except AttributeError:
                    pass


    return indices_to_remove

def phrase_heads_avg_vector(phrase_set):
    phrase_head_vectors = []
    for phrase_head in phrase_set:    
        try:
            phrase_head_vectors.append(model[phrase_head])
        except KeyError:
            pass
    #phrase_head_vectors = [model[phrase_head] for phrase_head in phrase_set]
    if len(phrase_head_vectors) != 0:
        return np.mean(phrase_head_vectors,axis=0)
    else: 
        return np.NaN

        

cand_df = merge_indices(cand_df, merging_step2(cand_df['candidates']))


100%|██████████████████████████████████████████████████████████████████████████████| 1613/1613 [02:35<00:00, 10.36it/s]


Initial amount of candidates: 1613
Amount of candidates: 374, after removing 1239 indices


In [551]:
cand_df['candidates'][63][2]

{'camps', 'conditions', 'improvement', 'refugees'}

## Third merging step representative labeling

currently working on average cosine similarity of each phrase in the candidate - maybe not optimal, maybe it will be better with a different threshold

In [98]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics.pairwise import cosine_similarity

def merging_step3(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:  
        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            #head-1 because the pointer to head does not use 0 index
            if (pos == 'JJ' or pos=='VBN') and 'NN' in np_heads_pos[head-1][2]:
                cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
        phrases.append(cand_np_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):

                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))

                                
                    if np.mean(sim_matrix) > 0.3:
                        #print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove
                


def phrases_vectors(cand_phrases):
    
#for cand_phrases in phrases:
    #print(cand_phrases)
    cand_phrase_vectors = []
    for phrase in cand_phrases:
        try:
            cand_phrase_vectors.append(model[phrase])
            #print(f'for existing phrase "{phrase}" the vector is {model[phrase][0]}')
        except KeyError:
            phrase_words = phrase.split('_')
            #print(model[phrase_words[1]])
            try:
                phrase_vectors = [model[phrase_word] for phrase_word in phrase_words]
                #print(f'for phrase "{phrase}" avg vector is "{sum(phrase_vectors)/len(phrase_vectors)}') 
                cand_phrase_vectors.append(sum(phrase_vectors)/len(phrase_vectors))
            except KeyError:
                pass
    #print(len(cand_phrase_vectors))
    return cand_phrase_vectors
    
    
cand_df = merge_indices(cand_df, merging_step3(cand_df))
#print(indices_to_remove)

Initial amount of candidates: 374
Amount of candidates: 297, after removing 77 indices


In [568]:
for cand in cand_df['cand_text']:
    print(cand)
    

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just 

### Merging step 4

In [99]:
# missing the second method - we check for the lexical identity of specific stems in multiple candidates.

def merging_step4(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:

        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]

        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            i = np_heads_pos.index((word, head, pos))
            #print(np_heads_pos)
            #print(np_heads_pos[i])
            #print(np_heads_pos[head-1])
            #'NN' in np_heads_pos[head-1][2] and
            try:
                if 'NN' in pos and 'NN' in np_heads_pos[i+1][2] : 
                    cand_np_phrases.append(f'{word}_{np_heads_pos[i+1][0]}')
                if 'NN' in pos and 'NN' in np_heads_pos[head-1][2]:
                    cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
            except IndexError:
                pass
        phrases.append(cand_np_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):
                            #print(cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)))
                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))
                            """if cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.4:                
                                sim_matrix[i][j] = 2
                            elif cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.2:
                                sim_matrix[i][j] = 1
                            else:
                                sim_matrix[i][j] = 0"""

                                
                    if np.mean(sim_matrix) > 0.6:
                        print(f'{longer_cand} and {cand} are {np.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove

cand_df = merge_indices(cand_df, merging_step4(cand_df))
#print(merging_step4(candidate_list))

2 and 34 are 0.6444906344016393 similar
2 and 129 are 0.6352750410636266 similar
2 and 142 are 0.6738031804561615 similar
29 and 119 are 0.6717105408509573 similar
34 and 129 are 0.6940330862998962 similar
34 and 142 are 0.6754783987998962 similar
49 and 104 are 0.6065908074378967 similar
129 and 142 are 0.6704616546630859 similar
Initial amount of candidates: 297
Amount of candidates: 292, after removing 5 indices


In [575]:
for cand in cand_df['cand_text']:
    print(cand)

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just 

### Merging step 5


In [1316]:
for i in event_df['Tweet Raw'][:100]:
    print(i)

@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.
@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey
Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo
@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?
@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journey through Greece,

## Frame identification

In [143]:
"""frame_properties = {'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']}""" #from Hamborg's paper

# from paper Shifting the refugee narratives? by Greussing & Boomgaarden (2015)
frame_properties = {'settlement':['settlement','accomodation','permanent','temporary','barracks','accommodated','tent','camp', 'shelter'],
                   'reception':['quota', 'distribution', 'limit', 'selection','reception','together','asylum','receive'],
                    'security':['security', 'border','crossing','fence','control','flow'],
                    'criminality':['officer','terror','suspicion','crime','offense','police','trafficking','suspect'],
                    'economisation':['euro','economic','million','thousand','cost','money'],
                    'humanitarian':['humane','voluntary','help','support','aid','care','solidarity'],
                    'victimization':['islamic','fight','victim','war','dead','rescued','state'],
                    'integration': ['labour','employed','unemployed','integration','positive'],
                    
                    #from hamborg
                    'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']
                   
                   }



In [120]:
import conceptnet_lite as cn
import gensim.downloader as api


manual_cands = ['refugee','muslim','migrant','christian','immigrant','israel','arab','syrian']

# to run on the server we should use larger model according to the paper - "conceptnet-numberbatch-17-06-300"
#model = api.load("glove-twitter-200")


@Kkkk09240868
@0khalodi0
@POTUS
Also,
Eedogan
has
been
documented
using
ISIS
militants
aka
terrorists,
he
played
the
immigrants
card
as
a
way
to
political
threat
to
Europe,
he
pushed
immigrants
to
greece
and
europe
for
their
own
death,
after
letting
them
homeless
for
years
As
I
said,
All
Muslims
are
guilty
@Nionios1908
@kitsikis
Greece
unable
to
cope
with
60
thousand
refugees
with
a
population
of
10
million.
and
the
border
next
to
it
wants
a
country
of
83
million
to
be
torn
apart.
God,
I've
never
seen
a
fool
like
you
together
in
my
life.
the
problem
is,
you're
all
idiots.😂
@hama_ashad
@realDonaldTrump
and
from
there
you
can
try
to
pass
Europe
especially
Greece
there
are
lots
of
boats
you
know
but
you
have
to
know
you
might
die
from
all
of
that
this
refugge
thing
is
very
fishy
@hama_ashad
@realDonaldTrump
I
dont
know.
I
dont
live
in
Iraq.
In
Europe
many
migrants
walked
from
Greece
to
Norway/sweden/Germany.
I
guess
you
can
do
the
same.
Just
from
Iraq
to
Greece.
The
life
as
migrant
is
awf

dependend
of
EU
money.
Wothout
the
EU
greek
people
would
seek
refuge
in
Turkey.
HAHAHA
@Susan60190970
@AndreAp0ll0
@itvnews
@emmamurphyitv
Please
tell
me,
what
exactly
do
you
know
about
the
asylum
seekers
systems
in
countries
like
France,
Germany
and
Greece?
Details
please.
I'm
very
interested
in
how
you
know
their
systems
are
flawless
and
don't
discriminate
🙃
via
@PerilOfAfrica
#Newsdeck
COVID-19:
Greece
reports
first
coronavirus
case
in
Moria
migrant
camp
on
Lesbos:
ATHENS,
Sept
2
(Reuters)
-
Greece
recorded
its
first
coronavirus
case
in
the
overcrowded
migrant
camp
of
Moria
on
the
island
of
Lesbos
and
the…
https://t.co/O31oc3V6j0
https://t.co/24C6hWAyVz
@BenTheSilent
@AndreAp0ll0
@itvnews
@emmamurphyitv
That's
not
what
i
said
refugees
who
are
brought
in
through
the
proper
channels
are
vetted.
Do
you
watch
what's
happening
france
Germany
and
Greece.
If
they
were
genuine
they
wouldn't
have
been
refused
asylum
in
the
countries
they've
passed
No
I
don't
think
they
are
all
criminals
@Spu

In [342]:
from nltk.corpus import stopwords
from collections import defaultdict

stop_words = list(stopwords.words('english'))

print('preprocessing tweets...')
tweets_corpus = list(preprocessing.preprocess_tweets(event_df['Tweet Raw'][200:]))


print('assigning frame properties to words from tweets...')
word_properties = defaultdict(dict)
for i in tqdm(tweets_corpus):
    tweet_words = [word.lower() for word in i.split() if word not in stop_words and len(word)>1]
    for word in tweet_words:
        word = lemma.lemmatize(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(f'sim of {word}, {prop} is {model.similarity(word, prop)}')
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
                #print(weights)
                if max(weights)>0.4:
                    word_properties[word][prop] = max(weights)
            except KeyError:
                pass
            



print(word_properties)
        

            

  0%|▎                                                                             | 86/18003 [00:00<00:21, 835.07it/s]

preprocessing tweets...


100%|███████████████████████████████████████████████████████████████████████████| 18003/18003 [00:18<00:00, 951.66it/s]
  0%|                                                                                        | 0/18003 [00:00<?, ?it/s]

assigning frame properties to words from tweets...


100%|████████████████████████████████████████████████████████████████████████████| 18003/18003 [10:41<00:00, 28.08it/s]

defaultdict(<class 'dict'>, {'camp': {'settlement': 1.0}, 'case': {'reason': 0.47645345}, 'get': {'reception': 0.50504476}, 'eu': {'victimization': 0.41524222}, 'currency': {'economisation': 0.633761}, 'need': {'humanitarian': 0.46502197}, 'money': {'economisation': 1.0}, 'soros': {'victimization': 0.46909347}, 'germany': {'victimization': 0.40653825}, 'never': {'refusal': 0.4868157}, 'give': {'reception': 0.5135118}, 'nazi': {'victimization': 0.4395619}, 'refugee': {'reception': 0.60989624}, 'wanna': {'affection': 0.45185497}, 'murder': {'criminality': 0.53367186, 'victimization': 0.40415478}, 'without': {'refusal': 0.46807507}, 'there': {'refusal': 0.62095666}, 'cash': {'economisation': 0.6151221}, 'not': {'refusal': 0.52004266}, 'support': {'humanitarian': 1.0}, 'refuge': {'settlement': 0.63942665, 'reception': 0.41259414}, 'want': {'affection': 0.43143085}, 'country': {'victimization': 0.43598586}, 'send': {'reception': 0.46330065}, 'iran': {'victimization': 0.49675876}, 'muslim': 




In [None]:
# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus)[:200] 

#all_tweets_list = all_tweets_list + ['Muslim refugees is government']

# IF WE DON'T CARE ABOUT (KINDA) LOSING THE TWEETS (BECAUSE WE WILL LOOK AT SENTENCES ONLY) THEN WE CAN BATCH WITH \n\n AND
# SPEED THINGS UP SIGNIFICANTLY

#all_tweets_list = '\n\n'.join(all_tweets_list)
#print(all_tweets_list[:1000])
"""for tweet in tqdm(range(len(all_tweets_list))):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if len(tweet_sentokenized) == 0:
        tweet_sentokenized.append('empty_tweet')
        #print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)"""

"""all_tweets_list = '\n\n'.join(all_tweets_list)  

for tweet_id in range(len(all_tweets_list.split('\n\n'))):
    if len(all_tweets_list[tweet_id]) == 0:
        all_tweets_list[tweet_id] = 'empty_tweet'
#print(all_tweets_list[:1000]"""


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=2000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

In [140]:
#Store the noun phrases in the pickle file
import pickle

with open('test_moria_tagged_tweets', 'wb') as fp:
    pickle.dump(tagged_tweets, fp)

In [343]:
# Load NPs from pickle file
import pickle

with open(r"test_moria_tagged_tweets", "rb") as input_file:
    tagged_tweets = pickle.load(input_file)

In [344]:
event_df['Date Short'][200]

'2020-09-04'

In [361]:
from collections import defaultdict
# import these modules 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus.reader.wordnet import NOUN
import numpy as np
  
lemma = WordNetLemmatizer() 

cand_frames = defaultdict(list)

framed_words = pd.DataFrame(columns=['word','date',list(frame_properties.keys())])


for tweet_id in tqdm(range(len(tagged_tweets))):
    #print(tweet)
    cand_words = [[word.id, word.text,word.head] for sent in tagged_tweets[tweet_id].sentences for word in sent.words]
    #print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')
    #print(len(cand_df['candidates']))
    for cand in cand_df['candidates']:
            # lemmatize representative head of candidate 
            rep_head = lemma.lemmatize(cand[1].lower(),pos=NOUN)
            
            if rep_head in tagged_tweets[tweet_id].text.lower() and len(rep_head)>1:
                #find all dependencies of the phrase head
                for related in range(len(cand_words)):
                    cand_word_lemma = lemma.lemmatize(cand_words[related][1].lower())
                    #print(f'Yes it is, related = {lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)}')
                    #lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)
                    if rep_head == cand_word_lemma:
                        related_word = lemma.lemmatize(cand_words[cand_words[related][2]-1][1].lower())
                        cand_frames['word'].append(rep_head)
                        cand_frames['date'].append(event_df['Date Short'][200+tweet_id])
                        #print(f'\n\n checking {phrase_head}_{related_word}')                
                        #cand_frames['word'].append(rep_head)
                        #cand_frames['date'].append(event_df['Date Short'][200+tweet_id])
                        #cand_frames['word'].append(phrase_head)
                        for frame_property in list(frame_properties.keys()):
                            #print(frame_property)

                            try:
                                #print(word_properties[phrase_head][frame_property])
                                cand_frames[frame_property].append(word_properties[related_word][frame_property])

                            except KeyError:
                                #print('Error')
                                #cand_frames[frame_property].append(word_properties['tent'][frame_property])
                                cand_frames[frame_property].append(np.NaN)
                                

                    #print(len(cand_frames[frame_property]))
                    
                #print('\n')
                
                
                """for frame_property in list(frame_properties.keys()):
                    for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    #print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                            #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                        except KeyError:
                            pass"""
                        #[cand_frames[seed_word][cand].append(model.similarity(print(f'{cand} is related to {np_heads[np_heads[related][2]+1][1]}') if cand == np_heads[related][1] else print('nej') for related in range(len(np_heads))]
            #print(get_head(cand))
            #print(np_heads[19][1])
            #[f(x) if condition else g(x) for x in sequence]
            #[print(np_heads[np_heads[related][2]-1]) if get_head(cand)==np_heads[related][1] else print('hi') for related in range(len(np_heads))]

            
#became ___ (vb and vbx)
#(VP sit/VB (PP on/IN (NP the/DT mat/NN))))) 

#common phrases = migrant camp, covid case, covid test

#cand_frames

100%|███████████████████████████████████████████████████████████████████████████| 18003/18003 [01:16<00:00, 234.40it/s]


In [375]:
print(cand_frames.keys())

framed_words = pd.DataFrame.from_dict(cand_frames)

#framed_words[framed_words['word']=='migrants'].tail(50)

framed_words = framed_words.dropna(subset=['settlement', 'reception', 'security', 'criminality', 'economisation', 'humanitarian', 'victimization', 'integration', 'affection', 'refusal', 'trustworthiness', 'no trustworthiness', 'reason', 'unreason/irrationality', 'easiness', 'difficulty', 'honor', 'dishonor'],how='all')

framed_words

dict_keys(['word', 'date', 'settlement', 'reception', 'security', 'criminality', 'economisation', 'humanitarian', 'victimization', 'integration', 'affection', 'refusal', 'trustworthiness', 'no trustworthiness', 'reason', 'unreason/irrationality', 'easiness', 'difficulty', 'honor', 'dishonor'])


Unnamed: 0,word,date,settlement,reception,security,criminality,economisation,humanitarian,victimization,integration,affection,refusal,trustworthiness,no trustworthiness,reason,unreason/irrationality,easiness,difficulty,honor,dishonor
0,migrant,2020-09-04,1.0,,,,,,,,,,,,,,,,,
2,covid,2020-09-04,,,,,,,,,,,,,0.476453,,,,,
3,covid,2020-09-04,,,,,,,,,,,,,0.476453,,,,,
4,covid,2020-09-04,,,,,,,,,,,,,0.476453,,,,,
5,covid,2020-09-04,,,,,,,,,,,,,0.476453,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138104,moria,2020-09-16,1.0,,,,,,,,,,,,,,,,,
138106,migrant,2020-09-16,,,,,,,0.415242,,,,,,,,,,,
138110,muslim,2020-09-16,,,,,,,,,0.438872,,,,,,,,,
138116,government,2020-09-16,,0.400841,,,,,,,,,,,,,,,,


In [379]:
framed_words = framed_words[framed_words['word'] == 'refugee']
#framed_words[framed_words['date']=='2020-09-04']

aggr_frames = framed_words.groupby(['word','date'],as_index=False).mean()
aggr_frames.tail(50)

Unnamed: 0,word,date,settlement,reception,security,criminality,economisation,humanitarian,victimization,integration,affection,refusal,trustworthiness,no trustworthiness,reason,unreason/irrationality,easiness,difficulty,honor,dishonor
0,refugee,2020-08-24,0.88565,0.528205,,,0.861238,0.647149,0.506002,,0.516642,0.453999,,,0.429876,,,0.750321,,
1,refugee,2020-08-25,0.71884,0.596905,,0.484127,,0.585392,0.598926,,0.431431,0.537289,,,,,0.416482,0.729447,,
2,refugee,2020-08-26,1.0,0.56823,0.538357,,1.0,0.463913,0.480653,,0.428018,0.577606,,,,,,0.830878,,
3,refugee,2020-08-27,0.850285,0.502025,0.40145,,0.738238,0.821304,0.46384,,0.417464,0.500712,,,,,,0.42237,,
4,refugee,2020-08-28,1.0,0.589157,1.0,,1.0,0.60094,0.469478,,0.412,0.553681,,,0.429918,,,0.734831,0.430042,
5,refugee,2020-08-29,0.881932,0.568262,1.0,0.451951,0.803007,0.494139,0.668953,,1.0,0.520043,,0.602489,0.413419,,,0.717828,,0.445651
6,refugee,2020-08-30,1.0,0.527151,0.621869,,1.0,0.623637,0.501811,,0.420636,,,,0.524333,,,1.0,,
7,refugee,2020-08-31,1.0,0.609896,1.0,0.717057,1.0,1.0,0.584668,,0.432889,0.441415,,0.461287,0.455918,,,0.646462,,0.463172
8,refugee,2020-09-01,1.0,0.547397,0.40145,,1.0,0.753653,0.43226,,0.431431,0.577606,,,,,,0.844236,0.439388,
9,refugee,2020-09-02,1.0,0.609896,,,0.828203,0.734906,0.44241,,0.432889,1.0,,,0.544848,,,1.0,,0.466985


In [383]:
import plotly.express as px


fig = px.line(aggr_frames, x="date", y=["reason",'affection','reception','settlement'], title=f'Frame bias towards refugees')
fig.show()

# TESTING:

In [271]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus)[:50] 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

50it [00:16,  2.99it/s]


'Greeces Moria migrant camp quarantined after first Covid - 19 case'

In [273]:


for tweet in tqdm(range(len(tweets_corpus))):
    print(tweets_corpus[tweet])
    np_heads = [[word.id, word.text,word.head,word.deprel] for sent in tagged_tweets[tweet].sentences for word in sent.words]
    print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet].sentences for word in sent.words], sep='\n')
    #print(np_heads)
    ph_ids = set([np_heads[i][2] for i in range(len(np_heads))])
    ph_words = [np_heads[i-1][1] for i in ph_ids]

    word_pairs = [(np_heads[word][1], np_heads[np_heads[word][2]-1][1]) for word in range(len(np_heads)) if np_heads[word][2] != 0]
    #print(word_pairs)
    
    compounds = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if 'compound' in np_heads[i][3]]
    print(compounds)
    
    advmods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='advmod']
    print(advmods)
    
    amods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='amod']
    print(amods)
    for pair in word_pairs:
        phrase = pair[0]+'_'+pair[1]

    #print(model.most_similar('illegal_immigrant'))

    
    """#print(len(cand_df['candidates']))
    candidate_list = cand_df['candidates']
    for cand in cand_df['candidates']:
        #print(cand[2])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:
            #print(phrase_head)
            #if str(cand[1]) in str(tweet):
            if str(phrase_head) in str(tweet) and len(phrase_head)>2:
                #print(phrase_head)
                ph_words = [np_heads[i-1][1] for i in phrase_heads]
                #print(ph_words)
                for related in range(len(np_heads)):
                    if phrase_head == np_heads[related][1]:
                        pass
                        #print(f'checking {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                  for frame_property in list(frame_properties.keys()):
                        for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    #cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                                #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                                except KeyError:
                                pass"""

  0%|                                                                              | 20/18003 [00:00<01:31, 196.07it/s]

Greeces Moria migrant camp quarantined after first Covid - 19 case
id: 1	word: Greeces        head id: 0    head: root      deprel: root
id: 2	word: Moria          head id: 1    head: Greeces   deprel: flat
id: 3	word: migrant        head id: 4    head: camp      deprel: amod
id: 4	word: camp           head id: 1    head: Greeces   deprel: appos
id: 5	word: quarantined    head id: 4    head: camp      deprel: acl
id: 6	word: after          head id: 11   head: case      deprel: case
id: 7	word: first          head id: 11   head: case      deprel: amod
id: 8	word: Covid          head id: 11   head: case      deprel: compound
id: 9	word: -              head id: 8    head: Covid     deprel: punct
id: 10	word: 19             head id: 8    head: Covid     deprel: nummod
id: 11	word: case           head id: 5    head: quarantineddeprel: obl
[['Covid_case']]
[]
[['migrant_camp'], ['first_case']]
2,000 covid - 19 tests will be carried out at Moria migrant camp
id: 1	word: 2,000          head id

  0%|▏                                                                             | 50/18003 [00:00<01:35, 187.97it/s]


id: 1	word: This           head id: 3    head: why       deprel: nsubj
id: 2	word: is             head id: 3    head: why       deprel: cop
id: 3	word: why            head id: 0    head: root      deprel: root
id: 4	word: so             head id: 5    head: many      deprel: advmod
id: 5	word: many           head id: 6    head: Turks     deprel: amod
id: 6	word: Turks          head id: 7    head: wish      deprel: nsubj
id: 7	word: wish           head id: 3    head: why       deprel: acl:relcl
id: 8	word: to             head id: 9    head: escape    deprel: mark
id: 9	word: escape         head id: 7    head: wish      deprel: xcomp
id: 10	word: the            head id: 11   head: dictatorshipdeprel: det
id: 11	word: dictatorship   head id: 9    head: escape    deprel: obj
id: 12	word: of             head id: 13   head: Ergogan   deprel: case
id: 13	word: Ergogan        head id: 11   head: dictatorshipdeprel: nmod
id: 14	word: and            head id: 15   head: seek      deprel: cc
id: 15

IndexError: list index out of range