# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [1]:
#python libraries
import stanza
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import os
import re
import csv
from tqdm import tqdm
import time

# self written modules
import preprocessing
import candidate_processing as cand_prep



  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. We import the data and split them based on the event date

In [2]:
data_url = r"CBS - Copenhagen Business School\Kick-Ass Master Thesis - General\Data\moria-data/moria_no_duplicates.csv"
directory_path = os.getcwd() + "/../../../" + data_url 
event_df = pd.read_csv(directory_path, index_col=0)

event_df.reset_index(drop=True, inplace=True)
# given event date, split the dataset to pre and post event dfs

event_date = '2020-09-09'

#moria_df[['Date','Time']] = moria_df['Date Short'].astype(str).str.split(' ', 1, expand=True)
# create pre and post event partition
pre_event = event_df[event_df['Date Short'] < event_date]
post_event = event_df[event_df['Date Short'] >= event_date]

print('total tweets: ', event_df.shape[0])
print('Pre event tweets: ',pre_event.shape[0])
print('Post event tweets: ',post_event.shape[0])


total tweets:  18203
Pre event tweets:  3311
Post event tweets:  14892


## 2. We preprocess the data using the function from self-written preprocessing module

In [3]:
# see the description of the method in the preprocessing module
sampled_df = preprocessing.preprocess_tweets(event_df['Tweet Raw'][:200])

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 522.17it/s]


## 3. We instantiate stanza english language module

In [4]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#directory = '../../export CORENLP_HOME=' ##ADD DIRECTORY HERE
#stanza.install_corenlp()

#import os
#os.environ["CORENLP_HOME"] = directory

In [5]:
en_nlp = stanza.Pipeline("en", ner_batch_size=4096)

2021-03-18 22:04:27 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-18 22:04:27 INFO: Use device: cpu
2021-03-18 22:04:27 INFO: Loading: tokenize
2021-03-18 22:04:27 INFO: Loading: pos
2021-03-18 22:04:28 INFO: Loading: lemma
2021-03-18 22:04:28 INFO: Loading: depparse
2021-03-18 22:04:29 INFO: Loading: sentiment
2021-03-18 22:04:30 INFO: Loading: ner
2021-03-18 22:04:31 INFO: Done loading processors!


In [8]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NOT USED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# if we did not care about keeping track of tweet indices then we could use this - very fast batching

# set the column width otherwise only truncated tweet will be converted
pd.options.display.max_colwidth = 1000

tweets_txt = sampled_df.to_string(index = False)

# we need 2 new line for batching, if they do not exist, replace 1 line break with two
tweets_txt = tweets_txt if '\n\n' in tweets_txt else tweets_txt.replace('\n','\n\n') 

tweets_txt

"                                               Yes, why? Why it wasnt good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker, not in the 5 th . No law says you can pick and choose and get it.\n\n                                                                          ISIS refuge . Wouldn' t go to Turkey if I was paid . You' re likely to get your head lopped off if you stray off the beaten track . Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey\n\n                                                                                                                                                                                                                                                 Greece must improve refugee overcrowding, UN warns\n\n           1 / 3 Law? Let the idle stuff . All rights of the Turkish minority in Greece were taken away . Where is

In [143]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NOT USED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# the purpose would be to store all matching candidates, their phrases, phrases heads in a big dataframe

len(stanza_documents)

{stanza_documents[tweet_id]:[stanza_documents[tweet_id].text, ] for tweet_id in range(len(tweets)) for sent in tweets[tweet_id].sentences for word,ent in sent}
# List1  
lst = [[stanza_documents[tweet_id],[stanza_documents[tweet_id].text, ] for tweet_id in range(len(tweets)) for sent in tweets[tweet_id].sentences for word,ent in sent}] 
    
df = pd.DataFrame(lst, columns =['FName', 'LName', 'Age'], dtype = float) 
df 


100

## 4. We apply stanza module on the tweets to get NER and POS tags. We do it in batches to speed things up.

In [7]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(sampled_df) 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

0it [00:00, ?it/s]

empty tweet at index 137
empty tweet at index 167


200it [01:01,  3.26it/s]


'Yes, why?\n\nWhy it wasnt good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU?\n\nThey must get help in the first safe country as asylum seeker, not in the 5 th .\n\nNo law says you can pick and choose and get it.'

In [8]:
# get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
tweet_tags = cand_prep.get_tweet_tags(tagged_tweets) 


#get a set of all NER tags existing in corpus - check which one are found in corpus

tweet_tags_set = set()
for tweet in tweet_tags:
    tweet_ner_tags = set(tweet[1].values())
    tweet_tags_set.update(tweet_ner_tags)

print(tweet_tags_set)

100%|█████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 16661.92it/s]

{'PERCENT', 'TIME', 'NORP', 'ORDINAL', 'GPE', 'PERSON', 'LANGUAGE', 'PRODUCT', 'EVENT', 'QUANTITY', 'DATE', 'CARDINAL', 'LOC', 'MONEY', 'LAW', 'ORG'}





## 5. As initial WCL candidates, we extract noun phrases (NPs) and coreference chains.

## We do so using CoreNLPClient wrapper

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media) - any new spotted should be added in preprocessing file's '__remove_tweet_signatures__' function
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE? - check
* play around with candidate types
* optimize code and make it neater



In [10]:
from stanza.server import CoreNLPClient

noun_phrase_list = []
with CoreNLPClient(timeout=300000, memory='16G') as client:
    for tweet in tqdm(list(sampled_df)):
        # get noun phrases with tregex using get_noun_phrases function
        noun_phrase = cand_prep.get_noun_phrases(client,str(tweet),annotators="tokenize,ssplit,pos,lemma,parse")
        noun_phrase_list.append(noun_phrase)
        #noun_phrases(client,event_df["Tweet Raw"][:100],_annotators="tokenize,ssplit,pos,lemma,parse")

2021-03-18 22:05:51 INFO: Writing properties to tmp file: corenlp_server-d485d6788767453d.props
2021-03-18 22:05:51 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-d485d6788767453d.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [03:06<00:00,  1.07it/s]


In [11]:

noun_phrase_list = [[np.replace(' - ','-') for np in nps] for nps in noun_phrase_list]
noun_phrase_list = [[np.replace(' / ','/') for np in nps] for nps in noun_phrase_list]
noun_phrase_list = [[np.replace(" ' ","'") for np in nps] for nps in noun_phrase_list]


In [None]:
#Store the noun phrases in the pickle file
import pickle

with open('file_name_to_save', 'wb') as fp:
    pickle.dump(noun_phrase_list, fp)

In [71]:
# Load NPs from pickle file
import pickle

with open(r"moria_noun_phrases", "rb") as input_file:
    noun_phrase_list = pickle.load(input_file)

#noun_phrase_list

## 6. We keep only NPs shorter than 20 words and remove children of parent NPs 

In [12]:
def get_cand_len(cand_list):
    # calculates number of candidates in the corpus
    sum_len = 0
    for tweet_cands in cand_list:
        sum_len += len(tweet_cands)
    return sum_len

print(get_cand_len(noun_phrase_list))


for tweet_nps in noun_phrase_list:
    #reverse the list of tweets nps so we avoid moving indexes and leaving out some phrases 
    for np in reversed(tweet_nps):
        i = tweet_nps.index(np)
        np_split = np.split()
        if len(np_split) > 19:
            tweet_nps.remove(tweet_nps[i])
            
print(get_cand_len(noun_phrase_list))

2714
2692


In [13]:
# remove the child NPs and keep only parents, run until the sum_len stops decreasing
after_removal_len = 0
while after_removal_len != get_cand_len(noun_phrase_list):
    after_removal_len = get_cand_len(noun_phrase_list)
    for tweet_nps in noun_phrase_list:
        for np in range(len(tweet_nps)):
            try:
                #if the subsequent np (child np) is contained in the current one, remove the child np
                if tweet_nps[np].find(tweet_nps[np+1]) != -1:
                    tweet_nps.remove(tweet_nps[np+1])

            #ignore the error caused with end of the list
            except IndexError:
                pass

print(get_cand_len(noun_phrase_list))

1430


In [None]:
noun_phrase_list

## 7. We get the heads of noun phrases (in batches)

In [14]:

#tag all tweets and save them in a list    
tagged_np_cands = [] 
batched_np_list = cand_prep.prep_candlist_for_batching(noun_phrase_list)

for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000)): # Default batch size is 32
        tagged_np_cands.append(tagged_cand)
        


0it [00:00, ?it/s]

empty tweet at index 137
empty tweet at index 167
['it\n\nasylum in Greece, MAC, SER or CRO or BUL\n\n3 EU members\n\nHU\n\nThey\n\nhelp\n\nthe first safe country as asylum seeker\n\nthe 5\n\nNo law\n\nyou\n\nit', "ISIS refuge .\n\nWouldn' t\n\nTurkey\n\nI\n\nYou' re likely to get your head\n\nthe beaten track\n\nChristian Greece\n\na similar climate\n\nislamic turkey", 'Greece\n\nrefugee overcrowding\n\nUN', '1/3\n\nLaw\n\nthe idle stuff\n\nAll rights of the Turkish minority in Greece\n\nthe law\n\nRefugees\n\nthe EU\n\nthe law\n\nWestern states\n\nthe states\n\nthe Middle East and Africa\n\nunderground resources\n\nthe law', 'the asylum seekers\n\nHU borders\n\nThey\n\na long journey\n\nGreece\n\nthey\n\nhelp\n\nthey\n\nmultiple more countries', 'I\n\nphoto gr apy\n\naesthetic\n\nnews\n\nit\n\na potential embellishments of hard facts and harsh reality\n\nI\n\nraw realism\n\nI\n\nthis boundary using @FiLMiCPro first light this week in lesbos infamous Moria migrants camp\n\na big quest

200it [01:05,  3.07it/s]


In [15]:
np_cand_heads = cand_prep.get_cand_heads(tagged_np_cands)
np_cand_heads

[[[{'it'}, ['it']],
  [{'BUL', 'CRO', 'Greece', 'MAC', 'SER', 'asylum'}, ['asylum']],
  [{'members'}, ['members']],
  [{'HU'}, ['HU']],
  [{'They'}, ['They']],
  [{'help'}, ['help']],
  [{'country', 'seeker'}, ['country']],
  [{'5'}, ['5']],
  [{'law'}, ['law']],
  [{'you'}, ['you']],
  [{'it'}, ['it']]],
 [[{'.', 'refuge'}, ['refuge']],
  [{'Wouldn', 't'}, ['Wouldn']],
  [{'Turkey'}, ['Turkey']],
  [{'I'}, ['I']],
  [{'get', 'head', 'likely'}, ['likely']],
  [{'track'}, ['track']],
  [{'Christian', 'Greece'}, ['Christian']],
  [{'climate'}, ['climate']],
  [{'turkey'}, ['turkey']]],
 [[{'Greece'}, ['Greece']],
  [{'overcrowding'}, ['overcrowding']],
  [{'UN'}, ['UN']]],
 [[{'1/3'}, ['1/3']],
  [{'Law'}, ['Law']],
  [{'stuff'}, ['stuff']],
  [{'Greece', 'minority', 'rights'}, ['rights']],
  [{'law'}, ['law']],
  [{'Refugees'}, ['Refugees']],
  [{'EU'}, ['EU']],
  [{'law'}, ['law']],
  [{'states'}, ['states']],
  [{'states'}, ['states']],
  [{'Africa', 'East'}, ['East']],
  [{'resources

## 8. We define candidate types 

In [16]:
#dictionary to assign candidate types based on named entities and part of speech tags
#the key tuple consists of (isNE, lexicographer type, plural)
cand_types_dict = {(True,'PERSON',None):'person-ne',
              (True,'NORP',None):'person-ne',
              (True,'PERSON','plural'):'person-nes',
              (True,'NORP','plural'):'person-nes',
              (False,'PERSON',None):'person-nn',
              (False,'PERSON','plural'):'person-nns',
              (True,'ORG',None):'group-ne',
              (True,'FAC',None):'group-ne',
              (False,'ORG',None):'group',
              (True,'LOC',None):'loc-ne',
              (True,'GPE',None):'loc-ne',
              (False, 'LOC',None):'loc'
            }


In [17]:
from collections import Counter

#test syntactic categories on all heads of noun phrases
list_of_cand_types = [cand_prep.get_synt_category(cand[1][0]) for tweet_cands in np_cand_heads for cand in tweet_cands]
    
count_types = Counter(list_of_cand_types)
print(count_types)

Counter({None: 1069, 'PERSON': 162, 'ORG': 134, 'LOC': 74})


## 9. We assign candidate types to noun phrase candidates

In [18]:
# label the noun phrases with the candidate types
np_and_cand_list = cand_prep.get_cand_type(noun_phrase_list,np_cand_heads, tweet_tags, cand_types_dict)
print(np_and_cand_list)

 

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 164.42it/s]

[[('it', 'it', {'it'}, 'misc'), ('asylum in Greece, MAC, SER or CRO or BUL', 'asylum', {'Greece', 'MAC', 'asylum', 'CRO', 'BUL', 'SER'}, 'group-ne'), ('3 EU members', 'members', {'members'}, 'person-nes'), ('HU', 'HU', {'HU'}, 'group-ne'), ('They', 'They', {'They'}, 'misc'), ('help', 'help', {'help'}, 'person-nn'), ('the first safe country as asylum seeker', 'country', {'seeker', 'country'}, 'group-ne'), ('the 5', '5', {'5'}, 'misc'), ('No law', 'law', {'law'}, 'misc'), ('you', 'you', {'you'}, 'misc'), ('it', 'it', {'it'}, 'misc')], [('ISIS refuge .', 'refuge', {'.', 'refuge'}, 'loc-ne'), ("Wouldn' t", 'Wouldn', {'Wouldn', 't'}, 'misc'), ('Turkey', 'Turkey', {'Turkey'}, 'loc-ne'), ('I', 'I', {'I'}, 'group'), ("You' re likely to get your head", 'likely', {'get', 'head', 'likely'}, 'misc'), ('the beaten track', 'track', {'track'}, 'loc'), ('Christian Greece', 'Christian', {'Christian', 'Greece'}, 'person-ne'), ('a similar climate', 'climate', {'climate'}, 'misc'), ('islamic turkey', 'tur




In [19]:
from collections import Counter

# couunt occurence of each candidate type
cand_type_count = [np[3] for nps in np_and_cand_list if nps != None for np in nps ]
    
counts = Counter(cand_type_count)
print(counts)

Counter({'misc': 853, 'loc-ne': 197, 'group-ne': 76, 'person-nns': 73, 'group': 69, 'loc': 50, 'person-ne': 43, 'person-nes': 42, 'person-nn': 29})


## 10. We get coreference chains candidates from the tweet corpus

In [20]:
from stanza.server import CoreNLPClient
dict_of_tweet_corefs = {}
#corefs = []
with CoreNLPClient(properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}, memory='16G') as client:
    for tweet_index in tqdm(range(len(sampled_df))):
        tweet_corefs=[]
        #print(f'Coreferences for the tweet {list(event_df["Tweet Raw"]).index(tweet)} are:')
        for chain in cand_prep.get_coref_chain(sampled_df[tweet_index],client):
            tweet_corefs.append(chain)
            #print(' <-> '.join(chain),'\n')
        #corefs.append(tweet_corefs)
        dict_of_tweet_corefs[tweet_index] = tweet_corefs

dict_of_tweet_corefs

2021-03-18 22:10:17 INFO: Writing properties to tmp file: corenlp_server-045b60b08d594fa6.props
2021-03-18 22:10:17 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-045b60b08d594fa6.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:39<00:00,  1.26it/s]


{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['I', 'I', 'I'], 0), (['aesthetic', 'it'], 0)],
 6: [],
 7: [],
 8: [(['Birmingham', 'birmingham'], 0)],
 9: [(['you', 'you'], 0), (['Turkey', 'it', 'it', 'Turkey'], 3)],
 10: [(['The Bessudos', 'Isaac Bessudos'], 1),
  (['Raquel Bessudo', 'Raquel'], 0)],
 11: [(['them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['UN Refugee Agency', 'Anadolu Agency'], 0)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'we'], 0)],
 24: [],
 25: [],
 26: [],
 27: [(['they', 'Enterprise Greece', 'their', 'they', 't

In [239]:
with open('moria_tweet_corefs', 'wb') as fp:
    pickle.dump(dict_of_tweet_corefs, fp)

dict_of_tweet_corefs

{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['#photograpy pushing aesthetic', 'it'], 0), (['I', 'I'], 0)],
 6: [(['@GreeceMFA ManfredWeber', 'It'], 0)],
 7: [],
 8: [],
 9: [(['Turkey', 'Turkey', 'it', 'it'], 0), (['you', 'you'], 0)],
 10: [(['Raquel',
    'Raquel',
    '@realPR_Phoenix @DrEstella @BravoTV Raquel Bessudo',
    'Isaac Bessudo'],
   2)],
 11: [(['@Mproyklis @fragoua @LearnerLerner', 'them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['Anadolu Agency', 'UN Refugee Agency'], 1)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'w

## 11. We determine candidate's type for representative mentions of coref candidates (in batches)

In [362]:
# Load NPs from pickle file
import pickle

with open(r"moria_tweet_corefs", "rb") as input_file:
    corefs = pickle.load(input_file)



In [21]:
corefs_list = []

#pick out only the representative mention as the candidate's rep. phrase
for tweet_corefs in dict_of_tweet_corefs:
        tw_corefs = [coref[0][coref[1]] for coref in dict_of_tweet_corefs[tweet_corefs]] 
        # empty list would cause problems in the following steps, that is why we append 'no_candidate' to empty lists
        corefs_list.append(tw_corefs) if len(tw_corefs) != 0 else corefs_list.append(['no_candidate'])

corefs_list       


[['No law'],
 ['You'],
 ['no_candidate'],
 ['the law'],
 ['the asylum seekers'],
 ['I', 'aesthetic'],
 ['no_candidate'],
 ['no_candidate'],
 ['Birmingham'],
 ['you', 'Turkey'],
 ['Isaac Bessudos', 'Raquel Bessudo'],
 ['them'],
 ['you', 'it', 'they'],
 ['no_candidate'],
 ['Greece'],
 ['no_candidate'],
 ['no_candidate'],
 ['Both men'],
 ['10 refugees , including three children with disabilities', 'Greece'],
 ['UN Refugee Agency'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['we'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['Enterprise Greece'],
 ['no_candidate'],
 ['they'],
 ['He'],
 ['Erdoan'],
 ['UK', 'UK size'],
 ['no_candidate'],
 ['The infected person , a 40 year old man', 'the camp', 'Moria', "Didn '"],
 ['no_candidate'],
 ['Greece', 'Protection & Operations , @GillianTriggs & @RaoufMazou'],
 ['greece', 'I', 'you'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['no_candidate'],
 ['you'],
 ['Turkey'],
 ['I'],
 ['no_candi

In [22]:
#tag all tweets and save them in a list    
tagged_coref_cands = [] 
batched_coref_list = cand_prep.prep_candlist_for_batching(corefs_list)
print(batched_coref_list)
for tagged_cand in tqdm(batch(batched_coref_list, en_nlp, batch_size=6000)): # Default batch size is 32
        tagged_coref_cands.append(tagged_cand)
        
coref_cand_heads = cand_prep.get_cand_heads(tagged_coref_cands)
coref_cand_heads

0it [00:00, ?it/s]

['No law', 'You', 'no_candidate', 'the law', 'the asylum seekers', 'I\n\naesthetic', 'no_candidate', 'no_candidate', 'Birmingham', 'you\n\nTurkey', 'Isaac Bessudos\n\nRaquel Bessudo', 'them', 'you\n\nit\n\nthey', 'no_candidate', 'Greece', 'no_candidate', 'no_candidate', 'Both men', '10 refugees , including three children with disabilities\n\nGreece', 'UN Refugee Agency', 'no_candidate', 'no_candidate', 'no_candidate', 'we', 'no_candidate', 'no_candidate', 'no_candidate', 'Enterprise Greece', 'no_candidate', 'they', 'He', 'Erdoan', 'UK\n\nUK size', 'no_candidate', "The infected person , a 40 year old man\n\nthe camp\n\nMoria\n\nDidn '", 'no_candidate', 'Greece\n\nProtection & Operations , @GillianTriggs & @RaoufMazou', 'greece\n\nI\n\nyou', 'no_candidate', 'no_candidate', 'no_candidate', 'no_candidate', 'no_candidate', 'you', 'Turkey', 'I', 'no_candidate', 'no_candidate', "Don '\n\nGreece\n\nThose who seek refuge in Greece , those who want to make a coup\n\nfreedom of expression", 'no_c

200it [00:13, 14.92it/s]


[[[{'law'}, ['law']]],
 [[{'You'}, ['You']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'law'}, ['law']]],
 [[{'seekers'}, ['seekers']]],
 [[{'I'}, ['I']], [{'aesthetic'}, ['aesthetic']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'Birmingham'}, ['Birmingham']]],
 [[{'you'}, ['you']], [{'Turkey'}, ['Turkey']]],
 [[{'Bessudos', 'Isaac'}, ['Isaac']], [{'Bessudo', 'Raquel'}, ['Raquel']]],
 [[{'them'}, ['them']]],
 [[{'you'}, ['you']], [{'it'}, ['it']], [{'they'}, ['they']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'Greece'}, ['Greece']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'men'}, ['men']]],
 [[{'children', 'disabilities', 'refugees'}, ['refugees']],
  [{'Greece'}, ['Greece']]],
 [[{'Agency'}, ['Agency']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'no_candidate'}, ['no_candidate']]],
 [[{'we'}, ['we']]],
 [[{'no_candidate'}, ['no_candidate']]]

In [23]:
coref_and_cand_list = cand_prep.get_cand_type(corefs_list, coref_cand_heads, tweet_tags, cand_types_dict, corefs=True)


print(coref_and_cand_list) 

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 636.95it/s]

[[('No law', 'law', {'law'}, 'misc')], [('You', 'You', {'You'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('the law', 'law', {'law'}, 'misc')], [('the asylum seekers', 'seekers', {'seekers'}, 'person-nns')], [('I', 'I', {'I'}, 'misc'), ('aesthetic', 'aesthetic', {'aesthetic'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('Birmingham', 'Birmingham', {'Birmingham'}, 'loc-ne')], [('you', 'you', {'you'}, 'misc'), ('Turkey', 'Turkey', {'Turkey'}, 'loc-ne')], [('Isaac Bessudos', 'Isaac', {'Bessudos', 'Isaac'}, 'person-ne'), ('Raquel Bessudo', 'Raquel', {'Bessudo', 'Raquel'}, 'person-ne')], [('them', 'them', {'them'}, 'misc')], [('you', 'you', {'you'}, 'misc'), ('it', 'it', {'it'}, 'misc'), ('they', 'they', {'they'}, 'misc')], [('no_candidate', 'no_candidate', {'no_candidate'}, 'misc')], [('Greece', 'Greece', {'Greece'}, 'loc-ne')], [('no_candidate', 'no_candidate', {'no_c




## 12. We combine the candidate lists for candidate merging

We organize candidates in a list sorted by their number of phrases

In [393]:
#concatenate corefs and noun phrase lists
nps_cands = [cand for cands in np_and_cand_list for cand in cands]
crf_cands = [cand for cands in coref_and_cand_list for cand in cands]
#candidate_list = coref_and_cand_list + np_and_cand_list
#print(f'Len = {len(candidate_list)} should be 2x amount of tweets')
print(len(nps_cands), len(crf_cands))
#unpack list of lists into one list
candidate_list = nps_cands + crf_cands
print(f'The amount of all candidates is {len(candidate_list)}')


1432 288
The amount of all candidates is 1720


In [395]:
nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]
crf_tagged = [sent for tagged_cand in tagged_coref_cands for sent in tagged_cand.sentences ]
print(len(nps_tagged), len(crf_tagged))
all_cands_tagged = nps_tagged + crf_tagged


1439 289


ValueError: arrays must all be same length

In [391]:
print(len(candidate_list))
print(len(all_cands_tagged))

1720
1728


In [427]:
all_cands_tagged.remove(all_cands_tagged[number+1])
#all_cands_tagged[number].text = all_cands_tagged[number].text + all_cands_tagged[number+1].text

In [426]:
number = 1477
print(all_cands_tagged[number].text)
print(candidate_list[number][0])

no_candidate
no_candidate


In [428]:
indices_to_remove = set()
for i in range(len(all_cands_tagged)):
    #print(f'{i}: {candidate_list[i][0]} = {all_cands_tagged[i].text}')
    #print(indices_to_remove)
    if candidate_list[i][0] != all_cands_tagged[i].text:
        print(indices_to_remove)
        indices_to_remove.add(i)

print(indices_to_remove)


set()


In [496]:
cand_df = pd.DataFrame(
    {'candidates': candidate_list,
     'cand_tags': all_cands_tagged
    })

cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))
cand_df.columns = cand_df.columns.str.strip()
cand_df

Unnamed: 0,candidates,cand_tags,cand_text,cand_len
0,(Morocco Tunisia Libya Greece Turkey Each one ...,"[\n {\n ""id"": 1,\n ""text"": ""it"",\n ""...",Morocco Tunisia Libya Greece Turkey Each one o...,19
1,"(population & children for 39%, of whom more t...","[\n {\n ""id"": 1,\n ""text"": ""asylum"",\n ...","population & children for 39%, of whom more th...",19
2,"(access to territory & asylum, living conditio...","[\n {\n ""id"": 1,\n ""text"": ""3"",\n ""l...","access to territory & asylum, living condition...",19
3,(conditions + reduce overcrowding at the recep...,"[\n {\n ""id"": 1,\n ""text"": ""HU"",\n ""...",conditions + reduce overcrowding at the recept...,19
4,(economic migrants who cross developed nations...,"[\n {\n ""id"": 1,\n ""text"": ""They"",\n ...",economic migrants who cross developed nations ...,19
...,...,...,...,...
1715,"(they, any, {countries, any}, misc)","[\n {\n ""id"": 1,\n ""text"": ""no_candidat...",they,1
1716,"(connector, connector, {connector}, group)","[\n {\n ""id"": 1,\n ""text"": ""Omfg"",\n ...",connector,1
1717,"(barriers, barriers, {barriers}, misc)","[\n {\n ""id"": 1,\n ""text"": ""italian"",\n...",barriers,1
1718,"(others, others, {others}, misc)","[\n {\n ""id"": 1,\n ""text"": ""no_candidat...",others,1


In [740]:
for cand in cand_df['cand_text']:
    print(cand,'\n')

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost 

access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration 

economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK 

population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old 

brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham 

enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes 

The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece 

t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset 

Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise . 

camp on L

In [497]:
# we sort the candidates by their length

cand_df.sort_values('cand_len', ascending=False,inplace=True)

cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

cand_df.reset_index(drop=True, inplace = True)
cand_df
#all_cands_tagged.sort(reverse=True,key=get_cand_len(candidate_list))

Unnamed: 0,candidates,cand_tags,cand_text,cand_len
0,(Morocco Tunisia Libya Greece Turkey Each one ...,"[\n {\n ""id"": 1,\n ""text"": ""it"",\n ""...",Morocco Tunisia Libya Greece Turkey Each one o...,19
1,"(access to territory & asylum, living conditio...","[\n {\n ""id"": 1,\n ""text"": ""3"",\n ""l...","access to territory & asylum, living condition...",19
2,(conditions + reduce overcrowding at the recep...,"[\n {\n ""id"": 1,\n ""text"": ""HU"",\n ""...",conditions + reduce overcrowding at the recept...,19
3,(economic migrants who cross developed nations...,"[\n {\n ""id"": 1,\n ""text"": ""They"",\n ...",economic migrants who cross developed nations ...,19
4,"(population & children for 39%, of whom more t...","[\n {\n ""id"": 1,\n ""text"": ""asylum"",\n ...","population & children for 39%, of whom more th...",19
...,...,...,...,...
1715,"(This, This, {This}, misc)","[\n {\n ""id"": 1,\n ""text"": ""it"",\n ""...",This,1
1716,"(me, me, {me}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Its"",\n ...",me,1
1717,"(Greece, Mazou, {Mazou}, group-ne)","[\n {\n ""id"": 1,\n ""text"": ""you"",\n ...",Greece,1
1718,"(no_candidate, no_candidate, {no_candidate}, m...","[\n {\n ""id"": 1,\n ""text"": ""your"",\n ...",no_candidate,1


In [572]:
print(len(cand_df))
cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
len(cand_df)
cand_df.reset_index(drop=True,inplace=True)


368


### First merging step

In [499]:
#
# THIS IS THE FIRST MERGING STEP
#
        
def merging_step1(candidate_list):
    """
    In the first merging step, we merge two candidates if the head of each of their representative phrase 
     is identical by string comparison.
    """
    indices_to_remove = set()
    for longer_cand in range(len(candidate_list)):     
        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list[longer_cand][1]} checking the index {candidate_list[cand][1]}')

            #performing merging only for NE candidates of the same type
            if 'ne' in candidate_list[longer_cand][3]:
                #mark for merging if the head and its head's cand type is the same for 2 candidates
                #print(candidate_list[cand])
                if candidate_list[longer_cand][1] == candidate_list[cand][1] and candidate_list[longer_cand][3] == candidate_list[cand][3]:
                    print(f'matching "{longer_cand}" with "{cand}"')
                    #print(f'{candidate_list[longer_cand][1]} ===== {candidate_list[cand][1]}')
                    indices_to_remove.add(cand)
    return indices_to_remove

def merge_indices(cand_df,indices_to_remove):                

    print(f'Initial amount of candidates: {len(cand_df)}')                
    #print(len(sorted(indices_to_remove)))

    for index in reversed(sorted(indices_to_remove)):
        cand_df.drop([index],inplace=True)
        
    cand_df.reset_index(drop=True,inplace=True)
    print(f'Amount of candidates: {len(cand_df)}, after removing {len(sorted(indices_to_remove))} indices') 
    return cand_df


cand_df = merge_indices(cand_df, merging_step1(cand_df['candidates']))

Initial amount of candidates: 1459
Amount of candidates: 1459, after removing 0 indices


In [485]:
cand_df['candidates'][250][0]

'13,000 migrants & asylum seekers'

### Second merging step

We merge 2 candidates if their sets of phrases heads are semantically similar

In [33]:
import gensim

#load the GoogleNews 300dim model (fix path)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [553]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import numpy as np

#adjust for sets of phrases in the candidate
def merging_step2(candidate_list):
    
    indices_to_remove = set()
    for longer_cand in tqdm(range(len(candidate_list))):     
        i = candidate_list[longer_cand]
        long_cand_mean_vec = phrase_heads_avg_vector(candidate_list[longer_cand][2])
        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list.index(longer_cand)} checking the index {candidate_list.index(cand)}')
            #if candidate_list[longer_cand][1] == candidate_list[cand][1]:
                #print(f'matching "{longer_cand}" with "{cand}"')
            cand_mean_vec = phrase_heads_avg_vector(candidate_list[cand][2])

            if candidate_list[longer_cand][3] == candidate_list[cand][3]:
                try:
                    #print(1-cosine(long_cand_mean_vec,cand_mean_vec))
                    #print(long_cand_mean_vec.reshape(-1,1).shape, cand_mean_vec.reshape(1,-1).shape)
                    if 1-cosine(long_cand_mean_vec.reshape(-1,1),cand_mean_vec.reshape(-1,1)) >= 0.5:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(cand)
                except AttributeError:
                    pass

            else:
                try:
                    if 1-cosine(long_cand_mean_vec.reshape(-1,1),cand_mean_vec.reshape(-1,1)) >= 0.7:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(cand)
                        
                except AttributeError:
                    pass


    return indices_to_remove

def phrase_heads_avg_vector(phrase_set):
    phrase_head_vectors = []
    for phrase_head in phrase_set:    
        try:
            phrase_head_vectors.append(model[phrase_head])
        except KeyError:
            pass
    #phrase_head_vectors = [model[phrase_head] for phrase_head in phrase_set]
    if len(phrase_head_vectors) != 0:
        return np.mean(phrase_head_vectors,axis=0)
    else: 
        return np.NaN

        

cand_df = merge_indices(cand_df, merging_step2(cand_df['candidates']))


100%|████████████████████████████████████████████████████████████████████████████████| 509/509 [00:18<00:00, 27.75it/s]

Initial amount of candidates: 509
Amount of candidates: 509, after removing 0 indices





In [551]:
cand_df['candidates'][63][2]

{'camps', 'conditions', 'improvement', 'refugees'}

## Third merging step representative labeling

currently working on average cosine similarity of each phrase in the candidate - maybe not optimal, maybe it will be better with a different threshold

In [559]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics.pairwise import cosine_similarity

def merging_step3(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:  
        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            #head-1 because the pointer to head does not use 0 index
            if (pos == 'JJ' or pos=='VBN') and 'NN' in np_heads_pos[head-1][2]:
                cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
        phrases.append(cand_np_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):

                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))

                                
                    if np.mean(sim_matrix) > 0.3:
                        #print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove
                


def phrases_vectors(cand_phrases):
    
#for cand_phrases in phrases:
    #print(cand_phrases)
    cand_phrase_vectors = []
    for phrase in cand_phrases:
        try:
            cand_phrase_vectors.append(model[phrase])
            #print(f'for existing phrase "{phrase}" the vector is {model[phrase][0]}')
        except KeyError:
            phrase_words = phrase.split('_')
            #print(model[phrase_words[1]])
            try:
                phrase_vectors = [model[phrase_word] for phrase_word in phrase_words]
                #print(f'for phrase "{phrase}" avg vector is "{sum(phrase_vectors)/len(phrase_vectors)}') 
                cand_phrase_vectors.append(sum(phrase_vectors)/len(phrase_vectors))
            except KeyError:
                pass
    #print(len(cand_phrase_vectors))
    return cand_phrase_vectors
    
    
cand_df = merge_indices(cand_df, merging_step3(cand_df))
#print(indices_to_remove)

Initial amount of candidates: 509
Amount of candidates: 444, after removing 65 indices


In [568]:
for cand in cand_df['cand_text']:
    print(cand)
    

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just 

### Merging step 4

In [574]:
# missing the second method - we check for the lexical identity of specific stems in multiple candidates.

def merging_step4(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:

        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]

        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            i = np_heads_pos.index((word, head, pos))
            #print(np_heads_pos)
            #print(np_heads_pos[i])
            #print(np_heads_pos[head-1])
            #'NN' in np_heads_pos[head-1][2] and
            try:
                if 'NN' in pos and 'NN' in np_heads_pos[i+1][2] : 
                    cand_np_phrases.append(f'{word}_{np_heads_pos[i+1][0]}')
                if 'NN' in pos and 'NN' in np_heads_pos[head-1][2]:
                    cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
            except IndexError:
                pass
        phrases.append(cand_np_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):
                            #print(cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)))
                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))
                            """if cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.4:                
                                sim_matrix[i][j] = 2
                            elif cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.2:
                                sim_matrix[i][j] = 1
                            else:
                                sim_matrix[i][j] = 0"""

                                
                    if np.mean(sim_matrix) > 0.6:
                        print(f'{longer_cand} and {cand} are {np.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove

cand_df = merge_indices(cand_df, merging_step4(cand_df))
#print(merging_step4(candidate_list))

Initial amount of candidates: 342
Amount of candidates: 342, after removing 0 indices


In [575]:
for cand in cand_df['cand_text']:
    print(cand)

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just 

### Merging step 5


In [1316]:
for i in event_df['Tweet Raw'][:100]:
    print(i)

@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.
@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey
Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo
@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?
@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journey through Greece,

## Frame identification

In [646]:
frame_properties = {'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']}


frame_properties = {'settlement':['settlement','accomodation','accommodated', 'military barrack','tent','camp'],
                   'reception':['quota', 'reception','together','asylum','receive'],
                    'security':['security', 'border','crossing','fence','control','flow'],
                    'criminality':['officer','crime','offense','police','trafficking','suspect'],
                    'economisation':['euro','economic','million','thousand','cost','money'],
                    'humanitarian':['humane','voluntary','help','support','aid','care','solidarity'],
                    'victimization':['islamic','fight','war','dead','rescued','state'],
                    'integration': ['labour','employed','unemployed','integration','positive']
                   
                   }



"                   'importance':\n                    'unimportance':\n                    'power/leadership':\n                    'weakness/passiveness':\n                    'good quality':\n                    'poor quality':\n                    'safety':\n                    'unsafety':\n                    'positive':\n                    'negative':\n                    \n                    \n                   }"

In [120]:
import conceptnet_lite as cn
import gensim.downloader as api


# to run on the server we should use larger model according to the paper - "conceptnet-numberbatch-17-06-300"
#model = api.load("glove-twitter-200")


@Kkkk09240868
@0khalodi0
@POTUS
Also,
Eedogan
has
been
documented
using
ISIS
militants
aka
terrorists,
he
played
the
immigrants
card
as
a
way
to
political
threat
to
Europe,
he
pushed
immigrants
to
greece
and
europe
for
their
own
death,
after
letting
them
homeless
for
years
As
I
said,
All
Muslims
are
guilty
@Nionios1908
@kitsikis
Greece
unable
to
cope
with
60
thousand
refugees
with
a
population
of
10
million.
and
the
border
next
to
it
wants
a
country
of
83
million
to
be
torn
apart.
God,
I've
never
seen
a
fool
like
you
together
in
my
life.
the
problem
is,
you're
all
idiots.😂
@hama_ashad
@realDonaldTrump
and
from
there
you
can
try
to
pass
Europe
especially
Greece
there
are
lots
of
boats
you
know
but
you
have
to
know
you
might
die
from
all
of
that
this
refugge
thing
is
very
fishy
@hama_ashad
@realDonaldTrump
I
dont
know.
I
dont
live
in
Iraq.
In
Europe
many
migrants
walked
from
Greece
to
Norway/sweden/Germany.
I
guess
you
can
do
the
same.
Just
from
Iraq
to
Greece.
The
life
as
migrant
is
awf

dependend
of
EU
money.
Wothout
the
EU
greek
people
would
seek
refuge
in
Turkey.
HAHAHA
@Susan60190970
@AndreAp0ll0
@itvnews
@emmamurphyitv
Please
tell
me,
what
exactly
do
you
know
about
the
asylum
seekers
systems
in
countries
like
France,
Germany
and
Greece?
Details
please.
I'm
very
interested
in
how
you
know
their
systems
are
flawless
and
don't
discriminate
🙃
via
@PerilOfAfrica
#Newsdeck
COVID-19:
Greece
reports
first
coronavirus
case
in
Moria
migrant
camp
on
Lesbos:
ATHENS,
Sept
2
(Reuters)
-
Greece
recorded
its
first
coronavirus
case
in
the
overcrowded
migrant
camp
of
Moria
on
the
island
of
Lesbos
and
the…
https://t.co/O31oc3V6j0
https://t.co/24C6hWAyVz
@BenTheSilent
@AndreAp0ll0
@itvnews
@emmamurphyitv
That's
not
what
i
said
refugees
who
are
brought
in
through
the
proper
channels
are
vetted.
Do
you
watch
what's
happening
france
Germany
and
Greece.
If
they
were
genuine
they
wouldn't
have
been
refused
asylum
in
the
countries
they've
passed
No
I
don't
think
they
are
all
criminals
@Spu

In [346]:
#manual_candidates = ['refugees', 'migrant', 'immigrant', 'greece', 'turkey','people']

# this dictionary is going to be - target_concept: [(frame1,weight1),(frame2,weight2)]
frame_dictionary = {}

tags = tag_tweets(tweets_corpus)


#print(tags)



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:33<00:00,  1.16s/it]


In [761]:
cols = ['word'] + list(frame_properties.keys())

framed_words = pd.DataFrame.from_dict(cand_frames)

framed_words

Unnamed: 0,settlement,accomodation,accommodated,military barrack,tent,camp,quota,reception,together,asylum,...,fight,war,dead,rescued,state,labour,employed,unemployed,integration,positive
camp,"[0.096779585, 0.096779585, 0.022478957, 0.0224...","[0.20433454, 0.20433454, 0.003927827, 0.003927...","[0.027445737, 0.027445737, 0.17925097, 0.17925...",[],"[-0.04001374, -0.04001374, -0.0020679757, -0.0...","[0.005917713, 0.005917713, 0.031675775, 0.0316...","[0.079393506, 0.079393506, -0.018778834, -0.01...","[0.036053136, 0.036053136, 0.021431496, 0.0214...","[0.0037784167, 0.0037784167, -0.011410972, -0....","[0.1519616, 0.1519616, -0.038508326, -0.038508...",...,"[0.07469394, 0.07469394, 0.006586885, 0.006586...","[0.09578044, 0.09578044, -0.023129065, -0.0231...","[-0.024847355, -0.024847355, 0.13116385, 0.131...","[0.036126204, 0.036126204, 0.1757022, 0.175702...","[0.1324172, 0.1324172, 0.072251014, 0.07225101...",[],"[-0.045608632, -0.045608632, 0.18880078, 0.188...","[0.114989236, 0.114989236, 0.0006968416, 0.000...","[0.18403187, 0.18403187, 0.075890444, 0.075890...","[0.13560301, 0.13560301, 0.103027016, 0.103027..."
case,"[0.044889618, 0.108972326, 0.108972326, 0.0219...","[0.1975876, -0.01711282, -0.01711282, 0.136996...","[0.2931801, -0.030498743, -0.030498743, 0.0966...",[],"[0.0552354, 0.05133646, 0.05133646, 0.02544808...","[0.13421468, 0.07666254, 0.07666254, 0.0297146...","[0.12676318, 0.020955017, 0.020955017, 0.07371...","[0.12154389, 0.0066398564, 0.0066398564, 0.067...","[0.05211144, -0.013627447, -0.013627447, 0.069...","[0.2248644, 0.062638074, 0.062638074, 0.035149...",...,"[0.08583819, -0.0119549185, -0.0119549185, 0.1...","[0.02462985, 0.03602062, 0.03602062, 0.0396573...","[0.31372595, 0.1105189, 0.1105189, 0.06011012,...","[0.3866048, 0.14275861, 0.14275861, 0.07378995...","[0.08103147, 0.074176654, 0.074176654, 0.04161...",[],"[0.16430435, 0.041758478, 0.041758478, 0.10444...","[0.059248056, -0.013417903, -0.013417903, -0.0...","[0.052662343, 0.0003894046, 0.0003894046, 0.02...","[0.141266, 0.12976848, 0.12976848, 0.0731404, ..."
19,"[0.0544194, 0.0544194, 0.0544194, 0.0544194, 0...","[-0.014394398, -0.014394398, -0.014394398, -0....","[0.02126009, 0.02126009, 0.02126009, 0.0212600...",[],"[-0.03341804, -0.03341804, -0.03341804, -0.033...","[0.0919154, 0.0919154, 0.0919154, 0.0919154, 0...","[-0.015046243, -0.015046243, -0.015046243, -0....","[0.031472094, 0.031472094, 0.031472094, 0.0314...","[-0.02222292, -0.02222292, -0.02222292, -0.022...","[0.04686284, 0.04686284, 0.04686284, 0.0468628...",...,"[0.08653071, 0.08653071, 0.08653071, 0.0865307...","[0.013281414, 0.013281414, 0.013281414, 0.0132...","[0.12918106, 0.12918106, 0.12918106, 0.1291810...","[0.06322109, 0.06322109, 0.06322109, 0.0632210...","[0.0953567, 0.0953567, 0.0953567, 0.0953567, 0...",[],"[0.05923778, 0.05923778, 0.05923778, 0.0592377...","[-0.054391023, -0.054391023, -0.054391023, -0....","[0.09300181, 0.09300181, 0.09300181, 0.0930018...","[0.22546183, 0.22546183, 0.22546183, 0.2254618..."
migrant,"[0.14113313, 0.14113313, 0.108972326, 0.108972...","[0.20153728, 0.20153728, -0.01711282, -0.01711...","[0.19091144, 0.19091144, -0.030498743, -0.0304...",[],"[0.43063366, 0.43063366, 0.05133646, 0.0513364...","[1.0, 1.0, 0.07666254, 0.07666254, 0.10588831,...","[0.09900418, 0.09900418, 0.020955017, 0.020955...","[0.12727246, 0.12727246, 0.0066398564, 0.00663...","[0.02476906, 0.02476906, -0.013627447, -0.0136...","[0.19039115, 0.19039115, 0.062638074, 0.062638...",...,"[0.1417236, 0.1417236, -0.0119549185, -0.01195...","[0.14014623, 0.14014623, 0.03602062, 0.0360206...","[0.06850475, 0.06850475, 0.1105189, 0.1105189,...","[0.13938421, 0.13938421, 0.14275861, 0.1427586...","[0.0812741, 0.0812741, 0.074176654, 0.07417665...",[],"[0.04498983, 0.04498983, 0.041758478, 0.041758...","[0.100330524, 0.100330524, -0.013417903, -0.01...","[-0.06051269, -0.06051269, 0.0003894046, 0.000...","[0.10884626, 0.10884626, 0.12976848, 0.1297684..."
Covid,"[0.28724444, 0.28724444, 0.28724444, 0.28724444]","[0.0150301345, 0.0150301345, 0.0150301345, 0.0...","[0.072666705, 0.072666705, 0.072666705, 0.0726...",[],"[0.04901067, 0.04901067, 0.04901067, 0.04901067]","[0.098471, 0.098471, 0.098471, 0.098471]","[0.013520267, 0.013520267, 0.013520267, 0.0135...","[0.02983211, 0.02983211, 0.02983211, 0.02983211]","[0.054004237, 0.054004237, 0.054004237, 0.0540...","[0.12780023, 0.12780023, 0.12780023, 0.12780023]",...,"[0.25302434, 0.25302434, 0.25302434, 0.25302434]","[0.16647193, 0.16647193, 0.16647193, 0.16647193]","[0.09083742, 0.09083742, 0.09083742, 0.09083742]","[0.07353256, 0.07353256, 0.07353256, 0.07353256]","[0.15080929, 0.15080929, 0.15080929, 0.15080929]",[],"[0.09731477, 0.09731477, 0.09731477, 0.09731477]","[0.08290385, 0.08290385, 0.08290385, 0.08290385]","[0.03911755, 0.03911755, 0.03911755, 0.03911755]","[0.08254479, 0.08254479, 0.08254479, 0.08254479]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
member,"[0.046724994, 0.046724994]","[0.045278594, 0.045278594]","[0.20955454, 0.20955454]",[],"[0.049148176, 0.049148176]","[0.05835045, 0.05835045]","[0.02326626, 0.02326626]","[-0.03067905, -0.03067905]","[0.06304469, 0.06304469]","[0.049595706, 0.049595706]",...,"[0.046919152, 0.046919152]","[0.06789708, 0.06789708]","[0.09239874, 0.09239874]","[0.100803845, 0.100803845]","[0.0603837, 0.0603837]",[],"[0.15285343, 0.15285343]","[0.024691502, 0.024691502]","[-0.009682907, -0.009682907]","[0.23670721, 0.23670721]"
struggling,[0.10712366],[0.052662566],[0.052333802],[],[0.0796655],[0.076842785],[0.05853714],[0.020090828],[0.22240086],[0.07690313],...,[0.11988971],[0.15207517],[0.16518636],[0.09114686],[0.17694753],[],[0.09438409],[0.07204249],[0.018609447],[0.17996968]
situation,[0.046724994],[0.045278594],[0.20955454],[],[0.049148176],[0.05835045],[0.02326626],[-0.03067905],[0.06304469],[0.049595706],...,[0.046919152],[0.06789708],[0.09239874],[0.100803845],[0.0603837],[],[0.15285343],[0.024691502],[-0.009682907],[0.23670721]
%,"[0.008056514, 0.008056514]","[0.20469183, 0.20469183]","[0.026824536, 0.026824536]",[],"[0.015281253, 0.015281253]","[-0.034571387, -0.034571387]","[0.078825526, 0.078825526]","[0.023241833, 0.023241833]","[0.05053803, 0.05053803]","[0.18259677, 0.18259677]",...,"[0.07092823, 0.07092823]","[0.09300359, 0.09300359]","[-0.0017056912, -0.0017056912]","[0.023498055, 0.023498055]","[0.019637551, 0.019637551]",[],"[0.09812751, 0.09812751]","[0.05106765, 0.05106765]","[0.10707511, 0.10707511]","[0.0658559, 0.0658559]"


In [768]:
cand_df

Unnamed: 0,candidates,cand_tags,cand_text,cand_len
0,(Morocco Tunisia Libya Greece Turkey Each one ...,"[\n {\n ""id"": 1,\n ""text"": ""it"",\n ""...",Morocco Tunisia Libya Greece Turkey Each one o...,19
1,"(access to territory & asylum, living conditio...","[\n {\n ""id"": 1,\n ""text"": ""3"",\n ""l...","access to territory & asylum, living condition...",19
2,(economic migrants who cross developed nations...,"[\n {\n ""id"": 1,\n ""text"": ""They"",\n ...",economic migrants who cross developed nations ...,19
3,"(population & children for 39%, of whom more t...","[\n {\n ""id"": 1,\n ""text"": ""asylum"",\n ...","population & children for 39%, of whom more th...",19
4,"(brink of lockdown, outbreak at asylum seeker ...","[\n {\n ""id"": 1,\n ""text"": ""help"",\n ...","brink of lockdown, outbreak at asylum seeker h...",18
...,...,...,...,...
337,"(riots!, riots, {!, riots}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Greece"",\n ...",riots!,1
338,"(who, who, {who}, group)","[\n {\n ""id"": 1,\n ""text"": ""they"",\n ...",who,1
339,"(MSM, MSM, {MSM}, group-ne)","[\n {\n ""id"": 1,\n ""text"": ""candidate_t...",MSM,1
340,"(2040, 2040, {2040}, misc)","[\n {\n ""id"": 1,\n ""text"": ""2"",\n ""l...",2040,1


In [765]:
from nltk.corpus import stopwords

nltk_words = list(stopwords.words('english'))


tweets_corpus = list(preprocessing.preprocess_tweets(event_df['Tweet Raw'][200:300]))


word_properties = defaultdict(dict)
for i in tweets_corpus:
    tweet_words = [word  for word in i.split() if word not in nltk_words and len(word)>1]
    for word in tweet_words:
        #print(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(f'sim of {word}, {prop} is {model.similarity(word, prop)}')
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
                #print(weights)
                if max(weights)>0.4:
                    word_properties[word][prop] = max(weights)
            except KeyError:
                pass
            



print(word_properties)
        

            

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 917.18it/s]


defaultdict(<class 'dict'>, {'Greeces': {'economisation': 0.4905119}, 'get': {'reception': 0.50504476}, 'EU': {'economisation': 0.51138544}, 'Greece': {'economisation': 0.42833072}, 'currency': {'economisation': 0.633761}, 'need': {'humanitarian': 0.46502197}, 'money': {'economisation': 1.0}, 'give': {'reception': 0.5135118}, 'refugee': {'reception': 0.60989624}, 'murder': {'criminality': 0.53367186, 'victimization': 0.40415478}, 'migrants': {'reception': 0.4879586}, 'cash': {'economisation': 0.6151221}, 'Refugee': {'reception': 0.46804923}, 'support': {'humanitarian': 1.0}, 'refuge': {'reception': 0.41259414}, 'country': {'victimization': 0.43598586}, 'send': {'reception': 0.46330065}, 'Muslims': {'victimization': 0.4740045}, 'jihadists': {'victimization': 0.40899324}, 'Islam': {'victimization': 0.5393352}, 'gang': {'criminality': 0.48914832}, 'financial': {'economisation': 0.56026417}, 'needed': {'humanitarian': 0.50178003}, 'help': {'humanitarian': 1.0}, 'helped': {'humanitarian': 0

In [864]:
# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus) 

all_tweets_list = all_tweets_list + ['Muslim refugees is government']

for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

101it [00:30,  3.36it/s]


'Greeces Moria migrant camp quarantined after first Covid - 19 case via @TheNationalUAE'

In [867]:
from collections import defaultdict

cand_frames = defaultdict(list)

framed_words = pd.DataFrame(columns=['word',list(frame_properties.keys())])


for tweet_id in tqdm(range(len(tagged_tweets))):
    #print(tweet)
    np_heads = [[word.id, word.text,word.head] for sent in tagged_tweets[tweet_id].sentences for word in sent.words]
    #print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')
    #print(np_heads)
    #print(len(cand_df['candidates']))
    for cand in cand_df['candidates']:
        print(cand[0])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:  
            
            #if str(cand[1]) in str(tweet):
            #print(phrase_head)
            if phrase_head in tagged_tweets[tweet_id].text and len(phrase_head)>1:
                #print(phrase_head)
                #find all dependencies of the phrase head
                for related in range(len(np_heads)):
                    #print(np_heads[related])
                    if phrase_head == np_heads[related][1]:
                        related_word = np_heads[np_heads[related][2]-1][1]
                        
                        print(f'\n\n checking {phrase_head}_{related_word}')
                
                        cand_frames['word'].append(phrase_head)
                        #cand_frames['word'].append(phrase_head)
                        for frame_property in list(frame_properties.keys()):
                            #print(frame_property)

                            try:
                                #print(word_properties[phrase_head][frame_property])
                                cand_frames[frame_property].append(word_properties[related_word][frame_property])
                                
                            except KeyError:
                                #print('Error')
                                #cand_frames[frame_property].append(word_properties['tent'][frame_property])
                                cand_frames[frame_property].append(None)
                
                    #print(len(cand_frames[frame_property]))
                    
                #print('\n')
                
                
                """for frame_property in list(frame_properties.keys()):
                    for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    #print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                            #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                        except KeyError:
                            pass"""
                        #[cand_frames[seed_word][cand].append(model.similarity(print(f'{cand} is related to {np_heads[np_heads[related][2]+1][1]}') if cand == np_heads[related][1] else print('nej') for related in range(len(np_heads))]
            #print(get_head(cand))
            #print(np_heads[19][1])
            #[f(x) if condition else g(x) for x in sequence]
            #[print(np_heads[np_heads[related][2]-1]) if get_head(cand)==np_heads[related][1] else print('hi') for related in range(len(np_heads))]

            
#became ___ (vb and vbx)
#(VP sit/VB (PP on/IN (NP the/DT mat/NN))))) 

#cand_frames

  2%|█▌                                                                                | 2/101 [00:00<00:05, 17.86it/s]

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just 

  5%|████                                                                              | 5/101 [00:00<00:05, 18.28it/s]


aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_refugee
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmi

 10%|████████                                                                         | 10/101 [00:00<00:04, 19.31it/s]




 checking 19_COVID


 checking COVID_Case
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19


 checking 19_COVID


 checking COVID_Case
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_migration
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker

 15%|████████████                                                                     | 15/101 [00:00<00:04, 18.95it/s]


lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_return


 checking Greece_back


 checking Greece_Greece
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes


 checking people_foreigners
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece


 checking Greece_return


 checking Gre

 18%|██████████████▍                                                                  | 18/101 [00:00<00:03, 20.82it/s]

Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration


 checking covid_basic
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_to
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece


 checking Greece_to
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset


 checking Greece_to
Its sounds more a ki

 21%|████████████████▊                                                                | 21/101 [00:00<00:03, 20.82it/s]


the supply of ready - made food
That' s how a waterless region
the latter claim (HRW & NGOs)
great solidarity towards 1 0 0
muslim britherhood and political islam expansionism
less than 72 hours, all thanks
the chance of breeding bird behaviour
m not defending them after it
the same colour passport as you
Your sneaky political game didn' t
no . . . . .
a hole in Syrian Refugees' boats
the island of Lesbos and the
isis terrorists they trade with israel
the poor quality of the it
the only foreign language you know
Turkish military & a refugee woman
their strategic partnership with @eBay
the work of refugee-led groups
their many services and resources
a hell of a lot
the weapons of mass migration
Pakistan and Afghanistan and Bagladesh
Somali migrant infected with coronavirus
us & our partners joy
the end of the year


 checking year_old
Mujtaba and Im from Afganistan
very constructive and open exchanges
m Greek, many in America
part of the new normal
both aegean and kastelorizo eez
the c

 24%|███████████████████▏                                                             | 24/101 [00:01<00:04, 19.01it/s]


the ones who want to contribute and have something to offer
afghanistan (76%), syria (7%) and Democratic Republic of the congo (7%)
its the result of an entire continent acting in its perceived
critical issues that must be addressed, incl the urgent need 2
Viotia Refugee Detention Camp in GREECE Locked Down for COVID-19 greece
The new Greek Asylum Service microsite () that replaces .
a recent lack of cooperation from the Turkish coast guard
its 12 mile maritime territory all turkey needs to do
It' s a shame Greece learned that too late .
I' m very interested in how you know their systems
the best ideas in our online event innowise challenge labs
Greece : 1/2 of UK size, twice of asylum seekers.


 checking Greece_Italy
a potential embellishments of hard facts and harsh reality
a chance to crack down on the unwanted visitors
Cemeteries just for unidentified migrant bodies from the Mediterranean
The infected person , a 40 year old man
a country that has been your dreams for centuries
ge

 28%|██████████████████████▍                                                          | 28/101 [00:01<00:03, 21.06it/s]

part of the new normal
both aegean and kastelorizo eez
the climate of the south
the exploitation of migrant workers
the Cordon Sanitaire & more


 checking more_billion
the catch an ISIS member
her phenomenal team in greece


 checking greece_been
Their stories matter too .
a way to political threat
UNHCR & other agencies
no immigrant in Birmingham
s Hell) Closed camps
closed and controlled structures
the integration of women
animals for manipulating refuges
2 claims in NYT
Surly a smallish length
control over the media
Roger biland Daniele Francois
a stronger, motivated govt
Grandmother who I knew
the city of Rome
the RICs very low
a local mini market
figures from Italian government
your army and people
the 2 k goal
the correct moral response
gift much needed shoes
room, roommate, work, collaboration
leaver no one behind
Nigerian embassy in Athens
Greek militans kil .
families who are struggling
the one their fled
uncertain fates in europe


 checking europe_spent
your line of reasoni

 31%|████████████████████████▊                                                        | 31/101 [00:01<00:03, 18.37it/s]


G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through 

 37%|█████████████████████████████▋                                                   | 37/101 [00:01<00:03, 19.12it/s]

Don' t
great hospitality
Giannis story
Bertelsmann study
Priti Patel
its obvious
Muslim factions
my fieldwork
migration fluxes
you officials
Lesbos latest
your workplaces
a flashpoint
ready dumbfuck
the spread
Didn '
The Bessudos
this burden
Polish Jews
the government
The transfer
a scoop
Raquel Bessudo
bird photography
terrorist propaganda
some brain
olive branches
a crime
an excuse
59 turks
severe claustrophobia
our culture
@tr724 araclyla
their Fathers
the start
Dear sir
the US
europeans latinos
Refugee Covid
NO BORDERS
t discriminate
a tent
the entrance
the U.K.
pro-government opinion
a definition
My names
their visit
gas Exploration
awful thogh
bombs Iraq
its arms
the call
the same
More evidence
raw realism
a necessity
deadly attacks
whos existence
Mr Stoltenberg
my data
The impossible
an email
URGENT FUNDS
this effort
nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercr

 40%|████████████████████████████████                                                 | 40/101 [00:01<00:02, 20.79it/s]


nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK




 43%|██████████████████████████████████▍                                              | 43/101 [00:02<00:03, 19.24it/s]


@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration


 checking asylum_irrelevant
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking France_happens


 checking Greece_People
population & children for 39%,

 46%|████████████████████████████████████▉                                            | 46/101 [00:02<00:02, 20.71it/s]




 checking asylum_seeking


 checking asylum_means


 checking islands_.
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking France_Those


 checking Greece_asylum


 checking UK_came


 checking Italy_in
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece


 checking Greece_asylum
t work as Earlier Greece have forcefully pushed migrants in Turkish water with which Turkey was Upset


 checking Greece_asylum
Its sounds more a kin to a penny dreadful sci fi novel than an economic treatise .
camp on Lesbos, where just under 13,000 people are l

 51%|█████████████████████████████████████████▋                                       | 52/101 [00:02<00:02, 19.35it/s]

10000
@CERCmigration
return
Spring
10,000
Borders/MSF


 checking Borders_Doctors
moria


 checking moria_camp
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos


 checking lesbos_island
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost


 checking Turkey_Many
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_know
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking

 55%|████████████████████████████████████████████▉                                    | 56/101 [00:02<00:02, 20.50it/s]

the weapons of mass migration
Pakistan and Afghanistan and Bagladesh
Somali migrant infected with coronavirus
us & our partners joy
the end of the year
Mujtaba and Im from Afganistan
very constructive and open exchanges
m Greek, many in America
part of the new normal
both aegean and kastelorizo eez
the climate of the south
the exploitation of migrant workers
the Cordon Sanitaire & more
the catch an ISIS member
her phenomenal team in greece


 checking greece_spirit
Their stories matter too .
a way to political threat
UNHCR & other agencies
no immigrant in Birmingham
s Hell) Closed camps
closed and controlled structures
the integration of women
animals for manipulating refuges
2 claims in NYT
Surly a smallish length
control over the media
Roger biland Daniele Francois
a stronger, motivated govt
Grandmother who I knew
the city of Rome
the RICs very low
a local mini market
figures from Italian government
your army and people
the 2 k goal
the correct moral response
gift much needed shoes
r

 58%|███████████████████████████████████████████████▎                                 | 59/101 [00:02<00:01, 22.60it/s]


refuge in a 3 rd class eastern European country
7000 Turks from the beginning of the year
top of your $20,000 salary to see what
1984 the loive of kidnapping are with rene
This idea that the uk is being overwhelmed
the heavy handed tactics on peaceful turkish protestors
the increasingly militaristic regime of Recep Tayyip Erdogan
any, but the xenophobic, right wing nut jobs,
scapegoating UK citizens with our migrant history
First COVID-19 Case in Moria Refugee Camp


 checking COVID_still
territory for those in need of protection
the life you could not even imagine
sheap kidnap pay for e vew jeer
precious & the only means of contact
libya & east med (greece cyprus turkey)
people crossing sea' s in little boats
all the challenges the country has faced
their own countries, and pro-islamic extremism/illegal migration
You' re likely to get your head
a new Ottoman Empire it might seem
the supply of ready - made food
That' s how a waterless region
the latter claim (HRW & NGOs)
great solidar

 61%|█████████████████████████████████████████████████▋                               | 62/101 [00:02<00:01, 22.85it/s]


Dear sir
the US
europeans latinos
Refugee Covid
NO BORDERS
t discriminate
a tent
the entrance
the U.K.
pro-government opinion
a definition
My names
their visit
gas Exploration
awful thogh
bombs Iraq
its arms
the call
the same
More evidence
raw realism
a necessity
deadly attacks
whos existence
Mr Stoltenberg
my data
The impossible
an email
URGENT FUNDS
this effort
nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040


 64%|████████████████████████████████████████████████████▏                            | 65/101 [00:03<00:01, 20.39it/s]

the same
More evidence
raw realism
a necessity
deadly attacks
whos existence
Mr Stoltenberg
my data
The impossible
an email
URGENT FUNDS
this effort
nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of th

 70%|████████████████████████████████████████████████████████▉                        | 71/101 [00:03<00:01, 20.90it/s]

return


 checking return_Crete
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_camp
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian 

 73%|███████████████████████████████████████████████████████████▎                     | 74/101 [00:03<00:01, 18.59it/s]

its arms
the call
the same
More evidence
raw realism
a necessity
deadly attacks
whos existence
Mr Stoltenberg
my data
The impossible
an email
URGENT FUNDS
this effort
nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, th

 77%|██████████████████████████████████████████████████████████████▌                  | 78/101 [00:03<00:01, 20.80it/s]

That' s how a waterless region
the latter claim (HRW & NGOs)


 checking claim_.


 checking claim_they
great solidarity towards 1 0 0
muslim britherhood and political islam expansionism
less than 72 hours, all thanks
the chance of breeding bird behaviour
m not defending them after it
the same colour passport as you
Your sneaky political game didn' t
no . . . . .
a hole in Syrian Refugees' boats
the island of Lesbos and the
isis terrorists they trade with israel
the poor quality of the it
the only foreign language you know
Turkish military & a refugee woman
their strategic partnership with @eBay
the work of refugee-led groups
their many services and resources
a hell of a lot
the weapons of mass migration
Pakistan and Afghanistan and Bagladesh
Somali migrant infected with coronavirus
us & our partners joy
the end of the year
Mujtaba and Im from Afganistan
very constructive and open exchanges
m Greek, many in America


 checking many_they
part of the new normal
both aegean and kasteloriz

 80%|████████████████████████████████████████████████████████████████▉                | 81/101 [00:03<00:00, 21.07it/s]

Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration


 checking asylum_come
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_come


 checking come_children
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old


 checking children_ago
brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources birmingham
enough people in the U.K. willing to convince everyone else in the U.K. that trafficking through illegal routes
The largest terrorist organization in the world, the names of the Syrian refugees

 83%|███████████████████████████████████████████████████████████████████▎             | 84/101 [00:04<00:00, 18.58it/s]

the way who hosted me and spoke to me
refuge in a 3 rd class eastern European country
7000 Turks from the beginning of the year
top of your $20,000 salary to see what
1984 the loive of kidnapping are with rene
This idea that the uk is being overwhelmed
the heavy handed tactics on peaceful turkish protestors
the increasingly militaristic regime of Recep Tayyip Erdogan
any, but the xenophobic, right wing nut jobs,
scapegoating UK citizens with our migrant history


 checking citizens_sorry
First COVID-19 Case in Moria Refugee Camp
territory for those in need of protection
the life you could not even imagine
sheap kidnap pay for e vew jeer
precious & the only means of contact
libya & east med (greece cyprus turkey)
people crossing sea' s in little boats
all the challenges the country has faced
their own countries, and pro-islamic extremism/illegal migration
You' re likely to get your head
a new Ottoman Empire it might seem
the supply of ready - made food
That' s how a waterless region
the

 88%|███████████████████████████████████████████████████████████████████████▍         | 89/101 [00:04<00:00, 16.87it/s]

The infected person , a 40 year old man


 checking year_sick
a country that has been your dreams for centuries
germany (162,000), france (110,000), greece (65,000) and italy (49,000)
Two high ranking UN Refugee Agency officials on Friday
if there will be an EU or what it


 checking it_has


 checking be_over
refugees, discussed with authorities and NGOs refugee protection challenges
Portugal and parts of Greek Island on the list
a severe economic crisis and now the COVID-19 pandemic-@RaoufMazou
the way who hosted me and spoke to me
refuge in a 3 rd class eastern European country
7000 Turks from the beginning of the year
top of your $20,000 salary to see what
1984 the loive of kidnapping are with rene
This idea that the uk is being overwhelmed
the heavy handed tactics on peaceful turkish protestors
the increasingly militaristic regime of Recep Tayyip Erdogan
any, but the xenophobic, right wing nut jobs,
scapegoating UK citizens with our migrant history
First COVID-19 Case in Moria Ref

 90%|████████████████████████████████████████████████████████████████████████▉        | 91/101 [00:04<00:00, 17.70it/s]


Erdogans ersatz invaders
The Trump family
the full article
a larger presence
@PerilOfAfrica newsdeck COVID-19
Isaac Bessudos widow
the coronavirus pandemic
an example and
a long journey
' caring' acts
our clinic doors
its a term
the other side
your general stateme
refugee covid 19
Greece iwo men
Sexual assault ect
photo gr apy
five for hope
parts of Syria
@RaoufMazou and @UNHCRGreece
absolute pure dogshite
UN Refugee Agency
the UN convention
w/Asst High Commissioners
Little Ringed Plovers
the immigrants card
the Greece event
top of that


 checking that_for
a distant memory
devastating consequences
friday motivation
The situation
white westerners
Don' t
great hospitality
Giannis story
Bertelsmann study
Priti Patel
its obvious
Muslim factions
my fieldwork
migration fluxes
you officials
Lesbos latest
your workplaces
a flashpoint
ready dumbfuck
the spread
Didn '
The Bessudos
this burden
Polish Jews
the government
The transfer
a scoop
Raquel Bessudo
bird photography
terrorist propaganda
s

 94%|████████████████████████████████████████████████████████████████████████████▏    | 95/101 [00:04<00:00, 20.60it/s]

the entrance
the U.K.
pro-government opinion
a definition
My names
their visit
gas Exploration
awful thogh
bombs Iraq
its arms
the call
the same
More evidence
raw realism
a necessity
deadly attacks
whos existence
Mr Stoltenberg
my data
The impossible
an email
URGENT FUNDS
this effort
nothing &
your help
Greek SMEs
SAFE France
Refugee Camps
Refugee Covid
2018
He
Anarchy
Nobody
access
Illyrians
@MSF
negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies repres

100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [00:04<00:00, 20.32it/s]


negotiations
HAHAHA
Details
Siktir
refugeeovercrowding
flow
aesthetic
ManfredWeber
Leeds
1914
HU
1/3
@scwacy
comparisons
IRC
COVID-19
EIT
Donations
safety
No-one
connector
GdnDevelopment
G
today
Asylum
Questions
mercenaries
documents
Matt
Wed
Ill
God
Here
covid
unemployment
/
we
Omfg
COVID-19
Doctors
10000
@CERCmigration
return
Spring
10,000
Borders/MSF
moria
parents/carers
SOS
Ubtil
lesvos
2040
@akrokentrwos
charge
islands
Wales
attention!!!!
2040
danger
lesbos
History
Obama
soufli
flood
WE
riots!
who
MSM
2040
Greece
Morocco Tunisia Libya Greece Turkey Each one of these unnamed bodies represents a family searching endlessly for a lost
access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration
economic migrants who cross developed nations such as Greece, Italy, Spain, Germany and France to come to the UK


 checking Greece_Were
population & children for 39%, of whom more than 7 out of 10 are younger than 12 years old
bri




In [866]:
print(cand_frames.keys())

framed_words = pd.DataFrame.from_dict(cand_frames)

framed_words[framed_words['word']=='refugees'].tail(50)


dict_keys(['word', 'settlement', 'reception', 'security', 'criminality', 'economisation', 'humanitarian', 'victimization', 'integration'])


Unnamed: 0,word,settlement,reception,security,criminality,economisation,humanitarian,victimization,integration
436,refugees,,,,,,,,
446,refugees,,,,,,,,
478,refugees,,,,0.488541,,,0.450321,
479,refugees,,,,,,,,
485,refugees,,,,0.488541,,,0.450321,
486,refugees,,,,,,,,
538,refugees,,,,,,,,
548,refugees,,,,,,,,
589,refugees,,,,,,,,
596,refugees,,,,,,,,


In [750]:
#print(cand_df['candidates'])
for candidate in tqdm(tweets_corpus):
    for frame in list(frame_properties.keys()):      
        print(f'communication bias towards {candidate.split()[1]} in the frame {frame}  is {np.mean(cand_frames[frame][candidate.split()[1]])}')

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 632.92it/s]

communication bias towards Moria in the frame settlement  is nan
communication bias towards Moria in the frame reception  is nan
communication bias towards Moria in the frame security  is nan
communication bias towards Moria in the frame criminality  is nan
communication bias towards Moria in the frame economisation  is nan
communication bias towards Moria in the frame humanitarian  is nan
communication bias towards Moria in the frame victimization  is nan
communication bias towards Moria in the frame integration  is nan
communication bias towards covid in the frame settlement  is 0.14751476049423218
communication bias towards covid in the frame reception  is 0.049112480133771896
communication bias towards covid in the frame security  is 0.12066757678985596
communication bias towards covid in the frame criminality  is nan
communication bias towards covid in the frame economisation  is nan
communication bias towards covid in the frame humanitarian  is nan
communication bias towards covi




In [657]:
for frame in list(frame_properties.keys()):
    print(f'for {frame} bias is {np.mean(cand_frames[frame]["refugees"])}')


for settlement bias is nan
for reception bias is nan
for security bias is nan
for criminality bias is nan
for economisation bias is nan
for humanitarian bias is nan
for victimization bias is nan
for integration bias is nan


# TESTING:

In [744]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus) 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

100it [00:28,  3.51it/s]


'Greeces Moria migrant camp quarantined after first Covid - 19 case via @TheNationalUAE'

In [748]:




for tweet in tqdm(range(len(tweets_corpus))):
    print(tweets_corpus[tweet])
    np_heads = [[word.id, word.text,word.head,word.deprel] for sent in tagged_tweets[tweet].sentences for word in sent.words]
    print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet].sentences for word in sent.words], sep='\n')
    #print(np_heads)
    ph_ids = set([np_heads[i][2] for i in range(len(np_heads))])
    ph_words = [np_heads[i-1][1] for i in ph_ids]

    word_pairs = [(np_heads[word][1], np_heads[np_heads[word][2]-1][1]) for word in range(len(np_heads)) if np_heads[word][2] != 0]
    #print(word_pairs)
    
    compounds = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='compound']
    print(compounds)
    
    amods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='amod']
    print(amods)
    for pair in word_pairs:
        phrase = pair[0]+'_'+pair[1]

    #print(model.most_similar('illegal_immigrant'))

    
    """#print(len(cand_df['candidates']))
    candidate_list = cand_df['candidates']
    for cand in cand_df['candidates']:
        #print(cand[2])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:
            #print(phrase_head)
            #if str(cand[1]) in str(tweet):
            if str(phrase_head) in str(tweet) and len(phrase_head)>2:
                #print(phrase_head)
                ph_words = [np_heads[i-1][1] for i in phrase_heads]
                #print(ph_words)
                for related in range(len(np_heads)):
                    if phrase_head == np_heads[related][1]:
                        pass
                        #print(f'checking {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                  for frame_property in list(frame_properties.keys()):
                        for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    #cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                                #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                                except KeyError:
                                pass"""

 29%|███████████████████████▏                                                        | 29/100 [00:00<00:00, 287.14it/s]

Greeces Moria migrant camp quarantined after first Covid - 19 case via @TheNationalUAE
id: 1	word: Greeces        head id: 0    head: root      deprel: root
id: 2	word: Moria          head id: 1    head: Greeces   deprel: flat
id: 3	word: migrant        head id: 4    head: camp      deprel: amod
id: 4	word: camp           head id: 1    head: Greeces   deprel: appos
id: 5	word: quarantined    head id: 4    head: camp      deprel: acl
id: 6	word: after          head id: 11   head: case      deprel: case
id: 7	word: first          head id: 11   head: case      deprel: amod
id: 8	word: Covid          head id: 11   head: case      deprel: compound
id: 9	word: -              head id: 8    head: Covid     deprel: punct
id: 10	word: 19             head id: 8    head: Covid     deprel: nummod
id: 11	word: case           head id: 5    head: quarantineddeprel: obl
id: 12	word: via            head id: 14   head: TheNationalUAEdeprel: case
id: 13	word: @              head id: 14   head: TheNational

 65%|████████████████████████████████████████████████████                            | 65/100 [00:00<00:00, 191.85it/s]


id: 20	word: aren           head id: 22   head: t         deprel: cop
id: 21	word: '              head id: 22   head: t         deprel: punct
id: 22	word: t              head id: 8    head: awful     deprel: advcl
id: 23	word: to             head id: 25   head: tastes    deprel: case
id: 24	word: their          head id: 25   head: tastes    deprel: nmod:poss
id: 25	word: tastes         head id: 22   head: t         deprel: nmod
id: 26	word: .              head id: 2    head: guess     deprel: punct
id: 1	word: .              head id: 0    head: root      deprel: root
id: 1	word: .              head id: 0    head: root      deprel: root
[['asylum_seeker']]
[['European_country'], ['favorite_cereal']]
Oh do shut it with your silly comments . The UK takes on tiny percantage of refugees from around the world whilst Turkey, Greece, Germany, France take in hundreds of thousands and some, millions.
id: 1	word: Oh             head id: 3    head: shut      deprel: discourse
id: 2	word: do      

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 201.62it/s]


id: 11	word: detained       head id: 10   head: k         deprel: acl
id: 12	word: )              head id: 10   head: k         deprel: punct
id: 13	word: ,              head id: 14   head: afraid    deprel: punct
id: 14	word: afraid         head id: 5    head: prisons   deprel: parataxis
id: 15	word: to             head id: 16   head: express   deprel: mark
id: 16	word: express        head id: 14   head: afraid    deprel: xcomp
id: 17	word: their          head id: 18   head: opinion   deprel: nmod:poss
id: 18	word: opinion        head id: 16   head: express   deprel: obj
id: 19	word: or             head id: 20   head: asking    deprel: cc
id: 20	word: asking         head id: 16   head: express   deprel: conj
id: 21	word: asylum         head id: 20   head: asking    deprel: obj
id: 22	word: from           head id: 23   head: Greece    deprel: case
id: 23	word: Greece         head id: 20   head: asking    deprel: obl
id: 24	word: (              head id: 25   head: see       deprel: pun




In [732]:
%debug

> [1;32m<ipython-input-731-c1fc653e00ae>[0m(34)[0;36m<listcomp>[1;34m()[0m
[1;32m     32 [1;33m            [1;32mif[0m [0mstr[0m[1;33m([0m[0mphrase_head[0m[1;33m)[0m [1;32min[0m [0mstr[0m[1;33m([0m[0mtweet[0m[1;33m)[0m [1;32mand[0m [0mlen[0m[1;33m([0m[0mphrase_head[0m[1;33m)[0m[1;33m>[0m[1;36m2[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     33 [1;33m                [1;31m#print(phrase_head)[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 34 [1;33m                [0mph_words[0m [1;33m=[0m [1;33m[[0m[0mnp_heads[0m[1;33m[[0m[0mi[0m[1;33m-[0m[1;36m1[0m[1;33m][0m[1;33m[[0m[1;36m1[0m[1;33m][0m [1;32mfor[0m [0mi[0m [1;32min[0m [0mphrase_heads[0m[1;33m][0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     35 [1;33m                [1;31m#print(ph_words)[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     36 [1;33m                [1;32mfor[0m [0mrelated[0m [1;32min[0m [0mrange[0m[1;33m([0m