# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [52]:
import numpy as np
import pandas as pd
import math
import re
import sys

#from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt
%matplotlib inline
import preprocess

plt.style.use('ggplot')
#from preprocessing import get_processed_data, load_data
import csv
import stanza
import nltk
from nltk.corpus import wordnet
import spacy

from tqdm import tqdm

import time

### Import data 

In [14]:
event_df = pd.read_csv('moria_no_duplicates.csv', index_col=0)

event_df.reset_index(drop=True, inplace=True)
# given event date, split the dataset to pre and post event dfs

event_date = '2020-09-09'

#moria_df[['Date','Time']] = moria_df['Date Short'].astype(str).str.split(' ', 1, expand=True)
# create pre and post event partition
pre_event = event_df[event_df['Date Short'] < event_date]
post_event = event_df[event_df['Date Short'] >= event_date]

print('total tweets: ', event_df.shape[0])
print('Pre event tweets: ',pre_event.shape[0])
print('Post event tweets: ',post_event.shape[0])





total tweets:  18203
Pre event tweets:  3311
Post event tweets:  14892


## Data preprocessing

In [60]:
import re
import preprocessor as p
from ekphrasis.classes.tokenizer import SocialTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector

def remove_tweet_signatures(tweet):
    """
    Frequently occuring text and tweet signatures should be removed
    
    Input: full tweet text
    Output: tweet - the strings in the list
    """
    texts_to_remove = ["Greece has a deadly new migration policy and all of Europe is to blame",
                       "| The Guardian",
                       "| Photo via Evening Standard",
                       "| Greece",
                       "| DW News ",
                       "- @WashTimes",
                       "(Guardian) Story:",
                       " | Global development"
                  ]
    for text in texts_to_remove:
        tweet = tweet.replace(text,"")
    return tweet



puncttok = nltk.WordPunctTokenizer().tokenize
social_tokenizer = SocialTokenizer(lowercase=False).tokenize
detokenizer = TreebankWordDetokenizer()

sp = SpellCorrector(corpus="english") 
seg_eng = Segmenter(corpus="english") 

# preprocessor should remove emojis and urls in the tweets
p.set_options(p.OPT.URL, p.OPT.EMOJI)



Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


In [217]:

def preprocess_tweets(sample_df):

    sample_df = sample_df.apply(p.clean)
    
    sample_df = sample_df.apply(lambda x: remove_tweet_signatures(x))
    #sample_df = sample_df.apply(lambda x: social_tokenizer(x))
    #sample_df = sample_df.apply(lambda x: x.split())
    
    for twt in range(len(sample_df)):
        print(twt)
        # we are using social tokenizer due to potentially improper text structure
        #tweet = sample_df[twt].split()
        tweet = social_tokenizer(sample_df.iloc[twt])
        #print(tweet)

        #removing the irrelevant hashtags and mention using the heuristic of mentions in the beginning of the tweet 
        # and at least 2 consecutive hashtags at the end of the tweet 
        while tweet[0].startswith('@'):
            tweet.remove(tweet[0])
        while tweet[-1].startswith('#') and tweet[-1].startswith('#'):
            if tweet[-1].startswith('#') and tweet[-1].startswith('#'):
                tweet.remove(tweet[-1])

        #for the hashtags within text that may contain information, we remove the # and split the word into more if necessary
        for word in range(len(tweet)):
            if tweet[word].startswith('#'):
                tweet[word] = tweet[word].replace('#','')
                tweet[word] = seg_eng.segment(tweet[word])

            # potentially correct spelling - but it is not working very well
            tweet[word] = sp.correct(tweet[word])

        # instead of .join we use detokenizer in order to reconstruct the cleaned sentence in a better way
        #sample_df[twt] =  " ".join(tweet) 
        sample_df.iloc[twt] = detokenizer.detokenize(tweet)
        
    return sample_df

sample_df = preprocess_tweets(event_df['Tweet Raw'][101:103])
list(sample_df)

0
1


['greece unable to cope with of thousand refugees with as population of of million of and the border next to it wants as country of of million to be torn apart of god of of of ve never seen as fool like you together in my life of the problem is of you of re all idiots of',
 'and from there you can try to pass europe especially greece there are lots of boats you know but you have to know you might die from all of that this refuge thing is very fishy']

## Instantiate stanza english language module

In [119]:
#stanza.download("en")

In [63]:
en_nlp = stanza.Pipeline("en", ner_batch_size=4096)

2021-03-13 14:05:15 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-13 14:05:15 INFO: Use device: cpu
2021-03-13 14:05:15 INFO: Loading: tokenize
2021-03-13 14:05:15 INFO: Loading: pos
2021-03-13 14:05:16 INFO: Loading: lemma
2021-03-13 14:05:16 INFO: Loading: depparse
2021-03-13 14:05:17 INFO: Loading: sentiment
2021-03-13 14:05:18 INFO: Loading: ner
2021-03-13 14:05:19 INFO: Done loading processors!


In [7]:
# testing code
"""start = time.time()
en_doc = event_df["Tweet Raw"][:100].apply(en_nlp)
end = time.time()
print(f"Preprocessing the data took {end-start} seconds.")"""

Preprocessing the data took 168.26420331001282 seconds.


In [28]:
#directory = '../../export CORENLP_HOME=' ##ADD DIRECTORY HERE
stanza.install_corenlp()

#import os
#os.environ["CORENLP_HOME"] = directory



## As initial WCL candidates, we extract coreference chains and noun phrases (NPs).

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media)
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE?
* play around with candidate types
* optimize code and make it neater



In [64]:
from stanza.server import CoreNLPClient

# get noun phrases with tregex
def noun_phrases(_client, tweet, _annotators=None):
    """
    Input: _client = CoreNLPClient instance
           _text = tweet text
           _annotators = allowed CoreNLP operations
    Output: list of all noun phrases in the tweet
    """
    pattern = 'NP'
    matches = _client.tregex(tweet,pattern,annotators=_annotators)

    return [sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence]


In [65]:
noun_phrase_list = []
with CoreNLPClient(timeout=300000, memory='16G') as client:
    for tweet in tqdm(sample_df):
        noun_phrase = noun_phrases(client,str(tweet),_annotators="tokenize,ssplit,pos,lemma,parse,ner,coref")
        noun_phrase_list.append(noun_phrase)
        #noun_phrases(client,event_df["Tweet Raw"][:100],_annotators="tokenize,ssplit,pos,lemma,parse")

2021-03-13 14:05:42 INFO: Writing properties to tmp file: corenlp_server-ab539ba3e7af4ef8.props
2021-03-13 14:05:42 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-ab539ba3e7af4ef8.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:23<00:00,  2.04s/it]


In [75]:
def get_cand_len(cand_list):
    # calculates number of candidates in the corpus
    sum_len = 0
    for tweet_cands in cand_list:
        sum_len += len(tweet_cands)
    return sum_len


def tag_tweets(corpus):
    """
    Input: corpus of tweets to tag
    Output: List of tuples containing (POS-tags of each word, NER-tags of each named entity)
    """
    tweet_tags=[]
    for tweet in tqdm(corpus):
        #for np in np_tweets:
            #annotate the tweet
            doc = en_nlp(tweet)
            #extract POS and NE tags
            tweet_pos_tags={word.text: word.xpos for sent in doc.sentences for word in sent.words}
            tweet_ner= {ent.text: ent.type for sent in doc.sentences for ent in sent.ents}
            tweet_tags.append((tweet_pos_tags,tweet_ner))
    return tweet_tags  

tweet_tags = tag_tweets(sample_df) 


tweet_tags_set = set()

#get a set of all NER tags existing in corpus
for tweet in tweet_tags:
    tweet_ner_tags = set(tweet[1].values())
    tweet_tags_set.update(tweet_ner_tags)

print(tweet_tags_set)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:55<00:00,  1.75s/it]

{'TIME', 'NORP', 'ORG', 'WORK_OF_ART', 'PERCENT', 'GPE', 'EVENT', 'ORDINAL', 'QUANTITY', 'LOC', 'PERSON', 'DATE', 'CARDINAL'}





In [7]:
#Store the noun phrases in the pickle file
import pickle

with open('file_name_to_save', 'wb') as fp:
    pickle.dump(noun_phrase_list, fp)

NameError: name 'noun_phrase_list' is not defined

In [71]:
# Load NPs from pickle file
import pickle

with open(r"moria_noun_phrases", "rb") as input_file:
    noun_phrase_list = pickle.load(input_file)

#noun_phrase_list

## Keep only NPs shorter than 20 words and parent NPs 

In [76]:
print(get_cand_len(noun_phrase_list))
for tweet_nps in noun_phrase_list:
    #reverse the list of tweets nps so we avoid moving indexes and leaving out some phrases 
    for np in reversed(tweet_nps):
        i = tweet_nps.index(np)
        np_split = np.split()
        if len(np_split) > 19:
            tweet_nps.remove(tweet_nps[i])
            
print(get_cand_len(noun_phrase_list))

660
654


In [77]:
#silly but easy way to remove the child NP and keep only parents, run until the sum_len stops decreasing
after_removal_len = 0
while after_removal_len != get_cand_len(noun_phrase_list):
    after_removal_len = get_cand_len(noun_phrase_list)
    for tweet_nps in noun_phrase_list:
        for np in range(len(tweet_nps)):
            try:
                #if the subsequent np (child np) is contained in the current one, remove the child np
                if tweet_nps[np].find(tweet_nps[np+1]) != -1:
                    tweet_nps.remove(tweet_nps[np+1])

            #ignore the error caused with end of the list
            except IndexError:
                pass

print(get_cand_len(noun_phrase_list))

654


## Assign candidate types to candidates

In [78]:
#dictionary to assign candidate types based on named entities and part of speech tags
#the key tuple consists of (isNE, lexicographer type, plural)
cand_types = {(True,'PERSON',None):'person-ne',
              (True,'NORP',None):'person-ne',
              (True,'PERSON','plural'):'person-nes',
              (True,'NORP','plural'):'person-nes',
              (False,'PERSON',None):'person-nn',
              (False,'PERSON','plural'):'person-nns',
              (True,'ORG',None):'group-ne',
              (True,'FAC',None):'group-ne',
              (False,'ORG',None):'group',
              (True,'LOC',None):'loc-ne',
              (True,'GPE',None):'loc-ne',
              (False, 'LOC',None):'loc'
            }


In [81]:
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter

def get_synt_category(head):
    """
    Input: head word of the noun phrase e.g. 'aliens' from NP 'Illegal aliens' 
    Output: syntactic category of the head word as categorized using worndet
    """
    
    person_ss = wn.synsets("person")[0]
    #group_ss = wn.synsets("facility")[0]    
    place_ss = wn.synsets("location")[0]
    org_ss = wn.synsets("organization")[0]
    counter = 0
    synt_category = head
    try:
        while synt_category not in [None,'PERSON','LOC','ORG']:
            # words without meaning return empty lists and cause infinite loop, we need to throw error
            assert len(wn.synsets(synt_category))>0, f"{synt_category} has no synonyms"
            
            for ss in wn.synsets(synt_category):
                counter += 1                
                #print(ss.lemmas())
                #for hyper in ss.hypernyms():
                assert len(ss.hypernyms())>0, f"{ss} has no hypernyms"
                hyper = ss.hypernyms()[0]
                
                #print(f'for {synt_category} synonyms are: {ss}, hypernyms are: {hyper}')
                #print(f'synonym with person: {ss.wup_similarity(person_ss)}')
                #print(f'hypernym with person: {hyper.wup_similarity(person_ss)}')
                #print(f'with group: {ss.wup_similarity(group_ss)}')
                #print(f'synonym with place: {ss.wup_similarity(place_ss)}')
                #print(f'hypernym with place: {hyper.wup_similarity(place_ss)}')

                #if the syntactic similarity to one of the categories is more than 0.7, select the category
                if ss.wup_similarity(person_ss) >= 0.7:
                    synt_category = 'PERSON'
                    break
                #elif ss.wup_similarity(group_ss) >= 0.7:
                    #synt_category = 'facility'
                    #break
                elif ss.wup_similarity(place_ss) >= 0.7:
                    synt_category = 'LOC'
                    break
                elif ss.wup_similarity(org_ss) >= 0.7:
                    synt_category = 'ORG'
                    break
                else:
                    # if the synset is not similar assign the hypernym synset
                    synt_category = hyper.lemma_names()[0]

                #force stop at level 5 of hypernym search
                if counter == 5:
                    synt_category = None
                    break
            
    except AssertionError:
        synt_category = None
        return synt_category

    #print(f'{head} turned into a candidate {synt_category}')  
    
    return synt_category


#test syntactic categories on list of noun phrases
results = Counter(word for tweet_nps in noun_phrase_list for np in tweet_nps for word in np.split())
#print(results)
list_of_cand_types = []
for head in results.keys():
    list_of_cand_types.append(get_synt_category(head))
    
count_types = Counter(list_of_cand_types)
print(count_types)

Counter({None: 748, 'PERSON': 81, 'ORG': 52, 'LOC': 47})


## assign candidate type to noun phrases

In [84]:
def get_cand_type(candidate, WCLcands = noun_phrase_list, i=i):
    """
    Input: list of all noun phrases occurring in one tweet
    Output: list of pairs of np (string) and its candidate type (string) in a tuple for each np of the tweet
    """
    #i = WCLcands.index(candidate)
    np_cand_type = []
    for np in candidate:
        
        #annotate the noun phrase to find noun head (((((could be done before?)))))
        doc = en_nlp(np)  
        
        #the head of noun phrase is marked with value 0 for the word.head
        np_heads = {word.text: word.head for sent in doc.sentences for word in sent.words}
        for word, head in np_heads.items():  
            if head == int(0):
                np_head = word
        
        #print(f'the head of "{np}" is {np_head}')
        
        #check if the noun phrase contains an NE tag
        # possible problem - if three is marked as NE then three children will be a NE, should they be???????????
        isNE = False
        #print(np_pos_tags[i])
        for key in tweet_tags[i][1].keys():
            if key in np:
                isNE = True                
        
        # identified entity will be none if the head is not a named entity, if it is, the NER tag will be assigned
        ner_tag = None
        
        for key in tweet_tags[i][1].keys():
            if np_head in key:
                ner_tag = tweet_tags[i][1][key]
        #if np_head in np_pos_tags[i][1].keys():
        #   identified_ner = np_pos_tags[i][1][np_head]       
        
        identified_ner = ner_tag if ner_tag != None else get_synt_category(np_head)
            
        #print(np_pos_tags[i])
        pos_number = None
        if np_head in tweet_tags[i][0].keys():
            pos_tag = tweet_tags[i][0][np_head]
            pos_number = 'plural' if pos_tag in ['NNS','NNPS'] and identified_ner in ['person','PERSON'] else None
        
        #we want to create a tuple of (is_named_entity, NE_tag/synt_category, POS-tag)
        pre_cand_type = (isNE, identified_ner, pos_number)
        
        #print(f'\n isNE: {isNE}, ner: {identified_ner}, pos: {pos_number}')
        cand_type = cand_types[pre_cand_type] if pre_cand_type in cand_types.keys() else 'misc'
        np_cand_type_pair = (np,np_head,cand_type)
        #print(np_cand_type_pair)
        np_cand_type.append(np_cand_type_pair)
    return np_cand_type



In [1277]:
noun_phrase_list

[['it',
  'asylum in Greece , MAC , SER or CRO or BUL',
  '3 EU members',
  'HU',
  'They',
  'help',
  'the first safe country as asylum seeker',
  'the 5',
  'No law',
  'you',
  'it'],
 ['ISIS refuge .',
  "Wouldn ' t",
  'Turkey',
  'I',
  "You ' re likely to get your head",
  'the beaten track',
  'Christian Greece',
  'a similar climate',
  'islamic turkey'],
 ['Greece', 'refugee overcrowding', 'UN'],
 ['1 / 3',
  'Law',
  'the idle stuff',
  'All rights of the Turkish minority in Greece',
  'the law',
  'Refugees',
  'the EU',
  'the law',
  'Western states',
  'the states',
  'the Middle East and Africa',
  'underground resources',
  'the law'],
 ['the asylum seekers',
  'HU borders',
  'They',
  'a long journey',
  'Greece',
  'they',
  'help',
  'they',
  'multiple more countries'],
 ['I',
  '#photograpy pushing aesthetic into #news',
  'it',
  'a potential embellishments of hard facts and harsh reality',
  'I',
  'raw realism',
  'I',
  'this boundary',
  '@FiLMiCPro',
  'th

In [85]:
# label the noun phrases with the candidate types

np_and_cand_list = []
#print(noun_phrase_list[10])
for i in tqdm(range(len(noun_phrase_list))):
    np_and_cand_list.append(get_cand_type(noun_phrase_list[i]))
print(np_and_cand_list)   

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:44<00:00,  1.65s/it]

[[('it', 'it', 'misc'), ('asylum in Greece, MAC, SER or CRO or BUL', 'asylum', 'group-ne'), ('3 EU members', 'members', 'person-nes'), ('HU', 'HU', 'group-ne'), ('They', 'They', 'misc'), ('help', 'help', 'person-nn'), ('the first safe country as asylum seeker', 'country', 'group-ne'), ('the 5th', '5th', 'misc'), ('No law', 'law', 'misc'), ('you', 'you', 'misc'), ('it', 'it', 'misc')], [('ISIS refuge.', 'refuge', 'loc'), ('Turkey', 'Turkey', 'misc'), ('I', 'I', 'misc'), ('You', 'You', 'misc'), ('your head', 'head', 'person-nn'), ('the beaten track', 'track', 'loc'), ('Christian Greece', 'Christian', 'person-ne'), ('a similar climate', 'climate', 'misc'), ('islamic turkey', 'turkey', 'misc')], [('Greece', 'Greece', 'loc-ne'), ('refugee overcrowding', 'overcrowding', 'group'), ('UN', 'UN', 'group')], [('1/3', '1/3', 'misc'), ('Law', 'Law', 'misc'), ('the idle stuff', 'stuff', 'loc'), ('All rights of the Turkish minority in Greece', 'rights', 'loc-ne'), ('the law', 'law', 'misc'), ('Refuge




In [86]:
from collections import Counter

cand_type_count = [np[1] for nps in np_and_cand_list for np in nps]
    
counts = Counter(cand_type_count)
print(counts)

Counter({'Greece': 37, 'refugees': 16, 'it': 15, 'you': 15, 'I': 15, 'they': 14, 'EU': 8, 'people': 8, 'greece': 7, 'asylum': 6, 'It': 6, 'this': 6, 'camp': 6, 'them': 6, 'Turkey': 5, 'visit': 5, 'we': 5, 'This': 5, 'He': 5, 'thousands': 5, 'They': 4, 'country': 4, 'law': 4, 'seekers': 4, 'countries': 4, 'sparks': 4, 'There': 4, 'solidarity': 4, 'sea': 4, 'help': 3, 'refuge': 3, 'You': 3, 'Europe': 3, 'immigrants': 3, 'States': 3, 'immigration': 3, 'work': 3, 'applications': 3, 'challenges': 3, 'he': 3, 'covid': 3, 'one': 3, 'case': 3, 'members': 2, 'UN': 2, 'states': 2, 'borders': 2, 'France': 2, 'war': 2, 'Raquel': 2, 'parents': 2, 'years': 2, 'crisis': 2, 'border': 2, 'me': 2, 'problem': 2, 'life': 2, 'family': 2, 'Erdogan': 2, 'tension': 2, 'migration': 2, 'uk': 2, 'end': 2, 'support': 2, 'there': 2, 'Athens': 2, 'August': 2, 'tent': 2, 'camps': 2, 'issues': 2, 'Commissioners': 2, 'world': 2, 'route': 2, 'migrants': 2, 'way': 2, 'lesvos': 2, 'quarantine': 2, 'time': 2, 'conditions'

## If wordnet is not working well, try sense2vec and spacy

In [40]:
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")


#vector_map.load(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")


<sense2vec.component.Sense2VecComponent at 0x2d9ce2c6b08>

In [63]:
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")
vector = s2v["natural_language_processing|NOUN"]
most_similar = s2v.most_similar("animal|NOUN", n=10)


In [64]:
most_similar


[('wild_animal|NOUN', 0.8854),
 ('living_animal|NOUN', 0.8663),
 ('animals|NOUN', 0.863),
 ('domesticated_animal|NOUN', 0.8566),
 ('living_creature|NOUN', 0.8442),
 ('other_animals|NOUN', 0.8427),
 ('other_animal|NOUN', 0.8415),
 ('farm_animal|NOUN', 0.8369),
 ('live_animal|NOUN', 0.8266),
 ('single_animal|NOUN', 0.8223)]

## Comparison of NER performance using SpaCy language module

In [22]:
import spacy


en_nlp = spacy.load("en_core_web_sm")


spacy_ner=[]
for np_tweets in event_df["Tweet Raw"][:50]:
    #for np in np_tweets:
        doc = en_nlp(np_tweets)
        #ner = en_nlp.add_pipe("ner")
        tweet_pos_tags={token.text: token.tag_ for token in doc}
        tweet_ner= {ent.text: ent.label_ for ent in doc.ents}
        #tweet_pos_tags={word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #tweet_ner= {ent.text: ent.type for sent in doc.sentences for ent in sent.ents}
        spacy_ner.append((tweet_pos_tags,tweet_ner))
        
spacy_ner

[({'@sztiv5': 'NN',
   '@Juliivan': 'NNP',
   '_': 'NN',
   'Yes': 'UH',
   ',': ',',
   'why': 'WRB',
   '?': '.',
   'Why': 'WRB',
   'it': 'PRP',
   'was': 'VBD',
   'n’t': 'RB',
   'good': 'JJ',
   'to': 'TO',
   'apply': 'VB',
   'for': 'IN',
   'asylum': 'NN',
   'in': 'IN',
   'Greece': 'NNP',
   'MAC': 'NNP',
   'SER': 'NNP',
   'or': 'CC',
   'CRO': 'NNP',
   'BUL': 'NNP',
   'together': 'RB',
   '3': 'CD',
   'EU': 'NNP',
   'members': 'NNS',
   'before': 'IN',
   'HU': 'NNP',
   'They': 'PRP',
   'must': 'MD',
   'get': 'VB',
   'help': 'NN',
   'the': 'DT',
   'first': 'JJ',
   'safe': 'JJ',
   'country': 'NN',
   'as': 'IN',
   'seeker': 'NN',
   'not': 'RB',
   '5th': 'NN',
   '.': '.',
   'No': 'DT',
   'law': 'NN',
   'says': 'VBZ',
   'you': 'PRP',
   'can': 'MD',
   'pick': 'VB',
   'and': 'CC',
   'choose': 'VB'},
  {'@Juliivan': 'PERSON',
   'Greece': 'GPE',
   'MAC': 'ORG',
   'SER': 'ORG',
   'CRO': 'ORG',
   'BUL': 'ORG',
   '3': 'CARDINAL',
   'EU': 'ORG',
   'H

In [178]:
pos_tags_set = set()
for tweet in spacy_ner:
    tweet_pos_tags = set(tweet[1].values())
    pos_tags_set.update(tweet_pos_tags)

print(len(spacy_ner))
for nm in range(len(spacy_ner)):
    
    #print(event_df["Tweet Raw"][nm])
    

    #print(xposses)
    print(np_xpos[nm][1])
    print('\n\n')

50
{'Juliivan_': 'PERSON', 'Greece': 'GPE', 'MAC': 'ORG', 'SER': 'ORG', 'CRO': 'ORG', 'BUL': 'ORG', '3': 'CARDINAL', 'EU': 'ORG', 'HU': 'ORG', 'first': 'ORDINAL', 'the 5th': 'DATE'}



{'ISIS': 'ORG', 'Turkey': 'GPE', 'Christian': 'NORP'}



{'Greece': 'GPE', 'UN': 'ORG'}



{'1': 'CARDINAL', 'Turkish': 'NORP', 'Greece': 'GPE', 'EU': 'ORG', 'Western': 'NORP', 'the Middle East': 'LOC', 'Africa': 'LOC'}



{'HU': 'ORG', 'Greece': 'GPE'}



{'#Lesbos': 'FAC', 'Moria #migrants camp': 'FAC'}



{'GreeceMFA': 'ORG', 'Manfred': 'PERSON', 'Weber': 'PERSON', 'Europe': 'LOC', 'Germany': 'GPE', 'Greece': 'GPE', 'German': 'NORP', 'MFA': 'ORG'}



{'Refugee Covid case': 'EVENT', 'Guardian': 'ORG'}



{'9PM': 'TIME', 'UK': 'GPE', 'Leeds': 'GPE', 'Birmingham': 'GPE', 'Greece': 'GPE', '2': 'CARDINAL'}



{'Trump': 'PERSON', 'France': 'GPE', 'Turkey': 'GPE', 'Syria': 'GPE', 'Iraq': 'GPE', 'Libya': 'GPE', 'Greece': 'GPE', 'Cyprus': 'GPE', 'Hamas': 'ORG'}



{'Raquel Bessudo': 'PERSON', 'Isaac Bessudo': 

## Get coreference chains from the tweet corpus

In [87]:
def get_coref_chain(tweet,client):

    ann = client.annotate(tweet)        
    tweet_chains = ann.corefChain
    all_chains = list()
    
    
    for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            
            mychain.append(ment_word)
            
        #the corefering words will be stored alongside the index of their representative in a tuple
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
    return all_chains


dict_of_tweet_corefs = {}
#corefs = []
with CoreNLPClient(properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}, memory='16G') as client:
    for tweet_index in tqdm(range(len(sample_df))):
        tweet_corefs=[]
        #print(f'Coreferences for the tweet {list(event_df["Tweet Raw"]).index(tweet)} are:')
        for chain in get_coref_chain(sample_df[tweet_index],client):
            tweet_corefs.append(chain)
            #print(' <-> '.join(chain),'\n')
        #corefs.append(tweet_corefs)
        dict_of_tweet_corefs[tweet_index] = tweet_corefs

2021-03-13 17:11:16 INFO: Writing properties to tmp file: corenlp_server-ce493a4013fe4d83.props
2021-03-13 17:11:16 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-ce493a4013fe4d83.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:05<00:00,  1.26s/it]


In [1196]:
corefs

[[(['No law', 'it'], 0)],
 [(['You', 'your', 'you'], 0)],
 [],
 [(['the law', 'the law', 'the law'], 0)],
 [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 [(['#photograpy pushing aesthetic', 'it'], 0), (['I', 'I'], 0)],
 [(['@GreeceMFA ManfredWeber', 'It'], 0)],
 [],
 [],
 [(['Turkey', 'Turkey', 'it', 'it'], 0), (['you', 'you'], 0)],
 [(['The Bessudos', 'Isaac Bessudos'], 1),
  (['Raquel', '@realPR_Phoenix @DrEstella @BravoTV Raquel Bessudo'], 1)],
 [(['@Mproyklis @fragoua @LearnerLerner', 'them', 'their'], 0)],
 [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 [],
 [(['It', 'Greece'], 1)],
 [],
 [],
 [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 [(['UN Refugee Agency', 'Anadolu Agency'], 0)],
 [],
 [],
 [],
 [(['we', 'we'], 0)],
 [],
 [],
 [],
 [(['they', 'Enterprise Greece', 'their', 'they', 'their'], 

In [26]:
# FOR TESTING PURPOSES
all_chains = []
for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            #chain_rep = chain.representative
            #coref_group = (ment_word,chain_rep)
            
            mychain.append(ment_word)
            
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
        
all_chains

NameError: name 'tweet_chains' is not defined

In [239]:
with open('moria_tweet_corefs', 'wb') as fp:
    pickle.dump(dict_of_tweet_corefs, fp)

dict_of_tweet_corefs

{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['#photograpy pushing aesthetic', 'it'], 0), (['I', 'I'], 0)],
 6: [(['@GreeceMFA ManfredWeber', 'It'], 0)],
 7: [],
 8: [],
 9: [(['Turkey', 'Turkey', 'it', 'it'], 0), (['you', 'you'], 0)],
 10: [(['Raquel',
    'Raquel',
    '@realPR_Phoenix @DrEstella @BravoTV Raquel Bessudo',
    'Isaac Bessudo'],
   2)],
 11: [(['@Mproyklis @fragoua @LearnerLerner', 'them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['Anadolu Agency', 'UN Refugee Agency'], 1)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'w

## Determining candidate's type for corefs

In [362]:
# Load NPs from pickle file
import pickle

with open(r"moria_tweet_corefs", "rb") as input_file:
    corefs = pickle.load(input_file)



In [89]:
corefs_list = []
for tweet_corefs in dict_of_tweet_corefs:
    try:
        tw_corefs = [coref[0][coref[1]] for coref in dict_of_tweet_corefs[tweet_corefs]]        
        corefs_list.append(tw_corefs)
    except IndexError:
        corefs_list.append([])

        
corefs_list       
#corefs[9][0][0][corefs[0][0][1]]

[['No law'],
 ['You'],
 [],
 ['the law'],
 ['the asylum seekers'],
 ['I', 'aesthetic'],
 [],
 [],
 [],
 ['Turkey', 'you'],
 ['Isaac Bessudos', 'Raquel Bessudo'],
 ['them'],
 ['you', 'it', 'they'],
 [],
 ['Greece'],
 [],
 [],
 ['Both men'],
 ['10 refugees , including three children with disabilities', 'Greece'],
 ['UN Refugee Agency'],
 [],
 [],
 [],
 ['we'],
 [],
 [],
 [],
 ['Enterprise Greece'],
 [],
 ['they'],
 ['He'],
 ['Erdoan'],
 ['UK', 'UK size'],
 [],
 ['the camp', 'The infected person , a 40 year old man', 'Moria'],
 [],
 ['Greece', 'Protection & Operations , @GillianTriggs & @RaoufMazou'],
 ['greece', 'I', 'you'],
 [],
 [],
 [],
 [],
 [],
 ['you'],
 ['Turkey - Greece'],
 ['I'],
 [],
 [],
 ['Greece',
  'Those who seek refuge in Greece , those who want to make a coup',
  'freedom of expression',
  'your'],
 [],
 ['Europe', '2040'],
 ['People', 'me', 'this poor nation'],
 ['Syrian refugees', 'Erdogan'],
 [],
 [],
 [],
 ['it'],
 ['your'],
 [],
 [],
 [],
 [],
 [],
 ['it', 'you'],
 

In [90]:
coref_and_cand_list = []
for i in tqdm(range(len(corefs_list))):
    coref_and_cand_list.append(get_cand_type(corefs_list[i],corefs_list))
print(coref_and_cand_list)  

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.74it/s]

[[('No law', 'law', 'misc')], [('You', 'You', 'misc')], [], [('the law', 'law', 'misc')], [('the asylum seekers', 'seekers', 'person-nn')], [('I', 'I', 'misc'), ('aesthetic', 'aesthetic', 'misc')], [], [], [], [('Turkey', 'Turkey', 'misc'), ('you', 'you', 'misc')], [('Isaac Bessudos', 'Isaac', 'misc'), ('Raquel Bessudo', 'Raquel', 'misc')], [('them', 'them', 'misc')], [('you', 'you', 'misc'), ('it', 'it', 'misc'), ('they', 'they', 'misc')], [], [('Greece', 'Greece', 'loc-ne')], [], [], [('Both men', 'men', 'group')], [('10 refugees , including three children with disabilities', 'refugees', 'person-nn'), ('Greece', 'Greece', 'loc-ne')], [('UN Refugee Agency', 'Agency', 'group')], [], [], [], [('we', 'we', 'misc')], [], [], [], [('Enterprise Greece', 'Greece', 'loc-ne')], [], [('they', 'they', 'misc')], [('He', 'He', 'misc')], [('Erdoan', 'Erdoan', 'misc')], [('UK', 'UK', 'misc'), ('UK size', 'size', 'misc')], [], [('the camp', 'camp', 'group'), ('The infected person , a 40 year old man'




## Candidate merging

We organize candidates in a list sorted by their number of phrases

In [91]:
#concatenate corefs and noun phrase lists
candidate_list = coref_and_cand_list + np_and_cand_list
#unpack list of lists into one list
candidate_list = [cand for cands in candidate_list for cand in cands]

In [95]:
"""def get_cand_len(cand):
    #sort candidate list by count of words in the first element of the tuple
    return len(cand[0].split())"""

candidate_list.sort(reverse=True, key=get_cand_len) 

[('offered high-quality, trauma -sensitive, identity-informed pss & education support to child & youth refugees in northern Greece',
  'offered',
  'misc'),
 ('access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration',
  'access',
  'misc'),
 ('The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece',
  'organization',
  'group-ne'),
 ('Greek, many in America consider Greeks white because white westerners claimed our culture, history and ethnic heritage',
  'consider',
  'misc'),
 ('Leeds on brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources',
  'Leeds',
  'misc'),
 ('The first recorded coronavirus case in Moria refugee camp on Lesbos, where just under 13,000 people are l.....',
  'case',
  'misc'),
 ('4 million migrants housed in turkey!if Greece gets its 12 mile maritime territory all turkey needs to do',
  'g

In [96]:


def get_head(phrase):
    #annotate the noun phrase to find noun head (((((could be done before?)))))
    doc = en_nlp(phrase)  

    #the head of noun phrase is marked with value 0 for the word.head
    relations = {word.text: word.head for sent in doc.sentences for word in sent.words}
    for word, phrase in relations.items():  
        if phrase == int(0):
            np_head = word
            return np_head


### First merging step

In [101]:
#
# THIS IS THE FIRST MERGING STEP
#
        
def merging_step1(candidate_list):
    """
    In the first merging step, we merge two candidates if the head of each of their representative phrase 
     is identical by string comparison.
    """
    indices_to_remove = set()
    for longer_cand in range(len(candidate_list)):     
        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list[longer_cand][1]} checking the index {candidate_list[cand][1]}')

            #performing merging only for NE candidates of the same type
            if 'ne' in candidate_list[longer_cand][2]:
                #mark for merging if the head and its head's cand type is the same for 2 candidates
                if candidate_list[longer_cand][1] == candidate_list[cand][1] and candidate_list[longer_cand][2] == candidate_list[cand][2]:
                    #print(f'matching "{longer_cand}" with "{cand}"')
                    indices_to_remove.add(cand)
    return indices_to_remove

def merge_indices(candidate_list,indices_to_remove):                
    print(len(candidate_list))                
    print(len(sorted(indices_to_remove)))

    for index in reversed(sorted(indices_to_remove)):
        candidate_list.remove(candidate_list[index])

    print(len(candidate_list)) 
    return candidate_list


candidate_list = merge_indices(candidate_list, merging_step1(candidate_list))

665
0
665


In [1050]:
candidate_list[136]

('Law', 'Law', 'misc')

### Second merging step

Simplification - we are taking the vector of the main phrase head instead of the mean of all heads

In [102]:
import gensim

#load the GoogleNews 300dim model (fix path)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Desktop\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [107]:
#adjust for sets of phrases in the candidate
def merging_step2(candidate_list):
    
    indices_to_remove = set()
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]

        for cand in range(longer_cand+1,len(candidate_list)): 
            #print(f'for index {candidate_list.index(longer_cand)} checking the index {candidate_list.index(cand)}')
            if candidate_list[longer_cand][1] == candidate_list[cand][1]:
                print(f'matching "{longer_cand}" with "{cand}"')
            try:
                if candidate_list[longer_cand][2] == candidate_list[cand][2]:
                    if model.similarity(candidate_list[longer_cand][1], candidate_list[cand][1]) >= 0.5:
                        print(f'matching "{longer_cand}" with "{cand}"') 
                        #print(f'{longer_cand[1]} and  {cand[1]} matched with sim {model.similarity(longer_cand[1], cand[1])}')
                        indices_to_remove.add(cand)
                elif model.similarity(candidate_list[longer_cand][1], candidate_list[cand][1]) >= 0.7:
                    print(f'matching "{longer_cand}" with "{cand}"') 
                    #print(f'{longer_cand[1]} and  {cand[1]} matched with sim {model.similarity(longer_cand[1], cand[1])}')
                    indices_to_remove.add(cand)
                else:
                    pass
            except KeyError:
                pass
    return indices_to_remove

candidate_list = merge_indices(candidate_list, merging_step2(candidate_list))


matching "27" with "227"
matching "178" with "240"
243
0
243


In [111]:
candidate_list[240]

('2040', '2040', 'misc')

## Third merging step representative labeling

currently working on average cosine similarity of each phrase in the candidate - maybe not optimal, maybe it will be better with a different threshold

In [113]:
from sklearn.cluster import AffinityPropagation
import numpy 
from sklearn.metrics.pairwise import cosine_similarity

def merging_step3(candidate_list):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for np in candidate_list:
        #annotate the noun phrase to find noun head (((((could be done before?)))))
        doc = en_nlp(np[0])  

        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for sent in doc.sentences for word in sent.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            #head-1 because the pointer to head does not use 0 index
            if (pos == 'JJ' or pos=='VBN') and 'NN' in np_heads_pos[head-1][2]:
                cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
        phrases.append(cand_np_phrases)
        
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = numpy.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):

                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))

                                
                    if numpy.mean(sim_matrix) > 0.3:
                        #print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove
                


def phrases_vectors(cand_phrases):
    
#for cand_phrases in phrases:
    #print(cand_phrases)
    cand_phrase_vectors = []
    for phrase in cand_phrases:
        try:
            cand_phrase_vectors.append(model[phrase])
            #print(f'for existing phrase "{phrase}" the vector is {model[phrase][0]}')
        except KeyError:
            phrase_words = phrase.split('_')
            #print(model[phrase_words[1]])
            try:
                phrase_vectors = [model[phrase_word] for phrase_word in phrase_words]
                #print(f'for phrase "{phrase}" avg vector is "{sum(phrase_vectors)/len(phrase_vectors)}') 
                cand_phrase_vectors.append(sum(phrase_vectors)/len(phrase_vectors))
            except KeyError:
                pass
    #print(len(cand_phrase_vectors))
    return cand_phrase_vectors
    
    
candidate_list = merge_indices(candidate_list, merging_step3(candidate_list))
#print(indices_to_remove)

174
0
174


In [1055]:
candidate_list[66]
    

('HU borders', 'borders', 'misc')

### Merging step 4

In [114]:
# missing the second method - we check for the lexical identity of specific stems in multiple candidates.

def merging_step4(candidate_list):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for np in candidate_list:
        #annotate the noun phrase to find noun head (((((could be done before?)))))
        doc = en_nlp(np[0])  

        #the head of noun phrase is marked with value 0 for the word.head
        np_heads_pos = [(word.text, word.head, word.xpos) for sent in doc.sentences for word in sent.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_np_phrases = []
        for word, head, pos in np_heads_pos:
            i = np_heads_pos.index((word, head, pos))
            #print(np_heads_pos)
            #print(np_heads_pos[i])
            #print(np_heads_pos[head-1])
            #'NN' in np_heads_pos[head-1][2] and
            try:
                if 'NN' in pos and 'NN' in np_heads_pos[i+1][2] : 
                    cand_np_phrases.append(f'{word}_{np_heads_pos[i+1][0]}')
                if 'NN' in pos and 'NN' in np_heads_pos[head-1][2]:
                    cand_np_phrases.append(f'{word}_{np_heads_pos[head-1][0]}')
            except IndexError:
                pass
        phrases.append(cand_np_phrases)
    
    # 2. we compare the similarities of candidates' phrases
    for longer_cand in range(len(candidate_list)):     
        i = candidate_list[longer_cand]
        long_cand_vectors = phrases_vectors(phrases[longer_cand])
        if len(long_cand_vectors)==0:
            pass
        else:
            for cand in range(longer_cand+1,len(candidate_list)): 
                short_cand_vectors = phrases_vectors(phrases[cand])
                if len(short_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = numpy.zeros((len(long_cand_vectors),len(short_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(long_cand_vectors)):
                        for j in range(len(short_cand_vectors)):
                            #print(cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)))
                            sim_matrix[i][j] = cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1))
                            """if cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.4:                
                                sim_matrix[i][j] = 2
                            elif cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.2:
                                sim_matrix[i][j] = 1
                            else:
                                sim_matrix[i][j] = 0"""

                                
                    if numpy.mean(sim_matrix) > 0.6:
                        print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(cand)
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove

candidate_list = merge_indices(candidate_list, merging_step4(candidate_list))
#print(merging_step4(candidate_list))

35 and 53 are 0.7770951787630717 similar
35 and 55 are 0.7165836095809937 similar
35 and 58 are 0.6098455190658569 similar
36 and 58 are 0.639373779296875 similar
43 and 64 are 0.7002375523249308 similar
43 and 76 are 0.7452705502510071 similar
53 and 55 are 0.7586894035339355 similar
63 and 134 are 0.6065908074378967 similar
64 and 76 are 0.6570812861124674 similar
100 and 119 are 0.7657126188278198 similar
100 and 121 are 0.6914123296737671 similar
119 and 121 are 0.761222779750824 similar
174
8
166


In [115]:
candidate_list

[('offered high-quality, trauma -sensitive, identity-informed pss & education support to child & youth refugees in northern Greece',
  'offered',
  'misc'),
 ('access to territory & asylum, living conditions on mainland & islands, the management of the covid 19 response, integration',
  'access',
  'misc'),
 ('The largest terrorist organization in the world, the names of the Syrian refugees, to enter Bulgaria to Greece',
  'organization',
  'group-ne'),
 ('Greek, many in America consider Greeks white because white westerners claimed our culture, history and ethnic heritage',
  'consider',
  'misc'),
 ('Leeds on brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources',
  'Leeds',
  'misc'),
 ('The first recorded coronavirus case in Moria refugee camp on Lesbos, where just under 13,000 people are l.....',
  'case',
  'misc'),
 ("recent flare ups in TX&Florida (migrants from Mexico&Caribbean), Greece early closing vs Spain' s late 1",
  'u

### Merging step 5


In [1316]:
for i in event_df['Tweet Raw'][:100]:
    print(i)

@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.
@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey
Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo
@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?
@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journey through Greece,

## Frame identification

In [151]:
frame_properties = {'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']}


"""                   'importance':
                    'unimportance':
                    'power/leadership':
                    'weakness/passiveness':
                    'good quality':
                    'poor quality':
                    'safety':
                    'unsafety':
                    'positive':
                    'negative':
                    
                    
                   }"""

"                   'importance':\n                    'unimportance':\n                    'power/leadership':\n                    'weakness/passiveness':\n                    'good quality':\n                    'poor quality':\n                    'safety':\n                    'unsafety':\n                    'positive':\n                    'negative':\n                    \n                    \n                   }"

In [120]:
import conceptnet_lite as cn
import gensim.downloader as api


# to run on the server we can use larger "conceptnet-numberbatch-17-06-300"
model = api.load("glove-twitter-200")


@Kkkk09240868
@0khalodi0
@POTUS
Also,
Eedogan
has
been
documented
using
ISIS
militants
aka
terrorists,
he
played
the
immigrants
card
as
a
way
to
political
threat
to
Europe,
he
pushed
immigrants
to
greece
and
europe
for
their
own
death,
after
letting
them
homeless
for
years
As
I
said,
All
Muslims
are
guilty
@Nionios1908
@kitsikis
Greece
unable
to
cope
with
60
thousand
refugees
with
a
population
of
10
million.
and
the
border
next
to
it
wants
a
country
of
83
million
to
be
torn
apart.
God,
I've
never
seen
a
fool
like
you
together
in
my
life.
the
problem
is,
you're
all
idiots.😂
@hama_ashad
@realDonaldTrump
and
from
there
you
can
try
to
pass
Europe
especially
Greece
there
are
lots
of
boats
you
know
but
you
have
to
know
you
might
die
from
all
of
that
this
refugge
thing
is
very
fishy
@hama_ashad
@realDonaldTrump
I
dont
know.
I
dont
live
in
Iraq.
In
Europe
many
migrants
walked
from
Greece
to
Norway/sweden/Germany.
I
guess
you
can
do
the
same.
Just
from
Iraq
to
Greece.
The
life
as
migrant
is
awf

dependend
of
EU
money.
Wothout
the
EU
greek
people
would
seek
refuge
in
Turkey.
HAHAHA
@Susan60190970
@AndreAp0ll0
@itvnews
@emmamurphyitv
Please
tell
me,
what
exactly
do
you
know
about
the
asylum
seekers
systems
in
countries
like
France,
Germany
and
Greece?
Details
please.
I'm
very
interested
in
how
you
know
their
systems
are
flawless
and
don't
discriminate
🙃
via
@PerilOfAfrica
#Newsdeck
COVID-19:
Greece
reports
first
coronavirus
case
in
Moria
migrant
camp
on
Lesbos:
ATHENS,
Sept
2
(Reuters)
-
Greece
recorded
its
first
coronavirus
case
in
the
overcrowded
migrant
camp
of
Moria
on
the
island
of
Lesbos
and
the…
https://t.co/O31oc3V6j0
https://t.co/24C6hWAyVz
@BenTheSilent
@AndreAp0ll0
@itvnews
@emmamurphyitv
That's
not
what
i
said
refugees
who
are
brought
in
through
the
proper
channels
are
vetted.
Do
you
watch
what's
happening
france
Germany
and
Greece.
If
they
were
genuine
they
wouldn't
have
been
refused
asylum
in
the
countries
they've
passed
No
I
don't
think
they
are
all
criminals
@Spu

In [220]:
tweets_corpus = list(preprocess_tweets(event_df['Tweet Raw'][101:103]))

# add stopwords removal
word_properties = {}
for i in tweets_corpus:
    #print(i)
    for word in social_tokenizer(i):
        #print(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(model.similarity(word, 'affection'))
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
            except KeyError:
                pass
            
            if max(weights)>0.4:
                property_list.append((prop,max(weights)))
        word_properties[word] = property_list

print(word_properties)
        

            

0
1
['greece unable to cope with of thousand refugees with as population of of million of and the border next to it wants as country of of million to be torn apart of god of of of ve never seen as fool like you together in my life of the problem is of you of re all idiots of', 'and from there you can try to pass europe especially greece there are lots of boats you know but you have to know you might die from all of that this refuge thing is very fishy']
{'greece': [], 'unable': [], 'to': [('affection', 0.6572346), ('refusal', 0.6572346), ('reason', 0.62308306), ('unreason/irrationality', 0.62308306), ('easiness', 0.62308306), ('difficulty', 0.45636457), ('honor', 0.4067731), ('dishonor', 0.4067731)], 'cope': [], 'with': [('affection', 0.7271991), ('refusal', 0.7271991), ('reason', 0.59030914), ('unreason/irrationality', 0.59030914), ('easiness', 0.59030914), ('difficulty', 0.5170113)], 'of': [('affection', 0.69872916), ('refusal', 0.69872916), ('no trustworthiness', 0.42871174), ('reas