## Part 3 Loading Data & EDA

Import relevant packages for the following parts

In [1]:
import numpy as np
import pandas as pd
import math
import re
import sys

#from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt
%matplotlib inline
import preprocess

plt.style.use('ggplot')
#from preprocessing import get_processed_data, load_data
import csv
import stanza
from nltk.corpus import wordnet
import spacy

from tqdm import tqdm

import time

### Import data cleaned by using the selfwritten preprocessing module

In [2]:
event_df = pd.read_csv('moria_no_duplicates.csv', index_col=0)

event_df.reset_index(drop=True, inplace=True)
# given event date, split the dataset to pre and post event dfs

event_date = '2020-09-09'

#moria_df[['Date','Time']] = moria_df['Date Short'].astype(str).str.split(' ', 1, expand=True)
# create pre and post event partition
pre_event = event_df[event_df['Date Short'] < event_date]
post_event = event_df[event_df['Date Short'] >= event_date]

print('total tweets: ', event_df.shape[0])
print('Pre event tweets: ',pre_event.shape[0])
print('Post event tweets: ',post_event.shape[0])





total tweets:  18203
Pre event tweets:  3311
Post event tweets:  14892


In [3]:
list(event_df["Tweet Raw"])[:100]

['@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.',
 "@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey",
 'Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo',
 '@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?',
 '@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journ

In [5]:
#stanza.download("en")

In [4]:
en_nlp = stanza.Pipeline("en", ner_batch_size=128)

2021-03-05 15:14:09 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-05 15:14:09 INFO: Use device: cpu
2021-03-05 15:14:09 INFO: Loading: tokenize
2021-03-05 15:14:09 INFO: Loading: pos
2021-03-05 15:14:10 INFO: Loading: lemma
2021-03-05 15:14:10 INFO: Loading: depparse
2021-03-05 15:14:11 INFO: Loading: sentiment
2021-03-05 15:14:12 INFO: Loading: ner
2021-03-05 15:14:13 INFO: Done loading processors!


In [7]:
start = time.time()
en_doc = event_df["Tweet Raw"][:100].apply(en_nlp)
end = time.time()
print(f"Preprocessing the data took {end-start} seconds.")

Preprocessing the data took 168.26420331001282 seconds.


In [10]:
en_doc.iloc[0]

[
  [
    {
      "id": 1,
      "text": "@sztiv5",
      "lemma": "@sztiv5",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 0,
      "deprel": "root",
      "misc": "start_char=0|end_char=7",
      "ner": "O"
    },
    {
      "id": 2,
      "text": "@",
      "lemma": "@",
      "upos": "ADP",
      "xpos": "IN",
      "head": 3,
      "deprel": "case",
      "misc": "start_char=8|end_char=9",
      "ner": "O"
    },
    {
      "id": 3,
      "text": "Juliivan_",
      "lemma": "Juliivan_",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 1,
      "deprel": "nmod",
      "misc": "start_char=9|end_char=18",
      "ner": "S-PERSON"
    }
  ],
  [
    {
      "id": 1,
      "text": "Yes",
      "lemma": "yes",
      "upos": "INTJ",
      "xpos": "UH",
      "head": 3,
      "deprel": "discourse",
      "misc": "start_char=19|end_char=22",
      "ner": "O"
    },
    {
      "id": 2,
      "text": ",",
    

In [28]:
#directory = '../../export CORENLP_HOME=' ##ADD DIRECTORY HERE
stanza.install_corenlp()

#import os
#os.environ["CORENLP_HOME"] = directory



## As initial WCL candidates, we extract coreference chains and noun phrases (NPs).

### SOME PREPROCESSING NEEDED
* remove links
* remove/merge mentions?
* remove recurring texts (signatures of news media)
* remove # from hashtags?
* exclude NERs that tag numbers
* play around with candidate types
* optimize code and make it neater
* remove noun phrases longer than 20 words

In [5]:
from stanza.server import CoreNLPClient

# get noun phrases with tregex
def noun_phrases(_client, tweet, _annotators=None):
    """
    Input: _client = CoreNLPClient instance
           _text = tweet text
           _annotators = allowed CoreNLP operations
    Output: list of all noun phrases in the tweet
    """
    pattern = 'NP'
    matches = _client.tregex(tweet,pattern,annotators=_annotators)

    return [sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence]


In [8]:
noun_phrase_list = []
with CoreNLPClient(timeout=300000, memory='16G') as client:
    for tweet in tqdm(event_df["Tweet Raw"]):
        noun_phrase = noun_phrases(client,str(tweet),_annotators="tokenize,ssplit,pos,lemma,parse,ner,coref")
        noun_phrase_list.append(noun_phrase)
        #noun_phrases(client,event_df["Tweet Raw"][:100],_annotators="tokenize,ssplit,pos,lemma,parse")

2021-02-24 10:01:34 INFO: Writing properties to tmp file: corenlp_server-799910cec1b243da.props
2021-02-24 10:01:34 INFO: Starting server with command: java -Xmx16G -cp export CORENLP_HOME=/path/to/stanford-corenlp-4.1.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-799910cec1b243da.props -preload -outputFormat serialized
100%|██████████████████████████████████████████████████████████████████████████| 18203/18203 [3:00:23<00:00,  1.68it/s]


In [128]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS IS A TEST CODE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

with CoreNLPClient(timeout=300000, memory='16G') as client:
    #ann = client.annotate(event_df["Tweet Raw"][:10])  
    noun_phrase_list = []
    for tweet in tqdm(event_df["Tweet Raw"][:3]):
        doc = en_nlp(tweet)

        noun_phrase,np_head = noun_phrases(client,str(tweet),doc, _annotators="tokenize,ssplit,pos,lemma,parse,ner,coref")
        noun_phrase_list.append((noun_phrase,np_head))

noun_phrase_list

2021-03-04 21:23:48 INFO: Writing properties to tmp file: corenlp_server-f4d04816a03c4a14.props
2021-03-04 21:23:48 INFO: Starting server with command: java -Xmx16G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-f4d04816a03c4a14.props -preload -outputFormat serialized
 10%|████████▎                                                                          | 1/10 [01:10<10:33, 70.39s/it]

@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.


 20%|████████████████▌                                                                  | 2/10 [01:15<06:47, 50.88s/it]

@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey


 30%|████████████████████████▉                                                          | 3/10 [01:17<04:13, 36.17s/it]

Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo


 40%|█████████████████████████████████▏                                                 | 4/10 [01:23<02:43, 27.24s/it]

@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?


 50%|█████████████████████████████████████████▌                                         | 5/10 [01:28<01:41, 20.32s/it]

@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journey through Greece, where they must apply for help, yet they travelled through multiple more countries, why?


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:33<01:03, 15.84s/it]

I never loved #photograpy pushing aesthetic into #news cause it is a potential embellishments of hard facts and harsh reality.I like raw realism. Did I pass this boundary using @FiLMiCPro #firstlight this week in #Lesbos infamous Moria #migrants camp? That’s a big question4me https://t.co/rlEZP5mXmd


 70%|██████████████████████████████████████████████████████████                         | 7/10 [01:36<00:36, 12.11s/it]

@YOsmanli1453 @ArisKallimachos @GreeceMFA ManfredWeber: It must be made clear that Europe and Germany are on the side of Greece. The German MFA must be careful not to be blackmailed in fear of a new refugee crisis.


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [01:39<00:18,  9.35s/it]

Refugee Covid case sparks 'closed camps' fears on Lesbos | Greece | The Guardian https://t.co/hp8JcOf8dB


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [01:45<00:08,  8.13s/it]

9PM Coronavirus UK LIVE: Leeds on brink of lockdown, outbreak at asylum seeker hostel in Birmingham and Greece chaos Stories from 2 sources | Photo via Evening Standard https://t.co/hrcDUI72DF #birmingham #coronavirus #coronavirusoutbreak https://t.co/FOn9kkanHn


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:49<00:00, 10.99s/it]

@Iceman070259590 @OPCW @realDonaldTrump @mikepompeo Are you sure you' re talking about France and not Turkey? Turkey has invaded parts of Syria, bombs Iraq, conducts a proxy war in Libya creating refugees. On top of that, it violates Greece's and Cyprus' sovereignties. Not to mention that it provides safe heaven for Hamas.





[(['@sztiv5 @Juliivan_ Yes, why?',
   '@sztiv5',
   '@Juliivan_ Yes, why',
   '@Juliivan_',
   'it',
   'asylum in Greece, MAC, SER or CRO or BUL',
   'asylum in Greece, MAC, SER or CRO',
   'asylum',
   'Greece, MAC, SER or CRO',
   'BUL',
   '3 EU members',
   'HU',
   'They',
   'help',
   'the first safe country as asylum seeker',
   'the first safe country',
   'asylum seeker',
   'the 5th',
   'No law',
   'you',
   'it'],
  'says'),
 (['@GoTurkey ISIS',
   'refuge',
   'Turkey',
   'I',
   'You',
   'your head',
   'you',
   'the beaten track',
   'Christian Greece',
   'a similar climate',
   'islamic turkey'],
  'Go'),
 (['Greece', 'refugee overcrowding', 'UN', 'https://t.co/UDM4GDMcmo'],
  'improve'),
 (['@ThisIsOzcan @Nervana_1 @EGozuguzelli',
   '1/3',
   'Law',
   'the idle stuff',
   'All rights of the Turkish minority in Greece',
   'All rights',
   'the Turkish minority in Greece',
   'the Turkish minority',
   'Greece',
   'the law',
   'Refugees',
   'the EU',
   'the

In [11]:
#Store the noun phrases in the pickle file
import pickle

with open('file_name_to_save', 'wb') as fp:
    pickle.dump(noun_phrase_list, fp)

In [6]:
# Load NPs from pickle file
import pickle

with open(r"moria_noun_phrases", "rb") as input_file:
    noun_phrase_list = pickle.load(input_file)

## Keep only parent NPs (haven't removed longer than 20 words)

In [214]:
#silly but easy way to remove the child NP and keep only parents, run until the sum_len stops decreasing

for tweet_nps in noun_phrase_list:
    for np in range(len(tweet_nps)):
        #print(tweet_nps[np+1])
        try:
            if tweet_nps[np].find(tweet_nps[np+1]) != -1:
                #print('hey')
                tweet_nps.remove(tweet_nps[np+1])
                #print('no')
        #ignore the error caused with end of the list
        except IndexError:
            pass

sum_len = 0
for tweet in noun_phrase_list:
    sum_len += len(tweet)
    
sum_len

122277

In [198]:
noun_phrase_list

[['@sztiv5 @Juliivan_ Yes, why?',
  '@Juliivan_ Yes, why',
  'it',
  'asylum in Greece, MAC, SER or CRO or BUL',
  'asylum',
  'Greece, MAC, SER or CRO',
  'BUL',
  '3 EU members',
  'HU',
  'They',
  'help',
  'the first safe country as asylum seeker',
  'asylum seeker',
  'the 5th',
  'No law',
  'you',
  'it'],
 ['@GoTurkey ISIS',
  'refuge',
  'Turkey',
  'I',
  'You',
  'your head',
  'the beaten track',
  'Christian Greece',
  'a similar climate',
  'islamic turkey'],
 ['Greece', 'refugee overcrowding', 'UN', 'https://t.co/UDM4GDMcmo'],
 ['@ThisIsOzcan @Nervana_1 @EGozuguzelli',
  '1/3',
  'Law',
  'the idle stuff',
  'All rights of the Turkish minority in Greece',
  'the Turkish minority in Greece',
  'Greece',
  'the law',
  'Refugees',
  'the EU',
  'the law',
  'Western states',
  'the states',
  'the Middle East and Africa',
  'Africa',
  'underground resources',
  'the law'],
 ['@Juliivan_ @sztiv5',
  'the asylum seekers',
  'HU borders',
  'They',
  'a long journey',
  'Gr

In [60]:

def tag_tweets(corpus):
    """
    Input: corpus of tweets to tag
    Output: List of tuples containing (POS-tags of each word, NER-tags of each named entity)
    """
    np_pos_tags=[]
    for tweet in tqdm(corpus):
        #for np in np_tweets:
            #annotate the tweet
            doc = en_nlp(tweet)
            #extract POS and NE tags
            tweet_pos_tags={word.text: word.xpos for sent in doc.sentences for word in sent.words}
            tweet_ner= {ent.text: ent.type for sent in doc.sentences for ent in sent.ents}
            np_pos_tags.append((tweet_pos_tags,tweet_ner))
    return np_pos_tags  

np_pos_tags = tag_tweets(event_df["Tweet Raw"][:50]) 



pos_tags_set = set()

#get a set of all NER tags existing in corpus
for tweet in np_pos_tags:
    tweet_pos_tags = set(tweet[1].values())
    pos_tags_set.update(tweet_pos_tags)

print(pos_tags_set)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:05<00:00,  1.32s/it]

{'ORDINAL', 'EVENT', 'GPE', 'NORP', 'PERSON', 'ORG', 'LOC', 'MONEY', 'PRODUCT', 'TIME', 'FAC', 'CARDINAL', 'DATE'}





In [61]:
#dictionary to assign candidate types based on named entities and part of speech tags
#the key tuple consists of (isNE, lexicographer type, plural)
cand_types = {(True,'PERSON',None):'person-ne',
              (True,'NORP',None):'person-ne',
              (True,'PERSON','plural'):'person-nes',
              (True,'NORP','plural'):'person-nes',
              (False,'PERSON',None):'person-nn',
              (False,'PERSON','plural'):'person-nns',
              (True,'ORG',None):'group-ne',
              (True,'FAC',None):'group-ne',
              (False,'ORG',None):'group',
              (True,'LOC',None):'loc-ne',
              (True,'GPE',None):'loc-ne',
              (False, 'LOC',None):'loc'
            }


In [64]:
import nltk
from nltk.corpus import wordnet as wn


def get_synt_category(head):
    """
    Input: head word of the noun phrase e.g. 'aliens' from NP 'Illegal aliens' 
    Output: syntactic category of the head word as categorized using worndet
    """
    
    person_ss = wn.synsets("person")[0]
    #group_ss = wn.synsets("facility")[0]    
    place_ss = wn.synsets("location")[0]
    org_ss = wn.synsets("organization")[0]
    counter = 0
    synt_category = head
    try:
        while synt_category not in [None,'PERSON','LOC','ORG']:
            # words without meaning return empty lists and cause infinite loop, we need to throw error
            assert len(wn.synsets(synt_category))>0, f"{synt_category} has no synonyms"
            
            for ss in wn.synsets(synt_category):
                counter += 1                
                #print(ss.lemmas())
                #for hyper in ss.hypernyms():
                assert len(ss.hypernyms())>0, f"{ss} has no hypernyms"
                hyper = ss.hypernyms()[0]
                
                #print(f'for {synt_category} synonyms are: {ss}, hypernyms are: {hyper}')
                #print(f'synonym with person: {ss.wup_similarity(person_ss)}')
                #print(f'hypernym with person: {hyper.wup_similarity(person_ss)}')
                #print(f'with group: {ss.wup_similarity(group_ss)}')
                #print(f'synonym with place: {ss.wup_similarity(place_ss)}')
                #print(f'hypernym with place: {hyper.wup_similarity(place_ss)}')

                #if the syntactic similarity to one of the categories is more than 0.7, select the category
                if ss.wup_similarity(person_ss) >= 0.7:
                    synt_category = 'PERSON'
                    break
                #elif ss.wup_similarity(group_ss) >= 0.7:
                    #synt_category = 'facility'
                    #break
                elif ss.wup_similarity(place_ss) >= 0.7:
                    synt_category = 'LOC'
                    break
                elif ss.wup_similarity(org_ss) >= 0.7:
                    synt_category = 'ORG'
                    break
                else:
                    # if the synset is not similar assign the hypernym synset
                    synt_category = hyper.lemma_names()[0]

                #force stop at level 5 of hypernym search
                if counter == 5:
                    synt_category = None
                    break
            
    except AssertionError:
        synt_category = None
        return synt_category

    #print(f'{head} turned into a candidate {synt_category}')  
    
    return synt_category

list_of_cand_types = []
for head in count_heads.keys():
    list_of_cand_types.append(get_synt_category(head))
count_types = Counter(list_of_cand_types)
print(count_types)
    

Counter({None: 541, 'PERSON': 58, 'LOC': 41, 'ORG': 40})


## assign candidate type to noun phrases

In [63]:
def get_cand_type(tweet_nps):
    """
    Input: list of all noun phrases occurring in one tweet
    Output: list of pairs of np (string) and its candidate type (string) in a tuple for each np of the tweet
    """
    i = noun_phrase_list.index(tweet_nps)
    np_cand_type = []
    for np in tweet_nps:
        
        #annotate the noun phrase to find noun head (((((could be done before?)))))
        doc = en_nlp(np)  
        
        #the head of noun phrase is marked with value 0 for the word.head
        np_heads = {word.text: word.head for sent in doc.sentences for word in sent.words}
        for word, head in np_heads.items():  
            if head == int(0):
                np_head = word
        
        #print(f'the head of "{np}" is {np_head}')
        
        #check if the noun phrase contains an NE tag
        # possible problem - if three is marked as NE then three children will be a NE, should they be???????????
        isNE = False
        for key in np_pos_tags[i][1].keys():
            if key in np:
                isNE = True                
        
        # identified entity will be none if the head is not a named entity, if it is, the NER tag will be assigned
        ner_tag = None
        
        for key in np_pos_tags[i][1].keys():
            if np_head in key:
                ner_tag = np_pos_tags[i][1][key]
        #if np_head in np_pos_tags[i][1].keys():
        #   identified_ner = np_pos_tags[i][1][np_head]       
        
        identified_ner = ner_tag if ner_tag != None else get_synt_category(np_head)
            
        #print(np_pos_tags[i])
        
        if np_head in np_pos_tags[i][0].keys():
            pos_tag = np_pos_tags[i][0][np_head]
            pos_number = 'plural' if pos_tag in ['NNS','NNPS'] and identified_ner in ['person','PERSON'] else None
        
        #we want to create a tuple of (is_named_entity, NE_tag/synt_category, POS-tag)
        pre_cand_type = (isNE, identified_ner, pos_number)
        
        #print(f'\n isNE: {isNE}, ner: {identified_ner}, pos: {pos_number}')
        cand_type = cand_types[pre_cand_type] if pre_cand_type in cand_types.keys() else 'misc'
        np_cand_type_pair = (np,cand_type)
        #print(np_cand_type_pair)
        np_cand_type.append(np_cand_type_pair)
    return np_cand_type

np_and_cand_list = []
#print(noun_phrase_list[10])
for tweet_nps in tqdm(noun_phrase_list[:20]):
    np_and_cand_list.append(get_cand_type(tweet_nps))
print(np_and_cand_list)    

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:05<00:00,  3.29s/it]

[[('@sztiv5 @Juliivan_ Yes, why?', 'misc'), ('@sztiv5', 'misc'), ('@Juliivan_ Yes, why', 'misc'), ('@Juliivan_', 'misc'), ('it', 'misc'), ('asylum in Greece, MAC, SER or CRO or BUL', 'group-ne'), ('asylum in Greece, MAC, SER or CRO', 'group-ne'), ('asylum', 'group'), ('Greece, MAC, SER or CRO', 'loc-ne'), ('BUL', 'group-ne'), ('3 EU members', 'person-nes'), ('HU', 'group-ne'), ('They', 'misc'), ('help', 'person-nn'), ('the first safe country as asylum seeker', 'group-ne'), ('the first safe country', 'group-ne'), ('asylum seeker', 'person-nn'), ('the 5th', 'misc'), ('No law', 'misc'), ('you', 'misc'), ('it', 'misc')], [('@GoTurkey ISIS', 'misc'), ('refuge', 'loc'), ('Turkey', 'loc-ne'), ('I', 'group'), ('You', 'misc'), ('your head', 'person-nn'), ('you', 'misc'), ('the beaten track', 'loc'), ('Christian Greece', 'person-ne'), ('a similar climate', 'misc'), ('islamic turkey', 'misc')], [('Greece', 'loc-ne'), ('refugee overcrowding', 'group'), ('UN', 'group-ne'), ('https://t.co/UDM4GDMcmo




In [58]:
cand_type_count = [np[1] for nps in np_and_cand_list for np in nps]
    
counts = Counter(cand_type_count)
print(counts)

Counter({'misc': 170, 'loc-ne': 41, 'group-ne': 20, 'person-nes': 13, 'loc': 13, 'person-nns': 12, 'person-ne': 9, 'group': 8, 'person-nn': 4})


## If wordnet is not working well, try sense2vec and spacy

In [40]:
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")


#vector_map.load(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")


<sense2vec.component.Sense2VecComponent at 0x2d9ce2c6b08>

In [63]:
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk(r"C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\s2v_old")
vector = s2v["natural_language_processing|NOUN"]
most_similar = s2v.most_similar("animal|NOUN", n=10)


In [64]:
most_similar


[('wild_animal|NOUN', 0.8854),
 ('living_animal|NOUN', 0.8663),
 ('animals|NOUN', 0.863),
 ('domesticated_animal|NOUN', 0.8566),
 ('living_creature|NOUN', 0.8442),
 ('other_animals|NOUN', 0.8427),
 ('other_animal|NOUN', 0.8415),
 ('farm_animal|NOUN', 0.8369),
 ('live_animal|NOUN', 0.8266),
 ('single_animal|NOUN', 0.8223)]

## Comparison of NER performance using SpaCy language module

In [22]:
import spacy


en_nlp = spacy.load("en_core_web_sm")


spacy_ner=[]
for np_tweets in event_df["Tweet Raw"][:50]:
    #for np in np_tweets:
        doc = en_nlp(np_tweets)
        #ner = en_nlp.add_pipe("ner")
        tweet_pos_tags={token.text: token.tag_ for token in doc}
        tweet_ner= {ent.text: ent.label_ for ent in doc.ents}
        #tweet_pos_tags={word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #tweet_ner= {ent.text: ent.type for sent in doc.sentences for ent in sent.ents}
        spacy_ner.append((tweet_pos_tags,tweet_ner))
        
spacy_ner

[({'@sztiv5': 'NN',
   '@Juliivan': 'NNP',
   '_': 'NN',
   'Yes': 'UH',
   ',': ',',
   'why': 'WRB',
   '?': '.',
   'Why': 'WRB',
   'it': 'PRP',
   'was': 'VBD',
   'n’t': 'RB',
   'good': 'JJ',
   'to': 'TO',
   'apply': 'VB',
   'for': 'IN',
   'asylum': 'NN',
   'in': 'IN',
   'Greece': 'NNP',
   'MAC': 'NNP',
   'SER': 'NNP',
   'or': 'CC',
   'CRO': 'NNP',
   'BUL': 'NNP',
   'together': 'RB',
   '3': 'CD',
   'EU': 'NNP',
   'members': 'NNS',
   'before': 'IN',
   'HU': 'NNP',
   'They': 'PRP',
   'must': 'MD',
   'get': 'VB',
   'help': 'NN',
   'the': 'DT',
   'first': 'JJ',
   'safe': 'JJ',
   'country': 'NN',
   'as': 'IN',
   'seeker': 'NN',
   'not': 'RB',
   '5th': 'NN',
   '.': '.',
   'No': 'DT',
   'law': 'NN',
   'says': 'VBZ',
   'you': 'PRP',
   'can': 'MD',
   'pick': 'VB',
   'and': 'CC',
   'choose': 'VB'},
  {'@Juliivan': 'PERSON',
   'Greece': 'GPE',
   'MAC': 'ORG',
   'SER': 'ORG',
   'CRO': 'ORG',
   'BUL': 'ORG',
   '3': 'CARDINAL',
   'EU': 'ORG',
   'H

In [178]:
pos_tags_set = set()
for tweet in spacy_ner:
    tweet_pos_tags = set(tweet[1].values())
    pos_tags_set.update(tweet_pos_tags)

print(len(spacy_ner))
for nm in range(len(spacy_ner)):
    
    #print(event_df["Tweet Raw"][nm])
    

    #print(xposses)
    print(np_xpos[nm][1])
    print('\n\n')

50
{'Juliivan_': 'PERSON', 'Greece': 'GPE', 'MAC': 'ORG', 'SER': 'ORG', 'CRO': 'ORG', 'BUL': 'ORG', '3': 'CARDINAL', 'EU': 'ORG', 'HU': 'ORG', 'first': 'ORDINAL', 'the 5th': 'DATE'}



{'ISIS': 'ORG', 'Turkey': 'GPE', 'Christian': 'NORP'}



{'Greece': 'GPE', 'UN': 'ORG'}



{'1': 'CARDINAL', 'Turkish': 'NORP', 'Greece': 'GPE', 'EU': 'ORG', 'Western': 'NORP', 'the Middle East': 'LOC', 'Africa': 'LOC'}



{'HU': 'ORG', 'Greece': 'GPE'}



{'#Lesbos': 'FAC', 'Moria #migrants camp': 'FAC'}



{'GreeceMFA': 'ORG', 'Manfred': 'PERSON', 'Weber': 'PERSON', 'Europe': 'LOC', 'Germany': 'GPE', 'Greece': 'GPE', 'German': 'NORP', 'MFA': 'ORG'}



{'Refugee Covid case': 'EVENT', 'Guardian': 'ORG'}



{'9PM': 'TIME', 'UK': 'GPE', 'Leeds': 'GPE', 'Birmingham': 'GPE', 'Greece': 'GPE', '2': 'CARDINAL'}



{'Trump': 'PERSON', 'France': 'GPE', 'Turkey': 'GPE', 'Syria': 'GPE', 'Iraq': 'GPE', 'Libya': 'GPE', 'Greece': 'GPE', 'Cyprus': 'GPE', 'Hamas': 'ORG'}



{'Raquel Bessudo': 'PERSON', 'Isaac Bessudo': 

## Get coreference chains from the tweet corpus

In [238]:
def get_coref_chain(tweet,client):

    ann = client.annotate(tweet)        
    tweet_chains = ann.corefChain
    all_chains = list()
    
    
    for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            
            mychain.append(ment_word)
            
        #the corefering words will be stored alongside the index of their representative in a tuple
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
    return all_chains


dict_of_tweet_corefs = {}
with CoreNLPClient(properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}, memory='16G') as client:
    for tweet in tqdm(event_df["Tweet Raw"]):
        tweet_corefs=[]
        #print(f'Coreferences for the tweet {list(event_df["Tweet Raw"]).index(tweet)} are:')
        for chain in get_coref_chain(tweet,client):
            tweet_corefs.append(chain)
            #print(' <-> '.join(chain),'\n')
        dict_of_tweet_corefs[list(event_df["Tweet Raw"]).index(tweet)] = tweet_corefs

2021-02-25 22:28:20 INFO: Writing properties to tmp file: corenlp_server-09eec83ac96340ea.props
2021-02-25 22:28:20 INFO: Starting server with command: java -Xmx16G -cp export CORENLP_HOME=/path/to/stanford-corenlp-4.1.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-09eec83ac96340ea.props -preload -outputFormat serialized
100%|████████████████████████████████████████████████████████████████████████████| 18203/18203 [57:38<00:00,  5.26it/s]


In [26]:
# FOR TESTING PURPOSES
all_chains = []
for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            #chain_rep = chain.representative
            #coref_group = (ment_word,chain_rep)
            
            mychain.append(ment_word)
            
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
        
all_chains

NameError: name 'tweet_chains' is not defined

In [239]:
with open('moria_tweet_corefs', 'wb') as fp:
    pickle.dump(dict_of_tweet_corefs, fp)

dict_of_tweet_corefs

{0: [(['No law', 'it'], 0)],
 1: [(['You', 'your', 'you'], 0)],
 2: [],
 3: [(['the law', 'the law', 'the law'], 0)],
 4: [(['They', 'they', 'the asylum seekers', 'they'], 2)],
 5: [(['#photograpy pushing aesthetic', 'it'], 0), (['I', 'I'], 0)],
 6: [(['@GreeceMFA ManfredWeber', 'It'], 0)],
 7: [],
 8: [],
 9: [(['Turkey', 'Turkey', 'it', 'it'], 0), (['you', 'you'], 0)],
 10: [(['Raquel',
    'Raquel',
    '@realPR_Phoenix @DrEstella @BravoTV Raquel Bessudo',
    'Isaac Bessudo'],
   2)],
 11: [(['@Mproyklis @fragoua @LearnerLerner', 'them', 'their'], 0)],
 12: [(['you', 'you', 'your', 'your', 'you'], 1),
  (['it', 'it'], 1),
  (['they', 'they'], 0)],
 13: [],
 14: [(['It', 'Greece'], 1)],
 15: [],
 16: [],
 17: [(['their', 'Both men', 'they', 'them', 'they'], 1)],
 18: [(['Those refugees',
    '10 refugees , including three children with disabilities'],
   1),
  (['Greece', 'Greece'], 0)],
 19: [(['Anadolu Agency', 'UN Refugee Agency'], 1)],
 20: [],
 21: [],
 22: [],
 23: [(['we', 'w

## Determining candidate's type