## Part 3 Loading Data & EDA

Import relevant packages for the following parts

In [1]:
import numpy as np
import pandas as pd
import math
import re
import sys

from gensim.models.word2vec import Word2Vec

import matplotlib.pyplot as plt
%matplotlib inline
import preprocess

plt.style.use('ggplot')
#from preprocessing import get_processed_data, load_data
import csv
import stanza

from tqdm import tqdm

import time

### Import data cleaned by using the selfwritten preprocessing module

In [2]:
event_df = pd.read_csv('moria_no_duplicates.csv', index_col=0)
# given event date, split the dataset to pre and post event dfs

event_date = '2020-09-09'

#moria_df[['Date','Time']] = moria_df['Date Short'].astype(str).str.split(' ', 1, expand=True)
# create pre and post event partition
pre_event = event_df[event_df['Date Short'] < event_date]
post_event = event_df[event_df['Date Short'] >= event_date]

print('total tweets: ', event_df.shape[0])
print('Pre event tweets: ',pre_event.shape[0])
print('Post event tweets: ',post_event.shape[0])


total tweets:  18203
Pre event tweets:  3311
Post event tweets:  14892


In [3]:
event_df.head()

Unnamed: 0,Date,URL,Tweet Raw,Influencer,Country,Language,Reach,Engagement,AVE,Sentiment,...,Time,State,City,Hashtags,Mentions,Linked Content,Retweet,Quote Tweet,Tweet Clean,Tweet Clean Tokens
0,04-Sep-2020 11:52PM,https://twitter.com/SophieTBHonest/statuses/13...,"@sztiv5 @Juliivan_ Yes, why? Why it wasn’t goo...",@sophietbhonest,Australia,English,45,,0.42,Neutral,...,11:52 PM,,,[],"['sztiv5', 'Juliivan_']",[],False,False,"yes, why? why it wasnt good to apply for asylu...","['wasnt', 'good', 'apply', 'asylum', 'greece',..."
1,04-Sep-2020 11:44PM,https://twitter.com/WarWorl/statuses/130199956...,@GoTurkey ISIS refuge. Wouldn't go to Turkey i...,@warworl,United Kingdom,English,5,,0.05,Neutral,...,11:44 PM,England,Ravenstone,[],['GoTurkey'],[],False,False,isis refuge. wouldn't go to turkey if i was pa...,"['refuge', 'would', 'turkey', 'paid', 'likely'..."
2,04-Sep-2020 11:36PM,https://twitter.com/brownrecluse101/statuses/1...,"Greece must improve refugee overcrowding, UN w...",@brownrecluse101,Unknown,English,1348,,12.47,Neutral,...,11:36 PM,,,[],[],['https://t.co/UDM4GDMcmo'],False,False,"greece must improve refugee overcrowding, un w...","['greece', 'must', 'improve', 'refugee', 'over..."
3,04-Sep-2020 10:59PM,https://twitter.com/sbaslan22/statuses/1301988...,@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law?...,@sbaslan22,Turkey,English,315,,2.91,Neutral,...,10:59 PM,Edirne,Uzunköprü,[],"['ThisIsOzcan', 'Nervana_1', 'EGozuguzelli']",[],False,False,/3 law? let the idle stuff. all rights of the ...,"['idle', 'stuff', 'right', 'turkish', 'minorit..."
4,04-Sep-2020 10:47PM,https://twitter.com/SophieTBHonest/statuses/13...,"@Juliivan_ @sztiv5 Anyway, how did the asylum ...",@sophietbhonest,Australia,English,45,,0.42,Neutral,...,10:47 PM,,,[],"['Juliivan_', 'sztiv5']",[],False,False,"anyway, how did the asylum seekers ended up at...","['anyway', 'asylum', 'seeker', 'ended', 'borde..."


In [4]:
list(event_df["Tweet Raw"])[:100]

['@sztiv5 @Juliivan_ Yes, why? Why it wasn’t good to apply for asylum in Greece, MAC, SER or CRO or BUL, together 3 EU members before HU? They must get help in the first safe country as asylum seeker,not in the 5th. No law says you can pick and choose and get it.',
 "@GoTurkey ISIS refuge. Wouldn't go to Turkey if I was paid. You're likely to get your head lopped off if you stray off the beaten track. Go to Christian Greece and be safe while enjoying a similar climate to islamic turkey",
 'Greece must improve refugee overcrowding, UN warns https://t.co/UDM4GDMcmo',
 '@ThisIsOzcan @Nervana_1 @EGozuguzelli 1/3 Law? Let the idle stuff. All rights of the Turkish minority in Greece were taken away. Where is the law? Refugees are not accepted into the EU. Where is the law? Western states divided the states in the Middle East and Africa for underground resources. Where is the law?',
 '@Juliivan_ @sztiv5 Anyway, how did the asylum seekers ended up at HU borders? They must have had a long journ

In [None]:
#stanza.download("en")

In [5]:
en_nlp = stanza.Pipeline("en")

2021-02-24 09:58:59 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-24 09:58:59 INFO: Use device: cpu
2021-02-24 09:58:59 INFO: Loading: tokenize
2021-02-24 09:58:59 INFO: Loading: pos
2021-02-24 09:58:59 INFO: Loading: lemma
2021-02-24 09:59:00 INFO: Loading: depparse
2021-02-24 09:59:00 INFO: Loading: sentiment
2021-02-24 09:59:01 INFO: Loading: ner
2021-02-24 09:59:02 INFO: Done loading processors!


In [6]:
start = time.time()
en_doc = event_df["Tweet Raw"][:100].apply(en_nlp)
end = time.time()
print(f"Preprocessing the data took {end-start} seconds.")

Preprocessing the data took 151.7979772090912 seconds.


In [None]:
en_doc.iloc[1].sentences[0]

In [None]:
#directory = '../../export CORENLP_HOME=' ##ADD DIRECTORY HERE
#stanza.install_corenlp()

#import os
#os.environ["CORENLP_HOME"] = directory

## As initial WCL candidates, we extract coreference chains and noun phrases (NPs).

In [7]:
from stanza.server import CoreNLPClient

# get noun phrases with tregex
def noun_phrases(_client, _text, _annotators=None):
    pattern = 'NP'
    matches = _client.tregex(_text,pattern,annotators=_annotators)
    return [sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence]


In [8]:
noun_phrase_list = []
anns_list = []
with CoreNLPClient(timeout=300000, memory='16G') as client:
    for tweet in tqdm(event_df["Tweet Raw"]):
        noun_phrase = noun_phrases(client,str(tweet),_annotators="tokenize,ssplit,pos,lemma,parse,ner,coref")
        noun_phrase_list.append(noun_phrase)
        #noun_phrases(client,event_df["Tweet Raw"][:100],_annotators="tokenize,ssplit,pos,lemma,parse")
    client.stop()

2021-02-24 10:01:34 INFO: Writing properties to tmp file: corenlp_server-799910cec1b243da.props
2021-02-24 10:01:34 INFO: Starting server with command: java -Xmx16G -cp export CORENLP_HOME=/path/to/stanford-corenlp-4.1.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 300000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-799910cec1b243da.props -preload -outputFormat serialized
100%|██████████████████████████████████████████████████████████████████████████| 18203/18203 [3:00:23<00:00,  1.68it/s]


In [11]:
import pickle

with open('moria_noun_phrases', 'wb') as fp:
    pickle.dump(noun_phrase_list, fp)

In [None]:
def get_coref_chain(tweet,client):

    ann = client.annotate(tweet)        
    tweet_chains = ann.corefChain
    all_chains = list()
    
    
    for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            
            mychain.append(ment_word)
            
        #the corefering words will be stored alongside the index of their representative in a tuple
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
    return all_chains


dict_of_tweet_corefs = {}
with CoreNLPClient(properties={'annotators': 'coref', 'coref.algorithm' : 'statistical'}, memory='16G') as client:
    for tweet in tqdm(event_df["Tweet Raw"]):
        tweet_corefs=[]
        #print(f'Coreferences for the tweet {list(event_df["Tweet Raw"]).index(tweet)} are:')
        for chain in get_coref_chain(tweet,client):
            tweet_corefs.append(chain)
            #print(' <-> '.join(chain),'\n')
        dict_of_tweet_corefs[list(event_df["Tweet Raw"]).index(tweet)] = tweet_corefs

2021-02-25 22:28:20 INFO: Writing properties to tmp file: corenlp_server-09eec83ac96340ea.props
2021-02-25 22:28:20 INFO: Starting server with command: java -Xmx16G -cp export CORENLP_HOME=/path/to/stanford-corenlp-4.1.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-09eec83ac96340ea.props -preload -outputFormat serialized
 45%|██████████████████████████████████▎                                          | 8121/18203 [29:43<43:07,  3.90it/s]

In [237]:
# FOR TESTING PURPOSES
all_chains = []
for chain in tweet_chains:
        mychain = list()
        # Loop through every mention of this chain
        for mention in chain.mention:
            # Get the sentence in which this mention is located, and get the words which are part of this mention
            words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
            #build a string out of the words of this mention
            ment_word = ' '.join([x.word for x in words_list])
            #chain_rep = chain.representative
            #coref_group = (ment_word,chain_rep)
            
            mychain.append(ment_word)
            
        coref_group = (mychain,chain.representative)
        all_chains.append(coref_group)
        
all_chains

[(['They', 'they', 'the asylum seekers', 'they'], 2)]

In [None]:
with open('moria_tweet_corefs', 'wb') as fp:
    pickle.dump(dict_of_tweet_corefs, fp)

dict_of_tweet_corefs

## Keep only parent NPs (haven't removed longer than 20 words)

In [203]:
#silly but easy way to remove the child NP and keep only parents, run until the sum_len stops decreasing

for tweet_nps in noun_phrase_list:
    for np in range(len(tweet_nps)):
        #print(tweet_nps[np+1])
        try:
            if tweet_nps[np].find(tweet_nps[np+1]) != -1:
                #print('hey')
                tweet_nps.remove(tweet_nps[np+1])
                #print('no')
        #ignore the error caused with end of the list
        except IndexError:
            pass

sum_len = 0
for tweet in noun_phrase_list:
    sum_len += len(tweet)
    
sum_len

122277

In [204]:
noun_phrase_list

[['@sztiv5 @Juliivan_ Yes, why?',
  'it',
  'asylum in Greece, MAC, SER or CRO or BUL',
  '3 EU members',
  'HU',
  'They',
  'help',
  'the first safe country as asylum seeker',
  'the 5th',
  'No law',
  'you',
  'it'],
 ['@GoTurkey ISIS',
  'refuge',
  'Turkey',
  'I',
  'You',
  'your head',
  'the beaten track',
  'Christian Greece',
  'a similar climate',
  'islamic turkey'],
 ['Greece', 'refugee overcrowding', 'UN', 'https://t.co/UDM4GDMcmo'],
 ['@ThisIsOzcan @Nervana_1 @EGozuguzelli',
  '1/3',
  'Law',
  'the idle stuff',
  'All rights of the Turkish minority in Greece',
  'the law',
  'Refugees',
  'the EU',
  'the law',
  'Western states',
  'the states',
  'the Middle East and Africa',
  'underground resources',
  'the law'],
 ['@Juliivan_ @sztiv5',
  'the asylum seekers',
  'HU borders',
  'They',
  'a long journey',
  'Greece',
  'they',
  'help',
  'they',
  'multiple more countries'],
 ['#photograpy pushing aesthetic into #news',
  'it',
  'a potential embellishments of 