# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [1]:
#python libraries
import stanza
from stanza_batch import batch

import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import os
import re
import csv
from tqdm import tqdm
import time
from collections import Counter, defaultdict


# self written modules
import preprocessing

import candidate_processing as cand_prep
import candidate_extraction as cand_ex

"""import candidate_extraction as cand_ex
from ekphrasis.classes.segmenter import Segmenter
seg = Segmenter() """

from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. We import the data

In [204]:
beirut_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_beirut.csv" # for Beirut

moria_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_moria.csv" # for Moria


def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(beirut_url)
#channel_df = read_event_df(channel_url)
tqdm.pandas()
event_df['text_clean']= event_df['text'].progress_apply(preprocessing.preprocess_tweets)

  2%|█▋                                                                          | 559/24511 [00:00<00:08, 2679.15it/s]

loaded 24511 tweets!


100%|██████████████████████████████████████████████████████████████████████████| 24511/24511 [00:08<00:00, 2754.93it/s]


In [205]:
FILE_PATH = "/Users/nikodemicek/Dropbox (CBS)/Master thesis data"
USERS_PATH = FILE_PATH + "/df_users.csv"
# Read the users csv
df_users = pd.read_csv(USERS_PATH)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)

df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}


def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_clean'] = event_df['text_clean'].progress_apply(resolve_username_to_name)

100%|█████████████████████████████████████████████████████████████████████████| 24511/24511 [00:00<00:00, 95374.35it/s]


In [211]:
from stanza.server import CoreNLPClient
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer 

detokenize = TreebankWordDetokenizer().detokenize

def replace_corefs(tweet_series, all=True):

    #corefs_list = list()
    #tweets_list = list(event_df)

    #so we have control over whether we extract only np or coref candidates
    #nps = True if all == True or all == 'nps' else False
    #corefs = True if all == True or all == 'corefs' else False
    
    with CoreNLPClient(annotators=['tokenize','ssplit','pos','parse',"coref"], 
                       properties ={'coref.algorithm' : 'neural','ssplit':'eolonly'}, 
                       timeout=600000, memory='8G') as client:

        def resolve_corefs(tweet,client=client):

            ann = client.annotate(tweet)        
            tweet_chains = ann.corefChain
            all_chains = list()
            all_locs = list()
            #print(tweet)
            
            for chain in tweet_chains:
                chain_words = list()
                word_locs = list()
                # Loop through every mention of this chain
                for mention in chain.mention:
                    # Get the sentence in which this mention is located, and get the words which are part of this mention
                    words_list = ann.sentence[mention.sentenceIndex].token[mention.beginIndex:mention.endIndex]
                    #build a string out of the words of this mention
                    coref_mention = ' '.join([word.word for word in words_list])
                    identified_mention_loc = (mention.sentenceIndex,mention.beginIndex,mention.endIndex)
                    
                    chain_words.append(coref_mention)
                    word_locs.append(identified_mention_loc)
                    
                #the corefering words will be stored alongside the index of their representative in a tuple
                coref_group = (chain_words,chain.representative)
                #coref_cand = coref_group[0][coref_group[1]]
                all_chains.append(coref_group)
                all_locs.append(word_locs)
            
            #print(all_locs)
            #print(all_chains)
            tweet = sent_tokenize(tweet)
            for sent_id in range(len(tweet)):
                tweet[sent_id]=word_tokenize(tweet[sent_id])
            #print(tweet)
            for coref_words,chain_locs in zip(all_chains,all_locs):
                #print(coref,lc)
                rep_mention_id = coref_words[1]
                rep_mention = coref_words[0][rep_mention_id]
                for word,loc in zip(coref_words[0],chain_locs):
                    tweet[loc[0]][loc[1]:loc[2]] = [rep_mention]
                    #print(tweet)

            for sent_id in range(len(tweet)):
                tweet[sent_id] = detokenize(tweet[sent_id])
                #print(tweet[sent_id])
                
                
            tweet = detokenize(tweet)  
                
            #tweet = [detokenize(sent) for sents in tweet for sent in detokenize(sents)]
            #print(tweet)
            return tweet
        
        
        def tokenizer(tweet):
            tweet = word_tokenize(tweet)
            tweet = ' '.join(tweet)
            tweet = sent_tokenize(tweet)
            tweet = '\n'.join(tweet)
            return tweet
        # get noun phrases with tregex using get_noun_phrases function
        #print('extracting noun phrases...')
        tqdm.pandas()
        #noun_phrase_list = list(event_df.progress_apply(get_noun_phrases,args=(client,"tokenize,ssplit,pos,lemma,parse")))
        #noun_phrase_list = [get_noun_phrases(client,tweets_list[tweet_id], annotators="tokenize,ssplit,pos,lemma,parse") for tweet_id in tqdm(range(len(tweets_list)))]


        print('extracting coreference chains...')
        # get coreference chains using the .annotate method of client handled by get_coref_chain function  
        tweet_series = tweet_series.progress_apply(tokenizer)
        
        print('extracting coreference chains...')    
        corefs_list = tweet_series.progress_apply(resolve_corefs)
        #for tweet_id in tqdm(range(len(tweets_list))):
            #coref_chains = [chain for chain in get_coref_chain(event_df[tweet_id],client)] 

        #corefs_list.append(['no_candidate']) if len(corefs_list) == 0 else corefs_list.append(coref_chains)
                
             

    return corefs_list

event_corefs_resolved = replace_corefs(event_df['text_clean'])

2021-04-10 14:14:18 INFO: Writing properties to tmp file: corenlp_server-ba00c23f696f48af.props
2021-04-10 14:14:18 INFO: Starting server with command: java -Xmx8G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 600000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-ba00c23f696f48af.props -annotators tokenize,ssplit,pos,parse,coref -preload -outputFormat serialized
  0%|▎                                                                           | 118/24511 [00:00<00:20, 1168.42it/s]

extracting coreference chains...


100%|██████████████████████████████████████████████████████████████████████████| 24511/24511 [00:15<00:00, 1542.71it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24511/24511 [4:27:31<00:00,  1.53it/s]


In [212]:
event_corefs_resolved

0        I read all your books professor, and even wait...
1        Im was a Lebanese immigrant and fought amongst...
2        migrant _ workers _ lives _ matter Lets not fo...
3        More than a dozen refugees in eastern Lebanon ...
4        What a lovely way to start the week with inspi...
                               ...                        
24506    Why was the CIA not issued a report on the (Be...
24507    , I am a Syrian refugee in Lebanon. I have fou...
24508    i am literally lebanese?????? im not a refuge ...
24509    Whenever a Palestinian auntie asks me what a P...
24510    we all hope a better year that we can touch ea...
Name: text_clean, Length: 24511, dtype: object

In [267]:
def get_noun_phrases(tweet, client, annotators=None):
    """
    Input: client = CoreNLPClient instance
           tweet = tweet text
           annotators = allowed CoreNLP operations
    Output: list of all noun phrases in the tweet
    """
    pattern = 'NP'
    matches = client.tregex(tweet,pattern,annotators=annotators)
    list_of_nps = [sentence[match_id]['spanString']  for sentence in matches['sentences'] for match_id in sentence if len(sentence[match_id]['spanString'].split())<5 ]
    #print(list_of_nps)

    return list_of_nps 


with CoreNLPClient(annotators=["tokenize,ssplit,pos,parse"], 
                   timeout=6000000, memory='8G') as client:

        # get noun phrases with tregex using get_noun_phrases function
        print('extracting noun phrases...')
        tqdm.pandas()
        noun_phrase_list = list(event_corefs_resolved.progress_apply(get_noun_phrases,args=(client,"tokenize,ssplit,pos,parse")))

np_list = noun_phrase_list.copy()

2021-04-11 10:52:00 INFO: Writing properties to tmp file: corenlp_server-8dada1fab204493d.props
2021-04-11 10:52:00 INFO: Starting server with command: java -Xmx8G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 6000000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-8dada1fab204493d.props -annotators tokenize,ssplit,pos,parse -preload -outputFormat serialized
  0%|                                                                                        | 0/24511 [00:00<?, ?it/s]

extracting noun phrases...


100%|██████████████████████████████████████████████████████████████████████████| 24511/24511 [4:35:37<00:00,  1.48it/s]


In [164]:
np_list = noun_phrase_list.copy()

In [271]:

def get_cand_len(cand_list):
    # calculates number of candidates in the corpus
    sum_len = 0
    for tweet_cands in cand_list:
        sum_len += len(tweet_cands)
    return sum_len

def remove_child_nps(noun_phrase_list):
    print(f'removing child NP candidates...')
    initial_len = get_cand_len(noun_phrase_list)
    # remove the child NPs and keep only parents, run until the sum_len stops decreasing
    after_removal_len = 0
    while after_removal_len != get_cand_len(noun_phrase_list):
        after_removal_len = get_cand_len(noun_phrase_list)
        for tweet_nps in noun_phrase_list:
            for noun_p in range(len(tweet_nps)):
                try:
                    #if the subsequent noun_p (child np) is contained in the current one, remove the child np
                    if tweet_nps[noun_p].find(tweet_nps[noun_p+1]) != -1:
                        tweet_nps.remove(tweet_nps[noun_p+1])
                        
                #ignore the error caused with end of the list
                except IndexError:
                    pass

    len_after_removal = get_cand_len(noun_phrase_list)
    print(f'Removed {initial_len-len_after_removal} child NP candidates!')
    return noun_phrase_list

np_list = remove_child_nps(np_list)


removing child NP candidates...
Removed 44551 child NP candidates!


In [236]:
tweet_corpus = event_df['text_clean'].apply(lambda tweet:re.sub(r'[^A-Za-z0-9 ]+', '', tweet.lower()))
tweet_corpus

0        i read all your books professor and even waite...
1        my grandfather was a lebanese immigrant and fo...
2        migrant  workers  lives  matter lets not forge...
3        more than a dozen refugees in eastern lebanon ...
4        what a lovely way to start the week with inspi...
                               ...                        
24506    why was the cia not issued a report on the bei...
24507     i am a syrian refugee in lebanon  i have four...
24508    i am literally lebanese im not a refuge or imm...
24509    whenever a palestinian auntie asks me what my ...
24510    we all hope a better year that we can touch ea...
Name: text_clean, Length: 24511, dtype: object

In [237]:
from gensim.models.phrases import Phrases#, ENGLISH_CONNECTOR_WORDS

#tweet_corpus_tokens = [tweet.split() for tweet in tweet_corpus]
#tweet_corpus_tokens
bigram = Phrases(tweet_corpus, min_count=20, threshold=20) # higher threshold fewer phrases.
trigram = Phrases(bigram[tweet_corpus_tokens], threshold=20) 


trigram.vocab

defaultdict(int,
            {b'i': 3607,
             b'read': 425,
             b'i_read': 14,
             b'all': 1901,
             b'read_all': 5,
             b'your': 1039,
             b'all_your': 10,
             b'books': 27,
             b'your_books': 2,
             b'professor': 10,
             b'books_professor': 1,
             b'and': 19174,
             b'professor_and': 2,
             b'even': 577,
             b'and_even': 33,
             b'waited': 5,
             b'even_waited': 1,
             b'on': 4233,
             b'waited_on': 1,
             b'you': 3364,
             b'on_you': 20,
             b'when': 933,
             b'you_when': 6,
             b'when_you': 63,
             b'came': 266,
             b'you_came': 5,
             b'to': 17961,
             b'came_to': 81,
             b'ilili': 1,
             b'to_ilili': 1,
             b'this': 3341,
             b'ilili_this': 1,
             b'tweet': 78,
             b'this_tweet': 5,
     

In [190]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [269]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords

model_phrases = [trigram[tweet.split()] for tweet in tweet_corpus]
#print(model_phrases[:10])

#negatives = [5,10,20]
sizes = [100,200,300]
sgs=[0,1]
windows =[5,7] 
#cbow_means = [0,1]
#iters=[10]



for size in sizes:
        for window in windows:
            #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
            print(f'\nfor params size={size},window={window}')
            model = Word2Vec(model_phrases,size=size,window=window)
            print(model.wv.most_similar('refugees'))



for params size=100,window=5
[('refugee_camps', 0.7530064582824707), ('exceed', 0.6699388027191162), ('camps', 0.648147702217102), ('refugee', 0.6430160403251648), ('15_mil', 0.6423872113227844), ('89', 0.6361641883850098), ('taken_refuge', 0.6245043277740479), ('percentage', 0.6201708316802979), ('forcibly_exiled', 0.6079879999160767), ('15_million', 0.6018157005310059)]

for params size=100,window=7
[('refugee_camps', 0.7207822799682617), ('sheltered', 0.6638866662979126), ('1960s', 0.6482441425323486), ('15_mil', 0.6410447359085083), ('refugee', 0.6318071484565735), ('camps', 0.6107119917869568), ('not_beignored', 0.595453679561615), ('poverty', 0.5937148332595825), ('report_trapped', 0.5917030572891235), ('15_million', 0.590601921081543)]

for params size=200,window=5
[('refugee_camps', 0.7206195592880249), ('camps', 0.6818684339523315), ('15_mil', 0.6752337217330933), ('sheltered', 0.6639184355735779), ('refugee', 0.6545926928520203), ('15_million', 0.6437801122665405), ('current

In [282]:
model = Word2Vec(model_phrases,size=300,window=7)
model.wv.most_similar('refugees')


[('refugee_camps', 0.7634189128875732),
 ('sheltered', 0.7147684097290039),
 ('camps', 0.6536272764205933),
 ('difficult_conditions', 0.6474202871322632),
 ('refugee', 0.628537654876709),
 ('poverty', 0.6162228584289551),
 ('forcibly_exiled', 0.6024370193481445),
 ('15_million', 0.5982092618942261),
 ('percentage', 0.5728586912155151),
 ('arab_countries', 0.571622908115387)]

## 3. We instantiate stanza english language module

In [277]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#stanza.install_corenlp()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# loading the pipeline
en_nlp = stanza.Pipeline("en", tokenize_pretokenized=True, ner_batch_size=4096)

2021-04-11 16:51:49 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-04-11 16:51:49 INFO: Use device: cpu
2021-04-11 16:51:49 INFO: Loading: tokenize
2021-04-11 16:51:49 INFO: Loading: pos
2021-04-11 16:51:50 INFO: Loading: lemma
2021-04-11 16:51:50 INFO: Loading: depparse
2021-04-11 16:51:51 INFO: Loading: sentiment
2021-04-11 16:51:52 INFO: Loading: ner
2021-04-11 16:51:54 INFO: Done loading processors!


In [209]:
import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)

In [275]:
def load_event_data(event_name):
    assert event_name in ['moria','tigray','channel','all','beirut'], f"Oh no! We do not analyze {event_name} event"
    
    print(f'Loading {event_name} data...')
    try:
        #sample = 2000
        event_np_list = load_pickle(event_name + '_np_list')#[1000:sample]
        event_crf_list = load_pickle(event_name + '_crf_list')#[1000:sample]
        event_tagged_tweets = load_pickle(event_name + '_tagged_tweets')#[1000:sample]
        
        return event_np_list,event_crf_list,event_tagged_tweets
    except:
        print(f'The {event_name} files not found! Run candidate_extraction.py file on the {eventname}_df')
        return None


In [None]:
def pipeline(event_name):
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 1. LOAD THE DATA ~~~~~~~~~~~~~~~~~~~~~
    event_np_list,event_crf_list,event_tagged_tweets = load_event_data(event_name)
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 2. GET POS AND NER TAGS ~~~~~~~~~~~~~~~~~~~~~
    # get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
    tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 3. PREPROCESS CANDIDATES ~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ processing of noun phrases ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} noun phrase candidates...')
    
    tqdm.pandas()
    # remove NP candidates longer than threshold and remove all child NPs of parent NPs
    event_np_list = cand_prep.remove_long_nps(event_np_list)
    event_np_list = cand_prep.remove_child_nps(event_np_list) 
    #event_np_list = remove_weird_chars(event_np_list)
    event_np_list = cand_prep.remove_char(event_np_list,'@')

    event_np_list = [['no_candidate'] if len(noun_ps)==0 or noun_ps ==' ' else noun_ps for noun_ps in event_np_list ]
    
    #print(event_np_list)
    print(f'Tagging {event_name} noun phrase candidates...')
    #tag all tweets and save them in a list    

    #tagged_np_cands = batched_np_list.progress_apply(en_nlp)
    tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]
    #tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]

    np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
    #print(np_cand_heads)
    
    np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)
    #print(event_np_list)
          
    # ~~~~~~~~~~~~ processing of coref candidates ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} coreference candidates...')    
    
    #extract only the representative mentions as representative phrases of candidates
    event_crf_list = [[coref_group[0][coref_group[1]] for coref_group in tweet_corefs] for tweet_corefs in event_crf_list]
    
    #event_crf_list = remove_weird_chars(event_crf_list)
    event_crf_list = cand_prep.remove_char(event_crf_list,'@')

    event_crf_list = [['no_candidate'] if len(crf_ps)==0 else crf_ps for crf_ps in event_crf_list ]
    
    print(f'Tagging {event_name} coreference candidates...')       
    #tag all tweets and save them in a list    
    #batched_coref_list = cand_prep.prep_candlist_for_batching(event_crf_list)
    #print(batched_coref_list)
    tagged_coref_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_crf_list)]
    #tagged_coref_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_coref_list, en_nlp, batch_size=6000))] 
    #print(tagged_coref_cands)
        
    coref_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_coref_cands]
          
    coref_and_cand_list = cand_prep.get_cand_type(event_crf_list, coref_cand_heads, tweet_tags)
          
    # ~~~~~~~~~~~~~~~~~~~~ combining candidate lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #concatenate corefs and noun phrase lists
    nps_cands = [cand for cands in np_and_cand_list for cand in cands]
    crf_cands = [cand for cands in coref_and_cand_list for cand in cands]
    #candidate_list = coref_and_cand_list + np_and_cand_list
    #print(f'Len = {len(candidate_list)} should be 2x amount of tweets')
    #print(len(nps_cands), len(crf_cands))
    #unpack list of lists into one list
    candidate_list = nps_cands + crf_cands
    print(f'The amount of all candidates is {len(candidate_list)} -  nps: {len(nps_cands)}, crfs:{len(crf_cands)}')
          
    nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]
    crf_tagged = [sent for tagged_cand in tagged_coref_cands for sent in tagged_cand.sentences ]
    print(len(nps_tagged), len(crf_tagged))
    all_cands_tagged = nps_tagged + crf_tagged

        
    #print(len(candidate_list),'vs', len(all_cands_tagged))
    cand_df = pd.DataFrame(
        {'candidates': candidate_list,
         'cand_tags': all_cands_tagged
        })

    cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
    cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))


    count_cands = Counter(cand_df['cand_text'])
    cand_df['cand_freq'] = cand_df["cand_text"].map(count_cands)
    
    #count_cands[cand_df['cand_text']]
    #count_sorted = sorted(count_cands.items(),key=lambda x: x[1],reverse=True)
    cand_df.columns = cand_df.columns.str.strip()
    
          
    # we sort the candidates by their length
    cand_df.sort_values('cand_freq', ascending=False,inplace=True)

    #cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

    cand_df.reset_index(drop=True, inplace = True)
    #remove dummy candidates that were used to avoid errors

    print(len(cand_df))
    cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
    cand_df = cand_df[cand_df.cand_text != 'no_candidate']
    len(cand_df)
    cand_df.reset_index(drop=True,inplace=True)
          
    return cand_df
          
          
moria_cands = pipeline('moria')

pickle_files('moria_cands_df', moria_cands)

In [279]:
def pipeline2(event_name,np_list):
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 1. LOAD THE DATA ~~~~~~~~~~~~~~~~~~~~~
    event_np_list,event_crf_list,event_tagged_tweets = load_event_data(event_name)
    event_np_list = np_list
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 2. GET POS AND NER TAGS ~~~~~~~~~~~~~~~~~~~~~
    # get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
    tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 3. PREPROCESS CANDIDATES ~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ processing of noun phrases ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} noun phrase candidates...')
    
    tqdm.pandas()
    # remove NP candidates longer than threshold and remove all child NPs of parent NPs
    event_np_list = cand_prep.remove_long_nps(event_np_list)
    event_np_list = cand_prep.remove_child_nps(event_np_list) 
    #event_np_list = remove_weird_chars(event_np_list)
    event_np_list = cand_prep.remove_char(event_np_list,'@')

    event_np_list = [['no_candidate'] if len(noun_ps)==0 or noun_ps ==' ' else noun_ps for noun_ps in event_np_list ]
    
    #print(event_np_list)
    print(f'Tagging {event_name} noun phrase candidates...')
    #tag all tweets and save them in a list    

    #tagged_np_cands = batched_np_list.progress_apply(en_nlp)
    tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]
    #tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]

    np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
    #print(np_cand_heads)
    
    np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)
    #print(event_np_list)
          
          
    # ~~~~~~~~~~~~~~~~~~~~ combining candidate lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #concatenate corefs and noun phrase lists
    nps_cands = [cand for cands in np_and_cand_list for cand in cands]
    #candidate_list = coref_and_cand_list + np_and_cand_list
    #print(f'Len = {len(candidate_list)} should be 2x amount of tweets')
    #print(len(nps_cands), len(crf_cands))
    #unpack list of lists into one list
    candidate_list = nps_cands
          
    nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]

    all_cands_tagged = nps_tagged

        
    #print(len(candidate_list),'vs', len(all_cands_tagged))
    cand_df = pd.DataFrame(
        {'candidates': candidate_list,
         'cand_tags': all_cands_tagged
        })

    cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
    cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))


    count_cands = Counter(cand_df['cand_text'])
    cand_df['cand_freq'] = cand_df["cand_text"].map(count_cands)
    
    #count_cands[cand_df['cand_text']]
    #count_sorted = sorted(count_cands.items(),key=lambda x: x[1],reverse=True)
    cand_df.columns = cand_df.columns.str.strip()
    
          
    # we sort the candidates by their length
    cand_df.sort_values('cand_freq', ascending=False,inplace=True)

    #cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

    cand_df.reset_index(drop=True, inplace = True)
    #remove dummy candidates that were used to avoid errors

    
    cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
    cand_df = cand_df[cand_df.cand_text != 'no_candidate']
    print(len(cand_df))    
    cand_df.reset_index(drop=True,inplace=True)
          
    return cand_df
          
          
beirut_cands = pipeline2('beirut',np_list)

#pickle_files('moria_cands_df', moria_cands)

Loading beirut data...


  0%|                                                                                        | 0/24511 [01:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24511/24511 [00:00<00:00, 29013.26it/s]


Processing beirut noun phrase candidates...
removing long candidates...
Removed 0 candidates longer than 9 words!
removing child NP candidates...


  0%|                                                                                        | 0/24511 [00:00<?, ?it/s]

Removed 0 child NP candidates!
Tagging beirut noun phrase candidates...


100%|██████████████████████████████████████████████████████████████████████████| 24511/24511 [4:34:45<00:00,  1.49it/s]
100%|████████████████████████████████████████████████████████████████████████████| 24511/24511 [12:30<00:00, 32.67it/s]


207566


## 4. We apply stanza module on the tweets to get NER and POS tags. We do it in batches to speed things up.

## 5. As initial WCL candidates, we extract noun phrases (NPs) and coreference chains.

## We do so using CoreNLPClient wrapper

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media) - any new spotted should be added in preprocessing file's '__remove_tweet_signatures__' function
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE? - check
* play around with candidate types
* optimize code and make it neater



## 6. We keep only NPs shorter than 20 words and remove children of parent NPs 

## 7. We get the heads of noun phrases (in batches)

## 8. We define candidate types 

## 9. We assign candidate types to noun phrase candidates

## 10. We get coreference chains candidates from the tweet corpus

## 11. We determine candidate's type for representative mentions of coref candidates (in batches)

## 12. We combine the candidate lists for candidate merging

We organize candidates in a list sorted by their number of phrases

In [8]:
moria_cands = load_pickle('moria_cands_df')

In [9]:
beirut_whatmerged = load_pickle('beirut_whatmerged2')

In [273]:
candidates = beirut_cands

0        [I, all your books, professor, you, Ilili, thi...
1        [a Lebanese immigrant, the second wave, Norman...
2                      [migrant, workers, Lebanese racism]
3        [eastern Lebanon, the coronavirus, the vulnera...
4        [a lovely way, the week, inspiring art, the be...
                               ...                        
24506    [the CIA, a report, the (Beirut port explosion...
24507    [I, a Syrian refugee, Lebanon, I, four childre...
24508                                    [i, im, i family]
24509    [me, a Palestinian auntie familys, my familys ...
24510                  [we, a better year, we, each other]
Length: 24511, dtype: object

### First merging step

In [281]:
#
# THIS IS THE FIRST MERGING STEP
#

        
def merging_step1(candidate_list):
    """
    In the first merging step, we merge two candidates if the head of each of their representative phrase 
     is identical by string comparison.
    """

    indices_to_remove = set()
    for up_cand_id in tqdm(range(len(candidate_list))):   

        if up_cand_id in indices_to_remove:
            continue
        up_cand = candidate_list[up_cand_id]    
            
        for low_cand_id in range(up_cand_id+1,len(candidate_list)):
            low_cand = candidate_list[low_cand_id]

            if up_cand[1].lower() == low_cand[1].lower():# and upper_cand[3] == lower_cand[3]:


                indices_to_remove.add(low_cand_id)

                
    return indices_to_remove


def merge_indices(cand_df,indices_to_remove):                

    print(f'Initial amount of candidates: {len(cand_df)}')                
    #print(len(sorted(indices_to_remove)))

    #for index in reversed(sorted(indices_to_remove)):
    cand_df.drop(indices_to_remove,inplace=True)
        
    cand_df.reset_index(drop=True,inplace=True)
    print(f'Amount of candidates: {len(cand_df)}, after removing {len(sorted(indices_to_remove))} indices') 
    return cand_df



In [None]:
event_cands_merged

In [None]:
for cand in cand_df['candidates']:
    print(cand[1], cand[3])

### Second merging step

We merge 2 candidates if their sets of phrases heads are semantically similar

In [11]:
import gensim

#load the GoogleNews 300dim model (fix path)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [18]:
from gensim.models import Word2Vec

event_series = event_df['text'].apply(lambda x: x.split(' '))
model_moria = Word2Vec(event_series,size=200)

In [19]:
model.most_similar('refugee')


[('refugees', 0.8041340112686157),
 ('asylum_seeker', 0.7111446857452393),
 ('asylum_seekers', 0.6694400906562805),
 ('Refugee', 0.6444518566131592),
 ('refugee_status', 0.6259119510650635),
 ('asylum', 0.6098962426185608),
 ('refugee_resettlement', 0.6018669605255127),
 ('UNHCR', 0.5838466882705688),
 ('displaced_persons', 0.5829589366912842),
 ('Refugees', 0.5805562734603882)]

In [26]:
model_moria.most_similar('moria')

  """Entry point for launching an IPython kernel.


[('lesbos', 0.7489250302314758),
 ('Eleonas', 0.6656873226165771),
 ('La', 0.664945125579834),
 ('Mae', 0.6619389057159424),
 ('Moria.', 0.6439236402511597),
 ('#Lipa', 0.6374841332435608),
 ('Lipa', 0.633926510810852),
 ('Karatepe', 0.6262264847755432),
 ('#Jerusalem,', 0.610672116279602),
 ('Karen', 0.6037241220474243)]

In [283]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import numpy as np

def merging_step2(candidate_list):
    
    indices_to_remove = set()
    
    for upper_cand_id in tqdm(range(len(candidate_list))):     
        upper_cand = candidate_list[upper_cand_id]
        
        up_cand_mean_vec = phrase_heads_avg_vector(upper_cand[2])
        
        for lower_cand_id in range(upper_cand_id+1,len(candidate_list)): 
            lower_cand = candidate_list[lower_cand_id]
            #print(f'for index {candidate_list.index(longer_cand)} checking the index {candidate_list.index(cand)}')
            #if candidate_list[longer_cand][1] == candidate_list[cand][1]:
                #print(f'matching "{longer_cand}" with "{cand}"')
            low_cand_mean_vec = phrase_heads_avg_vector(lower_cand[2])

            if upper_cand[3] == lower_cand[3]:
                #try:
                    #print(1-cosine(long_cand_mean_vec,cand_mean_vec))
                    #print(long_cand_mean_vec.reshape(-1,1).shape, cand_mean_vec.reshape(1,-1).shape)
                    if 1-cosine(up_cand_mean_vec,low_cand_mean_vec) >= 0.7:
                        #print(f'matching "{longer_cand}" with "{cand}"') 
                        indices_to_remove.add(lower_cand_id)
                        what_merged2[upper_cand[0].lower()].append(lower_cand[0].lower())
                        
                #except AttributeError:
                    #pass

            else:

                if 1-cosine(up_cand_mean_vec,low_cand_mean_vec) >= 0.8:
                    #print(f'matching "{longer_cand}" with "{cand}"') 
                    indices_to_remove.add(lower_cand_id)
                    what_merged2[upper_cand[0].lower()].append(lower_cand[0].lower())



    return indices_to_remove

def phrase_heads_avg_vector(phrase_set):
    phrase_head_vectors = []
    for phrase_head in phrase_set:    
        try:
            phrase_head_vectors.append(model[phrase_head])
        except KeyError:
            phrase_head_vectors.append(np.NaN)
    #phrase_head_vectors = [model[phrase_head] for phrase_head in phrase_set]
    if len(phrase_head_vectors) != 0:
        return np.mean(phrase_head_vectors,axis=0)
    else: 
        return np.NaN

        




In [None]:
list(event_cands_merged['cand_text'])


In [None]:
for cand in cand_df['candidates']:
    print(cand[1], cand[3])

## Third merging step representative labeling

currently working on average cosine similarity of each phrase in the candidate - maybe not optimal, maybe it will be better with a different threshold

In [284]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics.pairwise import cosine_similarity

def merging_step3(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:  
        #the head of noun phrase is marked with value 0 for the word.head
        cand_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]
        #np_pos_tags = {word.text: word.xpos for sent in doc.sentences for word in sent.words}
        #print(np_heads_pos)
        cand_labeling_phrases = []
        for word, head, pos in cand_heads_pos:
            #head-1 because the pointer to head does not use 0 index
            if (pos == 'JJ' or pos=='VBN') and 'NN' in cand_heads_pos[head-1][2]:
                cand_labeling_phrases.append(f'{word}_{cand_heads_pos[head-1][0]}')
        phrases.append(cand_labeling_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for up_cand_id in range(len(candidate_list)):     
        up_cand = candidate_list[up_cand_id]
        up_cand_vectors = phrases_vectors(phrases[up_cand_id])
        if len(up_cand_vectors)==0:
            pass
        else:
            for low_cand_id in range(up_cand_id+1,len(candidate_list)): 
                low_cand = candidate_list[low_cand_id]
                low_cand_vectors = phrases_vectors(phrases[low_cand_id])
                if len(low_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(up_cand_vectors),len(low_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(up_cand_vectors)):
                        for j in range(len(low_cand_vectors)):

                            sim_matrix[i][j] = 1-cosine(up_cand_vectors[i],low_cand_vectors[j])

                    # can we compute matrix mean like this? 
                    #print(sim_matrix)
                    if np.mean(sim_matrix) > 0.6:
                        #print(f'{longer_cand} and {cand} are {numpy.mean(sim_matrix)} similar' )
                        indices_to_remove.add(low_cand_id)
                        what_merged3[up_cand[0].lower()].append(low_cand[0].lower())
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove
                


def phrases_vectors(cand_phrases):
    
#for cand_phrases in phrases:
    #print(cand_phrases)
    cand_phrase_vectors = []
    for phrase in cand_phrases:
        try:
            cand_phrase_vectors.append(model[phrase])
            #print(f'for existing phrase "{phrase}" the vector is {model[phrase][0]}')
        except KeyError:
            phrase_words = phrase.split('_')
            #print(model[phrase_words[1]])
            try:
                phrase_vectors = [model[phrase_word] for phrase_word in phrase_words]
                #print(f'for phrase "{phrase}" avg vector is "{sum(phrase_vectors)/len(phrase_vectors)}') 
                cand_phrase_vectors.append(sum(phrase_vectors)/len(phrase_vectors))
            except KeyError:
                cand_phrase_vectors.append(np.NaN)
    #print(len(cand_phrase_vectors))
    return cand_phrase_vectors
    
 
#event_cands_merged = merge_indices(event_cands_merged, merging_step3(event_cands_merged))
#print(indices_to_remove)

In [None]:
what_merged3
    

### Merging step 4

In [285]:
# missing the second method - we check for the lexical identity of specific stems in multiple candidates.

def merging_step4(cand_df):
    phrases = []
    indices_to_remove = set()
    # 1. first we find adj-nn phrases within the candidate
    for candidate in cand_df['cand_tags']:

        #the head of noun phrase is marked with value 0 for the word.head
        cand_heads_pos = [(word.text, word.head, word.xpos) for word in candidate.words]

        #print(np_heads_pos)
        cand_compound_phrases = []
        for word, head, pos in cand_heads_pos:
            #i = np_heads_pos.index((word, head, pos))
            #print(np_heads_pos)
            #print(np_heads_pos[i])
            #print(np_heads_pos[head-1])
            #'NN' in np_heads_pos[head-1][2] and
            try:
                #if 'NN' in pos and 'NN' in cand_heads_pos[i+1][2] : 
                    #cand_compound_phrases.append(f'{word}_{cand_heads_pos[i+1][0]}')
                if 'NN' in pos and 'NN' in cand_heads_pos[head-1][2]:
                    cand_compound_phrases.append(f'{word}_{cand_heads_pos[head-1][0]}')
            except IndexError:
                pass
        phrases.append(cand_compound_phrases)
    
    candidate_list = cand_df['candidates']
    # 2. we compare the similarities of candidates' phrases
    for up_cand_id in range(len(candidate_list)):     
        up_cand = candidate_list[up_cand_id]
        up_cand_vectors = phrases_vectors(phrases[up_cand_id])
        if len(up_cand_vectors)==0:
            pass
        else:
            for low_cand_id in range(up_cand_id+1,len(candidate_list)):
                low_cand = candidate_list[low_cand_id]
                low_cand_vectors = phrases_vectors(phrases[low_cand_id])
                if len(low_cand_vectors)==0:
                    pass
                else:
                    sim_matrix = np.zeros((len(up_cand_vectors),len(low_cand_vectors)))
                    #print(sim_matrix)
                    for i in range(len(up_cand_vectors)):
                        for j in range(len(low_cand_vectors)):
                            #print(cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)))
                            sim_matrix[i][j] = 1-cosine(up_cand_vectors[i],low_cand_vectors[j])
                            """if cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.4:                
                                sim_matrix[i][j] = 2
                            elif cosine_similarity(long_cand_vectors[i].reshape(1,-1),short_cand_vectors[j].reshape(1,-1)) > 0.2:
                                sim_matrix[i][j] = 1
                            else:
                                sim_matrix[i][j] = 0"""

                    #print(sim_matrix, up_cand,low_cand)            
                    if np.mean(sim_matrix) > 0.6:
                        print(f'{up_cand_id} and {low_cand_id} are {np.mean(sim_matrix)} similar' )
                        indices_to_remove.add(low_cand_id)
                        what_merged4[up_cand[0].lower()].append(low_cand[0].lower())
                    #else:
                        #print(f'{numpy.mean(sim_matrix)} is not similar' )
                    
    return indices_to_remove


#event_cands_merged = merge_indices(event_cands_merged, merging_step4(event_cands_merged))
#print(merging_step4(candidate_list))

In [None]:
what_merged

### Merging step 5


In [286]:
what_merged1,what_merged2,what_merged3,what_merged4 = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
event_cands = beirut_cands

event_cands_merged = merge_indices(event_cands, merging_step1(event_cands['candidates']))




100%|██████████████████████████████████████████████████████████████████████████| 207551/207551 [55:17<00:00, 62.57it/s]


Initial amount of candidates: 207551
Amount of candidates: 14465, after removing 193086 indices


In [None]:
event_cands_merged = merge_indices(event_cands_merged, merging_step2(event_cands_merged['candidates']))

what_merged2

  5%|███▊                                                                        | 717/14465 [17:18<7:50:16,  2.05s/it]

In [None]:
event_cands_merged = merge_indices(event_cands_merged, merging_step3(event_cands_merged))

what_merged3

In [None]:
event_cands_merged = merge_indices(event_cands_merged, merging_step4(event_cands_merged))
what_merged4

In [None]:
pickle_files('moria_cands_merged',event_cands_merged)
pickle_files('moria_whatmerged2',what_merged2)
pickle_files('moria_whatmerged3',what_merged3)
pickle_files('moria_whatmerged4',what_merged4)

## Frame identification

In [None]:
"""frame_properties = {'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'unreason/irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']}""" #from Hamborg's paper

# from paper Shifting the refugee narratives? by Greussing & Boomgaarden (2015)
frame_properties = {'settlement':['settlement','accomodation','permanent','temporary','barracks','accommodated','tent','camp', 'shelter'],
                   'reception':['quota', 'distribution', 'limit', 'selection','reception','together','asylum','receive'],
                    'security':['security', 'border','crossing','fence','control','flow'],
                    'criminality':['officer','terror','suspicion','crime','offense','police','trafficking','suspect'],
                    'economisation':['euro','economic','million','thousand','cost','money'],
                    'humanitarian':['humane','voluntary','help','support','aid','care','solidarity'],
                    'victimization':['fight','victim','war','dead','rescued','state'],
                    'integration': ['labour','employed','unemployed','integration','positive'],
                    
                    #from hamborg
                    'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']
                   
                   }



In [None]:
what_merged2 = load_pickle('beirut_whatmerged3')
what_merged2

In [None]:
#import conceptnet_lite as cn
import gensim
import gensim.downloader as api

#model = gensim.models.KeyedVectors.load_word2vec_format(r"C:/Users/niol19ac/Dropbox (CBS)/Master thesis data/GoogleNews-vectors-negative300.bin.gz", binary=True)

manual_cands = ['refugee','migrant','greece','turkey','syria','beirut','immigrant','aoun']


# to run on the server we should use larger model according to the paper - "conceptnet-numberbatch-17-06-300"
model = api.load("glove-twitter-200")


In [None]:
from nltk.corpus import stopwords
from collections import defaultdict
import nltk

nltk.download('stopwords')

stop_words = list(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

print('preprocessing tweets...')
tqdm.pandas()
tweets_corpus = list(event_df['text'].progress_apply(preprocessing.preprocess_tweets))


print('assigning frame properties to words from tweets...')
word_properties = defaultdict(dict)
for i in tqdm(tweets_corpus):
    tweet_words = [word.lower() for word in i.split() if word not in stop_words and len(word)>1]
    for word in tweet_words:
        #print(word)
        word = lemma.lemmatize(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(f'sim of {word}, {prop} is {model.similarity(word, prop)}')
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
                #print(weights)
                if max(weights)>0.4:
                    word_properties[word][prop] = max(weights)
            except KeyError:
                pass
            



print(word_properties)
        

            

In [None]:
tagged_tweets = load_pickle('beirut_tagged_tweets')
coref_chains = load_pickle('moria_crf_list')

#coref_chains[2]

In [None]:
for tweet_id in range(len(tagged_tweets))[:10]:
    print(coref_chains[tweet_id])
    print(tagged_tweets[tweet_id].text)

In [None]:
from collections import defaultdict
# import these modules 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus.reader.wordnet import NOUN
import numpy as np
  
lemma = WordNetLemmatizer() 

cand_frames = defaultdict(list)

framed_words = pd.DataFrame(columns=['word','date',list(frame_properties.keys())])

event_df[['date','time']] = event_df['created_at'].str.split(' ',expand=True)


for tweet_id in tqdm(range(len(tagged_tweets))):
    #print(tweet)
    cand_words = [[word.id, word.text,word.head] for sent in tagged_tweets[tweet_id].sentences for word in sent.words]
    #print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')
    #print(len(cand_df['candidates']))
    for cand in manual_cands:#event_merged_cands['candidates']:
            # lemmatize representative head of candidate 
            rep_head = cand #lemma.lemmatize(cand[1].lower(),pos=NOUN)
            
            if rep_head in tagged_tweets[tweet_id].text.lower() and len(rep_head)>1:
                #find all dependencies of the phrase head
                for related in range(len(cand_words)):
                    cand_word_lemma = lemma.lemmatize(cand_words[related][1].lower())
                    #print(f'Yes it is, related = {lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)}')
                    #lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)
                    if rep_head == cand_word_lemma:
                        related_word = lemma.lemmatize(cand_words[cand_words[related][2]-1][1].lower())
                        cand_frames['word'].append(rep_head)
                        cand_frames['date'].append(event_df['date'][tweet_id])
                        #cand_frames['word'].append(phrase_head)
                        for frame_property in list(frame_properties.keys()):
                            #print(frame_property)

                            try:
                                #print(word_properties[phrase_head][frame_property])
                                cand_frames[frame_property].append(word_properties[related_word][frame_property])

                            except KeyError:
                                #print('Error')
                                #cand_frames[frame_property].append(word_properties['tent'][frame_property])
                                cand_frames[frame_property].append(np.NaN)
                                

                    #print(len(cand_frames[frame_property]))
                    
                #print('\n')
                
                
                """for frame_property in list(frame_properties.keys()):
                    for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    #print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                            #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                        except KeyError:
                            pass"""
                        #[cand_frames[seed_word][cand].append(model.similarity(print(f'{cand} is related to {np_heads[np_heads[related][2]+1][1]}') if cand == np_heads[related][1] else print('nej') for related in range(len(np_heads))]
            #print(get_head(cand))
            #print(np_heads[19][1])
            #[f(x) if condition else g(x) for x in sequence]
            #[print(np_heads[np_heads[related][2]-1]) if get_head(cand)==np_heads[related][1] else print('hi') for related in range(len(np_heads))]

            
#became ___ (vb and vbx)
#(VP sit/VB (PP on/IN (NP the/DT mat/NN))))) 

#common phrases = migrant camp, covid case, covid test

#cand_frames

In [None]:
print(cand_frames.keys())

framed_words = pd.DataFrame.from_dict(cand_frames)

#framed_words[framed_words['word']=='migrants'].tail(50)

#framed_words = framed_words.dropna(subset=['settlement', 'reception', 'security', 'criminality', 'economisation', 'humanitarian', 'victimization', 'integration', 'affection', 'refusal', 'trustworthiness', 'no trustworthiness', 'reason', 'unreason/irrationality', 'easiness', 'difficulty', 'honor', 'dishonor'],how='all')

framed_words

In [None]:
the_word = 'refugee'

merged_frames = framed_words.copy()

#framed_words[framed_words['date']=='2020-09-04']

merged_frames['no trustworthiness'] = - merged_frames['no trustworthiness']
merged_frames['refusal'] = - merged_frames['refusal']
merged_frames['unreason/irrationality'] = -merged_frames['unreason/irrationality']
merged_frames['difficulty'] = -merged_frames['difficulty'] 
merged_frames['dishonor'] = -merged_frames['dishonor']


trust = ['trustworthiness', 'no trustworthiness']
honor = ['honor', 'dishonor']
affection = ['affection','refusal']
reason = ['reason','unreason/irrationality']
easiness = ['easiness','difficulty']

merged_frames = pd.lreshape(merged_frames,
                        {'reason':reason, 'honor':honor,'affection':affection,'trust':trust,'easiness':easiness},
                       dropna=False)


merged_frames = merged_frames[merged_frames['word'] == the_word]

aggr_frames = merged_frames.groupby(['word','date'],as_index=False).mean()
frame_size = merged_frames.groupby(['word','date'],as_index=False).size()

aggr_frames

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots

fig = make_subplots(specs=[[{"secondary_y": True}]])


ax1 = px.line(aggr_frames, x="date", y=['settlement'],render_mode='webgl')

ax2 = px.line(frame_size, x="date", y=['size'],render_mode='webgl')

ax2.update_traces(yaxis='y2')
fig.add_traces(ax1.data + ax2.data)

fig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

fig.show()

In [None]:
fig = px.line(aggr_frames, x="date", y=['difficulty','easiness'], title=f'Frame bias towards {the_word}')
fig.show()

# TESTING:

In [None]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus)[:50] 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

In [None]:


for tweet in tqdm(range(len(tweets_corpus))):
    print(tweets_corpus[tweet])
    np_heads = [[word.id, word.text,word.head,word.deprel] for sent in tagged_tweets[tweet].sentences for word in sent.words]
    print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet].sentences for word in sent.words], sep='\n')
    #print(np_heads)
    ph_ids = set([np_heads[i][2] for i in range(len(np_heads))])
    ph_words = [np_heads[i-1][1] for i in ph_ids]

    word_pairs = [(np_heads[word][1], np_heads[np_heads[word][2]-1][1]) for word in range(len(np_heads)) if np_heads[word][2] != 0]
    #print(word_pairs)
    
    compounds = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if 'compound' in np_heads[i][3]]
    print(compounds)
    
    advmods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='advmod']
    print(advmods)
    
    amods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='amod']
    print(amods)
    for pair in word_pairs:
        phrase = pair[0]+'_'+pair[1]

    #print(model.most_similar('illegal_immigrant'))

    
    """#print(len(cand_df['candidates']))
    candidate_list = cand_df['candidates']
    for cand in cand_df['candidates']:
        #print(cand[2])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:
            #print(phrase_head)
            #if str(cand[1]) in str(tweet):
            if str(phrase_head) in str(tweet) and len(phrase_head)>2:
                #print(phrase_head)
                ph_words = [np_heads[i-1][1] for i in phrase_heads]
                #print(ph_words)
                for related in range(len(np_heads)):
                    if phrase_head == np_heads[related][1]:
                        pass
                        #print(f'checking {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                  for frame_property in list(frame_properties.keys()):
                        for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    #cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                                #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                                except KeyError:
                                pass"""