# Candidate merging and related preprocessing


**Necessary files:**
 - event_df = df\_[event]\_clean.csv file with event dataframes with unique tweets only
 
 _the goal of this notebook is to tag all tweets from event_df and extract all noun phrases. Noun phrases will serve as candidates and using pipeline function they are categorised and finally only unique (and cleaned) candidates will be saved into event_cands dataframe_

In [1]:
#python libraries
import stanza

import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
import time
from collections import Counter, defaultdict


# self written modules
import preprocessing
import candidate_processing as cand_prep
import candidate_extraction as cand_ex

import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. We import the data

In [7]:
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_clean.csv" # for Greece
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_tigray_clean.csv" # for Tigray
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_rohingya_clean.csv" # for Rohingya

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(rohingya_url)
event_df.head()

loaded 22966 tweets!


Unnamed: 0,text_coherent,id,date,retweet_count_sum,text_alphanum,text_stm
0,"For rohingya Survivors in Bangladesh, Artwork ...",1373792416126402560,2021-03-22,2,for rohingya survivors in bangladesh artwork b...,rohingya survivor bangladesh artwork bear witn...
1,AstraZeneca dispels Indonesian Muslim concerns...,1373800977778700288,2021-03-22,1,astrazeneca dispels indonesian muslim concerns...,astrazeneca dispels indonesian muslim concern ...
2,I think u are one of the illegally migrant Roh...,1373802051524730880,2021-03-22,0,i think u are one of the illegally migrant roh...,think illegally migrant rohingya bangladesh be...
3,India seals Myanmar border amid strains over r...,1373802536579174401,2021-03-22,0,india seals myanmar border amid strains over r...,india seal myanmar border amid strain refugee ...
4,"Fleeing coup, Myanmar police refugees in India...",1373804367757807619,2021-03-22,1,fleeing coup myanmar police refugees in india ...,fleeing coup myanmar police refugee india seek...


## First,  extracting noun phrases

In [None]:
# this code runs for around another 13h per 100k tweets
from stanza.server import CoreNLPClient

#use "with" so the client stops properly after finished
with CoreNLPClient(annotators=["tokenize,ssplit,pos,parse"], timeout=6000000, memory='8G') as client:
        print('extracting noun phrases...')
        tqdm.pandas()
        # get noun phrases with tregex using get_noun_phrases function
        event_df['noun_phrases'] = event_df['text_coherent'].progress_apply(cand_ex.get_noun_phrases,args=(client,"tokenize,ssplit,pos,parse"))

np_list = list(event_df['noun_phrases'])
len(np_list)

In [12]:
pickle_file('rohingya_noun_phrases',np_list)

#np_list = load_pickle("moria_short_noun_phrases")

## 2. instantiate stanza english language module

In [2]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#stanza.install_corenlp()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# loading the pipeline
en_nlp = stanza.Pipeline("en", tokenize_pretokenized=True, ner_batch_size=4096)

2021-06-01 10:31:57 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-06-01 10:31:57 INFO: Use device: cpu
2021-06-01 10:31:57 INFO: Loading: tokenize
2021-06-01 10:31:57 INFO: Loading: pos
2021-06-01 10:31:58 INFO: Loading: lemma
2021-06-01 10:31:58 INFO: Loading: depparse
2021-06-01 10:31:59 INFO: Loading: sentiment
2021-06-01 10:31:59 INFO: Loading: ner
2021-06-01 10:32:01 INFO: Done loading processors!


## Tag tweets using stanza module to get NER and POS tags in tweets. We do it in batches to speed things up.

In [5]:
event_tagged_tweets = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(list(event_df['text_coherent']))]


100%|██████████████████████████████████████████████████████████████████████████| 19912/19912 [5:40:59<00:00,  1.03s/it]


In [6]:
pickle_file('tigray_tagged_tweets',event_tagged_tweets)

In [8]:
event_tagged_tweets = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(list(event_df['text_coherent']))]

100%|██████████████████████████████████████████████████████████████████████████| 22966/22966 [4:58:17<00:00,  1.28it/s]


In [9]:
pickle_file('rohingya_tagged_tweets',event_tagged_tweets)

## Pipeline for candidate identification

**Necessary files:**
 - event_np_list = pickled file of list of noun phrases
 - event_tagged_tweets = pickled file with NER and POS tags for all tweets

In [3]:
def load_event_data(event_name):
    assert event_name in ['greece','tigray','rohingya','moria'], f"Oh no! We do not analyze {event_name} event"
    
    print(f'Loading {event_name} data...')
    try:
        #sample = 2000
        event_np_list = load_pickle(event_name + '_noun_phrases')#[1000:sample]
        event_tagged_tweets = load_pickle(event_name + '_tagged_tweets')#[1000:sample]
        return event_np_list,event_tagged_tweets
    except:
        print(f'The {event_name} files not found! First extract noun phrases and tag tweets of the {eventname}_df')
        return None


In [9]:
def pipeline(event_name):
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 1. LOAD THE DATA ~~~~~~~~~~~~~~~~~~~~~
    event_np_list,event_tagged_tweets = load_event_data(event_name)
    print('did I get here?')
    ####  ~~~~~~~~~~~~~~~~~~~~~ 2. GET POS AND NER TAGS ~~~~~~~~~~~~~~~~~~~~~
    # get list of tuples (POS-tags of each word, NER-tags of each named entity) 
    tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 3. PREPROCESS CANDIDATES ~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ processing of noun phrases ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} noun phrase candidates...')
    
    tqdm.pandas()
    # remove NP candidates longer than threshold and remove all child NPs of parent NPs
    event_np_list = cand_prep.remove_long_nps(event_np_list)
    event_np_list = cand_prep.remove_child_nps(event_np_list) 
    event_np_list = cand_prep.remove_char(event_np_list,'@')

    event_np_list = [['no_candidate'] if len(noun_ps)==0 or noun_ps ==' ' else noun_ps for noun_ps in event_np_list ]
    
    #print(event_np_list)
    print(f'Tagging {event_name} noun phrase candidates...')
    #tag all tweets and save them in a list    

    #tagged_np_cands = batched_np_list.progress_apply(en_nlp)
    tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]
    #tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]

    np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
    #print(np_cand_heads)
    
    np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)
    #print(event_np_list)
          
          
    # ~~~~~~~~~~~~~~~~~~~~ combining candidate lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #concatenate corefs and noun phrase lists
    nps_cands = [cand for cands in np_and_cand_list for cand in cands]
    #candidate_list = coref_and_cand_list + np_and_cand_list

    #unpack list of lists into one list
    candidate_list = nps_cands
          
    nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]

    all_cands_tagged = nps_tagged

        
    #print(len(candidate_list),'vs', len(all_cands_tagged))
    cand_df = pd.DataFrame(
        {'candidates': candidate_list,
         'cand_tags': all_cands_tagged
        })

    cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
    cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))


    count_cands = Counter(cand_df['cand_text'])
    cand_df['cand_freq'] = cand_df["cand_text"].map(count_cands)
    
    #count_cands[cand_df['cand_text']]
    #count_sorted = sorted(count_cands.items(),key=lambda x: x[1],reverse=True)
    cand_df.columns = cand_df.columns.str.strip()
    
          
    # we sort the candidates by their length
    cand_df.sort_values('cand_freq', ascending=False,inplace=True)

    #cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

    cand_df.reset_index(drop=True, inplace = True)
    #remove dummy candidates that were used to avoid errors

    
    cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
    cand_df = cand_df[cand_df.cand_text != 'no_candidate']
    print(len(cand_df))    
    cand_df.reset_index(drop=True,inplace=True)
          
    return cand_df
          

event_cands = pipeline('rohingya')

#pickle_file('moria_cands_df', moria_cands)

Loading rohingya data...


  1%|▋                                                                           | 203/22966 [00:00<00:11, 2011.48it/s]

did I get here?


100%|██████████████████████████████████████████████████████████████████████████| 22966/22966 [00:07<00:00, 2990.35it/s]


Processing rohingya noun phrase candidates...
removing long candidates...
Removed 0 candidates longer than 9 words!
removing child NP candidates...


  0%|                                                                                        | 0/22966 [00:00<?, ?it/s]

Removed 40002 child NP candidates!
Tagging rohingya noun phrase candidates...


100%|██████████████████████████████████████████████████████████████████████████| 22966/22966 [3:13:14<00:00,  1.98it/s]
100%|████████████████████████████████████████████████████████████████████████████| 22966/22966 [04:57<00:00, 77.12it/s]


183488


In [10]:
event_cands

Unnamed: 0,candidates,cand_tags,cand_text,cand_len,cand_freq
0,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,2483
1,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,2483
2,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,2483
3,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,2483
4,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,2483
...,...,...,...,...,...
183483,"(Agency freya_cole, freya_cole, {freya_cole}, ...","[\n {\n ""id"": 1,\n ""text"": ""Agency"",\n ...",Agency freya_cole,2,1
183484,"(Injuries and death, Injuries, {Injuries, deat...","[\n {\n ""id"": 1,\n ""text"": ""Injuries"",\...",Injuries and death,3,1
183485,"(areas of ethnic groups, areas, {areas, groups...","[\n {\n ""id"": 1,\n ""text"": ""areas"",\n ...",areas of ethnic groups,4,1
183486,"(the junta' s airstrikes, airstrikes, {airstri...","[\n {\n ""id"": 1,\n ""text"": ""the"",\n ...",the junta' s airstrikes,4,1


In [None]:
pickle_file('tigray_cands', event_cands)

### Candidates as identified by stanza library still have a lot of noise to be removed. Cleaner candidates merge better and throwing away duplicate candidates or candidates without useful information speeds up merging.

In [11]:
#Finally the candidates are cleaned before storing in a file prior to merging

from nltk.corpus import stopwords

def clean_cands(event_cands):
    """
    Applying cleaning steps on candidates and engineering some features:
     1. creating a column with length of the tweet (in chars)
     2. lowercase the candidate information in the tuple with cand, candidate representative head and set of phrases heads
     3. extract candidate text and keep only alphanumeric chars
     4. remove candidates that are stopwords
     5. remove candidates that are only numeric
     6. remove candidates that are only 1 char long
     """
    def clean_cand(cand):
        cand = list(cand)
        cand[0] = re.sub(r'[^A-Za-z0-9 ]+', '', cand[0].lower())
        cand[1] = re.sub(r'[^A-Za-z0-9 ]+', '', cand[1].lower())
        cand[2] = set([re.sub(r'[^A-Za-z0-9 ]+', '', phrase_word.lower()) for phrase_word in cand[2]])

        return tuple(cand)

    #stopwords
    tqdm.pandas()
    event_cands_clean = event_cands.copy()
    
    
    event_cands_clean['candidates'] = event_cands_clean['candidates'].progress_apply(clean_cand)
    
    event_cands_clean['cand_text'] = event_cands_clean['cand_text'].progress_apply(lambda x:re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()).strip())
    event_cands_clean = event_cands_clean[~event_cands_clean['cand_text'].isin(stopwords.words('english'))]
    event_cands_clean['pure_chars'] = event_cands_clean['cand_text'].progress_apply(lambda x: x.replace(' ', ''))
    event_cands_clean = event_cands_clean[~event_cands_clean['pure_chars'].str.isnumeric()]
    event_cands_clean.drop('pure_chars',axis=1,inplace=True)
    
    event_cands_clean['string_len'] = event_cands_clean['cand_text'].progress_apply(len)
    event_cands_clean = event_cands_clean[event_cands_clean['string_len']>1]
    event_cands_clean = event_cands_clean.drop_duplicates(subset = ["cand_text"])
    event_cands_clean.reset_index(drop=True, inplace=True)
    print(f'The event has  {len(event_cands_clean)} unique candidates after cleaning')
    return event_cands_clean

event_cands_clean = clean_cands(event_cands)

100%|███████████████████████████████████████████████████████████████████████| 183488/183488 [00:01<00:00, 91926.97it/s]
100%|██████████████████████████████████████████████████████████████████████| 183488/183488 [00:00<00:00, 259163.25it/s]
100%|██████████████████████████████████████████████████████████████████████| 164787/164787 [00:00<00:00, 549295.76it/s]
100%|██████████████████████████████████████████████████████████████████████| 162945/162945 [00:00<00:00, 673231.98it/s]


The event has  50447 unique candidates after cleaning


In [12]:
pickle_file('rohingya_cands', event_cands_clean)

In [22]:
event_cands = load_pickle('moria_short_cands')