# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [1]:
#python libraries
import numpy as np
import pandas as pd
import os
import re
import gensim

import time
from tqdm import tqdm

from collections import Counter, defaultdict


# self written modules
import preprocessing


# storing python objects in the desired locations using pickle
import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. Import the data

In [114]:
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_tigray.csv" # location of Tigray dataset
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_greece.csv" # location of Greece dataset
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_rohingya.csv" # location of Rohingya dataset
moria_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_moria.csv" # location of Rohingya dataset
all_url = r"Dropbox (CBS)/Master thesis data/df_tweets.csv" # for all tweets

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(moria_url)
#channel_df = read_event_df(channel_url)
tqdm.pandas()
event_df['text_clean'] = event_df['text'].progress_apply(preprocessing.preprocess_tweets)

  0%|                                                                                       | 0/118696 [00:00<?, ?it/s]

loaded 118696 tweets!


100%|████████████████████████████████████████████████████████████████████████| 118696/118696 [00:46<00:00, 2578.01it/s]


In [115]:
FILE_PATH = os.getcwd() + "/../../../Dropbox (CBS)/Master thesis data"
USERS_PATH = FILE_PATH + "/df_users.csv"
# Read the users csv

print("loading users dataframe...")
df_users = pd.read_csv(USERS_PATH)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)

df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}


def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_clean'] = event_df['text_clean'].progress_apply(resolve_username_to_name)

loading users dataframe...


100%|██████████████████████████████████████████████████████████████████████| 118696/118696 [00:01<00:00, 106453.17it/s]


## Train our own event-specific Word2Vec model

In [116]:
#fuzzy duplicate removal (removes 100% duplicates before more expensive operations) 
#done on dataframe level, we want to keep the ID column to match later on

# the comparison will be done on lowercased texts consisting of only letters, digits and spaces
event_df['text_clean'] = event_df['text_clean'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))
unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
unique_tweets_df

100%|██████████████████████████████████████████████████████████████████████| 118696/118696 [00:00<00:00, 126360.21it/s]


Tweets at the start: 118696
Tweets after 100% duplicates removed: 91787
calculating similarities across documents...


6982it [00:00, 34779.43it/s]

Similarity calculation completed in 682.0573885440826 seconds
removing fuzzy duplicates...


167311it [00:04, 37236.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


81327 tweets left after 70.0% similar tweets (by cosine similarity) removed


Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,year,calendar_week,year_month,year_calendar_week,refugee,migrant,immigrant,asylum_seeker,other,is_dup
0,saperduper_robots,Migrants clash with Greek police as they exit ...,en,1233904804134555649,2020-03-01 00:00:04+00:00,108944513,3,0,1,1,...,2020,8,2020_3,2020_08,False,True,False,False,False,False
1,Loomly,"NATO urges Syria, Russia to halt airstrikes as...",en,1233904855774683136,2020-03-01 00:00:17+00:00,4717892303,12,5,14,3,...,2020,8,2020_3,2020_08,False,True,False,False,False,False
2,Twitter Web App,Good on Greece!!....Close all OUR Borders!!......,en,1233904867078393856,2020-03-01 00:00:19+00:00,1067006507189886976,2,1,4,0,...,2020,8,2020_3,2020_08,False,False,False,False,False,False
3,Hootsuite Inc.,Increased risk of #MaternalDeath in immigrants...,en,1233904894425419776,2020-03-01 00:00:26+00:00,3397107171,1,0,2,0,...,2020,8,2020_3,2020_08,False,False,True,False,False,False
4,Twitter for iPhone,@OhGodPlsNOO @ilknurdarendeli @AFP Europe has ...,en,1233904905108283393,2020-03-01 00:00:28+00:00,1188172981530435584,0,0,0,1,...,2020,8,2020_3,2020_08,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118688,Twitter for Android,@AngeloDDuca @GiuseppeConteIT @IslamSeries @Mi...,en,1241511809389137920,2020-03-21 23:47:36+00:00,738486583725305857,2,0,0,0,...,2020,11,2020_3,2020_11,True,False,False,False,False,False
118689,dlvr.it,Coronavirus risks taking heavy toll on migrant...,en,1241512682513002497,2020-03-21 23:51:04+00:00,2985565615,0,0,0,0,...,2020,11,2020_3,2020_11,False,True,False,False,False,False
118691,Twitter Web App,@ConanOBrien @TheEllenShow @IngrahamAngle @Nob...,en,1241513000147664896,2020-03-21 23:52:20+00:00,309530329,0,2,1,0,...,2020,11,2020_3,2020_11,False,False,False,False,False,False
118694,Tweepsmap,RT @WRRoute\n\nNatasha Dailiani calls upon med...,en,1241514169532321793,2020-03-21 23:56:58+00:00,62632306,3,0,1,0,...,2020,11,2020_3,2020_11,False,False,False,False,False,False


In [103]:
from gensim.models.phrases import Phrases,ENGLISH_CONNECTOR_WORDS

tweet_corpus_tokens = [tweet.split() for tweet in unique_tweets_df['text_clean']]
#tweet_corpus_tokens
bigram = Phrases(tweet_corpus_tokens, min_count=10, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) # higher threshold, fewer phrases.
trigram = Phrases(bigram[tweet_corpus_tokens],min_count=10, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) 


trigram.vocab

{'thank_you': 353,
 'your': 1132,
 'thank_you_for_your': 23,
 'statement': 109,
 'your_statement': 2,
 'now': 2122,
 'statement_now': 1,
 'what': 1114,
 'now_what': 10,
 'action': 855,
 'what_action': 2,
 'is': 7358,
 'action_is': 16,
 'being': 723,
 'is_being': 83,
 'taken': 192,
 'being_taken': 11,
 'get': 429,
 'taken_to_get': 1,
 'aid': 828,
 'get_aid': 9,
 'all': 1503,
 'aid_to_all': 3,
 'refugees': 6441,
 'all_refugees': 63,
 'civilians': 630,
 'refugees_and_civilians': 5,
 'tigray': 15900,
 'civilians_in_tigray': 36,
 'unhcr_the_un': 1508,
 'tigray_unhcr_the_un': 36,
 'refugee_agency': 1606,
 'unhcr_the_un_refugee_agency': 1489,
 'united_nations': 1330,
 'refugee_agency_united_nations': 56,
 'michelle_bachelet': 86,
 'united_nations_michelle_bachelet': 2,
 'antnio_guterres': 364,
 'michelle_bachelet_antnio_guterres': 8,
 'united_nationshumanrights': 129,
 'antnio_guterres_united_nationshumanrights': 2,
 'un_ocha': 27,
 'united_nationshumanrights_un_ocha': 1,
 'ethiopia': 4823,
 

In [104]:
# OPTIONAL: use this if training of ngram models is complete, it only serves purpose of saving memory
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [105]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords

model_phrases = [trigram_mod[tweet] for tweet in tweet_corpus_tokens]
print(model_phrases[:10])

[['thank_you', 'for', 'your', 'statement', 'now', 'what', 'action', 'is', 'being', 'taken', 'to', 'get', 'aid', 'to', 'all', 'refugees', 'and', 'civilians', 'in', 'tigray', 'unhcr_the_un', 'refugee_agency', 'united_nations', 'michelle', 'bachelet', 'antnio_guterres', 'united', 'nationshumanrights', 'un_ocha', 'ethiopia', 'filippo_grandi'], ['but', 'antnio_guterres', 'said', 'there', 'are', 'no', 'eritrean_troops', 'in', 'ethiopia', 'he', 'needs', 'to', 'retract', 'his', 'statement', 'withdraw', 'his', 'bid', 'for', 'second', 'term', 'tigray_genocide'], ['we', 'call', 'for', 'global', 'solidarity', 'to', 'stop', 'killings_abductions', 'starvation', 'of', 'civilians', 'and', 'refugees', 'by', 'ethiopian', 'and', 'eritrean_army', 'lord', 'tariqahmad', 'of', 'wimbledon', 'marisepayne', 'mona', 'juul', 'brbel', 'kofler', 'mdb', 'annika', 'ben', 'david', 'minnaliina', 'lind', 'marc', 'garneau'], ['un', 'refugees', 'chief', 'trippin', 'the', 'eritrean_refugees', 'who_were', 'living', 'at', 's

In [None]:
# FINDING BEST PARAMETERS FOR WORD2VEC MODEL

#negatives = [5,10,20]
sizes = [100,200,300]
sgs=[0,1]
windows =[3,5] 
#cbow_means = [0,1]
#iters=[10]

for size in sizes:
        for window in windows:
            #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
            print(f'\nfor params size={size},window={window}')
            model = Word2Vec(model_phrases,vector_size=size,window=window)
            print(model.wv.most_similar('refugees'))


In [106]:
import gensim.downloader as api
from gensim.models import Word2Vec

model = Word2Vec(model_phrases,vector_size=300,window=3)
print('first model done')
#model2 = api.load("glove-twitter-200")
#print('second model done')
#model3 = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)
#print('third model done')



first model done


In [107]:
model.wv.most_similar('refugees')

[('eritrean_refugees', 0.7808598875999451),
 ('migrants', 0.7493160367012024),
 ('60000_refugees', 0.7415494322776794),
 ('have_fled', 0.717192530632019),
 ('20000_refugees', 0.7117390036582947),
 ('residents', 0.7110738754272461),
 ('border', 0.7052704691886902),
 ('eritrean_refugee', 0.7011108994483948),
 ('refugee', 0.6936578750610352),
 ('their_homes', 0.6899452209472656)]

In [86]:
model.wv.most_similar('refugees')

[('immigrants', 0.7028154730796814),
 ('asylum_seekers', 0.6707106232643127),
 ('illegal_migrants', 0.6094726324081421),
 ('illegal_immigrants', 0.6072850227355957),
 ('migrants', 0.6011469960212708),
 ('people', 0.595695972442627),
 ('syrians', 0.5859284400939941),
 ('economic_migrants', 0.579109251499176),
 ('these_people', 0.5709031224250793),
 ('illegals', 0.5647295713424683)]

In [109]:
pickle_file('tigray_w2v_model',model)

## FastText model

In [40]:
# FINDING BEST PARAMETERS FOR FASTTEXT MODEL
from gensim.models import FastText

sizes = [100,200,300]
#losses=['ns','hs','softmax']
windows =[3,5] 


for size in sizes:
        for window in windows:
            #for loss in losses:
                #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
                print(f'\nfor params size={size}, window={window} ')
                model = FastText(model_phrases,vector_size=size,window=window, epochs=10)
                print(model.wv.most_similar('refugee'))


for params size=100, window=3 
[('refugee_kid', 0.9725989699363708), ('refugee_law', 0.9665269255638123), ('refugee_exodus', 0.9585047364234924), ('refugee_flows', 0.954691469669342), ('refuge', 0.9496657252311707), ('refugee_status', 0.9360215067863464), ('refugee_child', 0.9352647662162781), ('refugee_policy', 0.9234868884086609), ('refugee_committee', 0.9191840291023254), ('kid_refugee', 0.9133069515228271)]

for params size=100, window=5 
[('refugee_kid', 0.9771548509597778), ('refugee_law', 0.9689993262290955), ('refugee_exodus', 0.960526704788208), ('refugee_flows', 0.9551403522491455), ('refuge', 0.9466025829315186), ('refugee_child', 0.9407798647880554), ('refugee_status', 0.9329602718353271), ('refugee_policy', 0.9241395592689514), ('refugee_committee', 0.9201054573059082), ('took_refuge', 0.909279465675354)]

for params size=200, window=3 
[('refugee_kid', 0.9748549461364746), ('refugee_law', 0.9703011512756348), ('refugee_exodus', 0.9645267724990845), ('refugee_flows', 0.95

In [110]:
from gensim.models import FastText

model2 = FastText(vector_size=300, window=3, sentences=model_phrases, min_n=4,max_n=6)

In [111]:
model2.wv.most_similar('refugee')

[('refuge', 0.9909552931785583),
 ('refuges', 0.9901469349861145),
 ('refugee_status', 0.9680262207984924),
 ('danish_refugee', 0.9666343927383423),
 ('refugee_said', 0.9661538600921631),
 ('refugees', 0.9506486058235168),
 ('un_refugee', 0.9475336670875549),
 ('iraqi_refugees', 0.9409911632537842),
 ('norwegian_refugee', 0.9401996731758118),
 ('refugee_camp', 0.9324396252632141)]

In [70]:
model2.wv.most_similar('refugee')

[('refugeees', 0.9461631178855896),
 ('refugee_girl', 0.9091788530349731),
 ('jesuit_refugee', 0.9081748723983765),
 ('imperilled_refugee', 0.8926677703857422),
 ('renewed_refugee', 0.8882556557655334),
 ('refugee_forum', 0.8876516222953796),
 ('refugee_scandal', 0.886340320110321),
 ('refugee_pact', 0.8844466209411621),
 ('refugee_boy', 0.8822042346000671),
 ('anti_refugee', 0.8774494528770447)]

In [98]:
model2.wv.similarity('refugee','migrant')

0.6189612

In [112]:
pickle_file('tigray_ft_model', model2)

In [61]:
path = os.getcwd() + "/../../../" + fr"Dropbox (CBS)/Master thesis data/Candidate Data/greece/greece_ft_model.bin"
model2.save(path)


## BERT embeddings

In [117]:
# train this model only after the first merging step to save both memory and time
event_cands = load_pickle('moria_short_cands')

In [118]:
event_cands

Unnamed: 0,candidates,cand_tags,cand_text,cand_len,cand_freq,string_len
0,"(greece, greece, {greece}, loc-ne)","[\n {\n ""id"": 1,\n ""text"": ""Greece"",\n ...",greece,1,18978,6
1,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,10656,8
2,"(fire, fire, {fire}, misc)","[\n {\n ""id"": 1,\n ""text"": ""fire"",\n ...",fire,1,7477,4
3,"(turkey, turkey, {turkey}, loc-ne)","[\n {\n ""id"": 1,\n ""text"": ""Turkey"",\n ...",turkey,1,4986,6
4,"(migrants, migrants, {migrants}, misc)","[\n {\n ""id"": 1,\n ""text"": ""migrants"",\...",migrants,1,4666,8
...,...,...,...,...,...,...
186493,"(wwf, wwf, {wwf}, misc)","[\n {\n ""id"": 1,\n ""text"": ""wwf"",\n ...",wwf,1,1,3
186494,"(its china, china, {china}, loc-ne)","[\n {\n ""id"": 1,\n ""text"": ""its"",\n ...",its china,2,1,9
186495,"(volunteers organizers and police, organizers,...","[\n {\n ""id"": 1,\n ""text"": ""volunteers,...",volunteers organizers and police,4,1,32
186496,"(pik pa moria lesbos, lesbos, {lesbos, pik}, m...","[\n {\n ""id"": 1,\n ""text"": ""pik"",\n ...",pik pa moria lesbos,4,1,19


In [183]:
from time import time
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

bert_corpus = list(unique_tweets_df['text_clean']) + list(event_cands['cand_text'])


t0 = time()
document_embeddings = sbert_model.encode(bert_corpus)
print(f'Training embeddings took {time()-t0} seconds')

Embedding took 13004.31396651268 seconds


In [195]:
print(document_embeddings.shape)
print(unique_tweets_df['text_clean'].shape)
print(event_cands['cand_text'].shape)

(267825, 768)
(81327,)
(186498,)


In [168]:

from sklearn.metrics.pairwise import cosine_similarity
 
sims = cosine_similarity(
    [document_embeddings[1002]],
    document_embeddings[1003:]
)

sim_df = pd.DataFrame({'text': bert_corpus[1003:],'sim': list(sims[0])})

In [None]:
rows_list = []
for up_cand_id in tqdm(range(len(bert_corpus[1000:]))):
    for low_cand_id in range(up_cand_id+1,len(bert_corpus[1000:])):
        sim = cosine_similarity(
            document_embeddings[up_cand_id].reshape(1,-1),
            document_embeddings[low_cand_id].reshape(1,-1)
        )
        dict1 = {}
        # get input row in dictionary format
        # key = col_name
        dict1.update({'text': bert_corpus[up_cand_id], 'text_to_compare':bert_corpus[low_cand_id], 'sim':[sim[0]]}) 

        rows_list.append(dict1)

sim_df = pd.DataFrame(rows_list)
sim_df

In [None]:
pickle_file('moria_sim_df',sim_df)

In [194]:
print(f'most similar to "{bert_corpus[1001]}":')

sim_df.columns = sim_df.columns.str.strip()

sim_df.sort_values('sim', ascending=False).head(50)

most similar to "i keep thinking about the refugees in greece who are receiving so much violence for fleeing certain death  to all these children and families who are lost in an extremely violent world  what will happen tonight and tomorrow without food and shelter":


Unnamed: 0,text,sim
35,fires,0.981077
456,flames,0.953122
382,this fire,0.948744
352,fire guts,0.942461
470,blaze,0.941246
13,a fire,0.939005
132,moria fire,0.908484
159,the fires,0.907713
9,the fire,0.907002
811,the flames,0.902333


array([[-0.25162622,  0.58416   ,  0.34161314, ..., -0.9051944 ,
         0.07126287, -0.21723138],
       [-0.28473833,  0.5902117 ,  0.8460189 , ..., -0.33095843,
        -0.47118548,  0.47009197],
       [-0.28960198,  1.0041703 ,  0.23169023, ..., -0.64314526,
        -0.23541632,  0.28106663],
       ...,
       [-0.03950938, -0.24995442,  1.3851532 , ..., -0.31755453,
         0.48770684, -0.23471756],
       [ 0.06689067, -0.868989  ,  1.4205451 , ..., -0.2091172 ,
         0.09321782, -0.24408102],
       [ 0.11261851,  0.28545126,  1.4639302 , ...,  0.28329983,
        -0.5337196 , -1.0075337 ]], dtype=float32)