# Embeddings training

**Required files:**
 - event_df_clean = event specific dataframe with preprocessed text, use column 'text_coherent' for training
 - event_cands_merged = dataframe of candidates that are merged after 1st step


Import relevant packages for the following parts

In [1]:
#python libraries
import numpy as np
import pandas as pd
import os
import re
import gensim

import time
from tqdm import tqdm

from collections import Counter, defaultdict

# self written modules
import preprocessing

# storing python objects in the desired locations using pickle
import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)


Reading english - 1grams ...
Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. Import the data

In [2]:
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_tigray_clean.csv" # location of Tigray dataset
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_clean.csv" # location of Greece dataset
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_rohingya_clean.csv" # location of Rohingya dataset
all_url = r"Dropbox (CBS)/Master thesis data/df_tweets.csv" # for all tweets

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(rohingya_url)


loaded 22966 tweets!


In [4]:
#fuzzy duplicate removal (removes 100% duplicates before more expensive operations - should be done before) 
#done on dataframe level, we want to keep the ID column to match later on

# the comparison will be done on lowercased texts consisting of only letters, digits and spaces
#event_df['text_clean'] = event_df['text_clean'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))
unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
unique_tweets_df

Tweets at the start: 22966
Tweets after 100% duplicates removed: 22956
calculating similarities across documents...


1787it [00:00, 17695.31it/s]

Similarity calculation completed in 57.66400074958801 seconds
removing fuzzy duplicates...


47992it [00:01, 24982.97it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


20017 tweets left after 70.0% similar tweets (by cosine similarity) removed


Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,migrant,immigrant,asylum_seeker,other,text_coherent,retweet_count_sum,count,text_alphanum,text_stm,is_dup
0,Twitter Web App,"For #Rohingya Survivors in Bangladesh, Artwork...",en,1373792416126402560,2021-03-22 00:23:30+00:00,77844813,2,1,2,0,...,False,False,False,False,"For rohingya Survivors in Bangladesh, Artwork ...",2,1,for rohingya survivors in bangladesh artwork b...,rohingya survivor bangladesh artwork bear witn...,False
1,dlvr.it,AstraZeneca dispels Indonesian Muslim concerns...,en,1373800977778700288,2021-03-22 00:57:31+00:00,1898083759,1,0,0,0,...,False,False,False,False,AstraZeneca dispels Indonesian Muslim concerns...,1,1,astrazeneca dispels indonesian muslim concerns...,astrazeneca dispels indonesian muslim concern ...,False
2,Twitter for Android,@prabha_j @MehHarshil @derekobrienmp I think u...,en,1373802051524730880,2021-03-22 01:01:47+00:00,1209116380257112064,0,0,1,0,...,True,False,False,False,I think u are one of the illegally migrant Roh...,0,1,prabhaj mehharshil derekobrienmp i think u are...,think illegally rohingya bangladesh better kee...,False
3,Twitter for Android,India seals Myanmar border amid strains over r...,en,1373802536579174401,2021-03-22 01:03:43+00:00,1032998054297780224,0,0,0,0,...,False,False,False,False,India seals Myanmar border amid strains over r...,0,1,india seals myanmar border amid strains over r...,india seal myanmar border amid strain crisis,False
4,Twitter for iPhone,"Fleeing coup, Myanmar police refugees in India...",en,1373804367757807619,2021-03-22 01:10:59+00:00,15552861,1,0,1,0,...,False,False,False,False,"Fleeing coup, Myanmar police refugees in India...",1,1,fleeing coup myanmar police refugees in india ...,fleeing coup myanmar police india seek asylum ...,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22961,Twitter for Android,they went wedding ceremony of they're uncle th...,en,1388268996110266374,2021-04-30 23:08:16+00:00,1360258667292033028,0,1,6,0,...,False,False,False,False,they went wedding ceremony of they' re uncle t...,0,1,they went wedding ceremony of theyre uncle the...,went wedding ceremony uncle enjoyed bamboo bri...,False
22962,BPbreakingnews,A labour group yesterday called on the governm...,en,1388272686372057088,2021-04-30 23:22:55+00:00,20583561,3,0,5,0,...,False,False,False,False,A labour group yesterday called on the governm...,3,1,a labour group yesterday called on the governm...,labour group yesterday called government speed...,False
22963,Twitter for Android,Declare a fly zone right now please. There are...,en,1388273220579741701,2021-04-30 23:25:03+00:00,1360335395939115010,0,0,0,0,...,False,False,False,False,Declare a fly zone right now please . There ar...,0,1,declare a fly zone right now please. there are...,declare zone right please million karen kachim...,False
22964,Twitter for iPhone,@lvandenassum MYANMAR MILITARY REGIME IS CHEAT...,en,1388273570539782151,2021-04-30 23:26:26+00:00,973730357051838465,0,0,0,0,...,False,False,False,False,"MYANMAR MILITARY REGIME IS CHEATING, MANIPULAT...",0,1,lvandenassum myanmar military regime is cheati...,myanmar military regime cheating manipulating ...,False


## BERT embeddings

In [5]:
# train this model only after the first merging step to save both memory and time
event_cands = load_pickle('rohingya_cands')

In [7]:
from time import time
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

bert_corpus = list(unique_tweets_df['text_alphanum']) + list(event_cands['cand_text'])

print(bert_corpus[-20:])
t0 = time()
document_embeddings = sbert_model.encode(bert_corpus)
print(f'Training embeddings took {time()-t0} seconds')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training embeddings took 6703.449328184128 seconds


In [25]:
print(document_embeddings.shape)
print(unique_tweets_df['text_alphanum'].shape)
print(event_cands['cand_text'].shape)

(70464, 768)
(20017,)
(50447,)


In [10]:
bert_corpus[20016]

'the fire in the rohingya refugee camp leaves 45 000 people without shelter and hundreds missing. httpst.cohpjilanthr'

In [168]:

from sklearn.metrics.pairwise import cosine_similarity
 
sims = cosine_similarity(
    [document_embeddings[1002]],
    document_embeddings[1003:]
)

sim_df = pd.DataFrame({'text': bert_corpus[1003:],'sim': list(sims[0])})

In [None]:
rows_list = []
for up_cand_id in tqdm(range(len(bert_corpus[1000:]))):
    for low_cand_id in range(up_cand_id+1,len(bert_corpus[1000:])):
        sim = cosine_similarity(
            document_embeddings[up_cand_id].reshape(1,-1),
            document_embeddings[low_cand_id].reshape(1,-1)
        )
        dict1 = {}
        # get input row in dictionary format
        # key = col_name
        dict1.update({'text': bert_corpus[up_cand_id], 'text_to_compare':bert_corpus[low_cand_id], 'sim':[sim[0]]}) 

        rows_list.append(dict1)

sim_df = pd.DataFrame(rows_list)
sim_df

In [None]:
pickle_file('moria_sim_df',sim_df)

In [194]:
print(f'most similar to "{bert_corpus[1001]}":')

sim_df.columns = sim_df.columns.str.strip()

sim_df.sort_values('sim', ascending=False).head(50)

most similar to "i keep thinking about the refugees in greece who are receiving so much violence for fleeing certain death  to all these children and families who are lost in an extremely violent world  what will happen tonight and tomorrow without food and shelter":


Unnamed: 0,text,sim
35,fires,0.981077
456,flames,0.953122
382,this fire,0.948744
352,fire guts,0.942461
470,blaze,0.941246
13,a fire,0.939005
132,moria fire,0.908484
159,the fires,0.907713
9,the fire,0.907002
811,the flames,0.902333


## Train our own event-specific Word2Vec model

In [None]:
from gensim.models.phrases import Phrases,ENGLISH_CONNECTOR_WORDS

tweet_corpus_tokens = [tweet.split() for tweet in unique_tweets_df['text_coherent']]
#tweet_corpus_tokens
bigram = Phrases(tweet_corpus_tokens, min_count=10, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) # higher threshold, fewer phrases.
trigram = Phrases(bigram[tweet_corpus_tokens],min_count=10, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) 


trigram.vocab

In [104]:
# OPTIONAL: use this if training of ngram models is complete, it only serves purpose of saving memory
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [105]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords

model_phrases = [trigram_mod[tweet] for tweet in tweet_corpus_tokens]
print(model_phrases[:10])

[['thank_you', 'for', 'your', 'statement', 'now', 'what', 'action', 'is', 'being', 'taken', 'to', 'get', 'aid', 'to', 'all', 'refugees', 'and', 'civilians', 'in', 'tigray', 'unhcr_the_un', 'refugee_agency', 'united_nations', 'michelle', 'bachelet', 'antnio_guterres', 'united', 'nationshumanrights', 'un_ocha', 'ethiopia', 'filippo_grandi'], ['but', 'antnio_guterres', 'said', 'there', 'are', 'no', 'eritrean_troops', 'in', 'ethiopia', 'he', 'needs', 'to', 'retract', 'his', 'statement', 'withdraw', 'his', 'bid', 'for', 'second', 'term', 'tigray_genocide'], ['we', 'call', 'for', 'global', 'solidarity', 'to', 'stop', 'killings_abductions', 'starvation', 'of', 'civilians', 'and', 'refugees', 'by', 'ethiopian', 'and', 'eritrean_army', 'lord', 'tariqahmad', 'of', 'wimbledon', 'marisepayne', 'mona', 'juul', 'brbel', 'kofler', 'mdb', 'annika', 'ben', 'david', 'minnaliina', 'lind', 'marc', 'garneau'], ['un', 'refugees', 'chief', 'trippin', 'the', 'eritrean_refugees', 'who_were', 'living', 'at', 's

In [None]:
# FINDING BEST PARAMETERS FOR WORD2VEC MODEL

#negatives = [5,10,20]
sizes = [100,200,300]
sgs=[0,1]
windows =[3,5] 
#cbow_means = [0,1]
#iters=[10]

for size in sizes:
        for window in windows:
            #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
            print(f'\nfor params size={size},window={window}')
            model = Word2Vec(model_phrases,vector_size=size,window=window)
            print(model.wv.most_similar('refugees'))


In [106]:
import gensim.downloader as api
from gensim.models import Word2Vec

# with parameters selected based on the previous cell
model = Word2Vec(model_phrases,vector_size=300,window=3)



first model done


In [107]:
model.wv.most_similar('refugees')

[('eritrean_refugees', 0.7808598875999451),
 ('migrants', 0.7493160367012024),
 ('60000_refugees', 0.7415494322776794),
 ('have_fled', 0.717192530632019),
 ('20000_refugees', 0.7117390036582947),
 ('residents', 0.7110738754272461),
 ('border', 0.7052704691886902),
 ('eritrean_refugee', 0.7011108994483948),
 ('refugee', 0.6936578750610352),
 ('their_homes', 0.6899452209472656)]

In [86]:
model.wv.most_similar('refugees')

[('immigrants', 0.7028154730796814),
 ('asylum_seekers', 0.6707106232643127),
 ('illegal_migrants', 0.6094726324081421),
 ('illegal_immigrants', 0.6072850227355957),
 ('migrants', 0.6011469960212708),
 ('people', 0.595695972442627),
 ('syrians', 0.5859284400939941),
 ('economic_migrants', 0.579109251499176),
 ('these_people', 0.5709031224250793),
 ('illegals', 0.5647295713424683)]

In [109]:
pickle_file('tigray_w2v_model',model)

## FastText model

In [40]:
# FINDING BEST PARAMETERS FOR FASTTEXT MODEL
from gensim.models import FastText

sizes = [100,200,300]
#losses=['ns','hs','softmax']
windows =[3,5] 


for size in sizes:
        for window in windows:
            #for loss in losses:
                #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
                print(f'\nfor params size={size}, window={window} ')
                model = FastText(model_phrases,vector_size=size,window=window, epochs=10)
                print(model.wv.most_similar('refugee'))


for params size=100, window=3 
[('refugee_kid', 0.9725989699363708), ('refugee_law', 0.9665269255638123), ('refugee_exodus', 0.9585047364234924), ('refugee_flows', 0.954691469669342), ('refuge', 0.9496657252311707), ('refugee_status', 0.9360215067863464), ('refugee_child', 0.9352647662162781), ('refugee_policy', 0.9234868884086609), ('refugee_committee', 0.9191840291023254), ('kid_refugee', 0.9133069515228271)]

for params size=100, window=5 
[('refugee_kid', 0.9771548509597778), ('refugee_law', 0.9689993262290955), ('refugee_exodus', 0.960526704788208), ('refugee_flows', 0.9551403522491455), ('refuge', 0.9466025829315186), ('refugee_child', 0.9407798647880554), ('refugee_status', 0.9329602718353271), ('refugee_policy', 0.9241395592689514), ('refugee_committee', 0.9201054573059082), ('took_refuge', 0.909279465675354)]

for params size=200, window=3 
[('refugee_kid', 0.9748549461364746), ('refugee_law', 0.9703011512756348), ('refugee_exodus', 0.9645267724990845), ('refugee_flows', 0.95

In [110]:
from gensim.models import FastText

model2 = FastText(vector_size=300, window=3, sentences=model_phrases, min_n=4,max_n=6)

In [111]:
model2.wv.most_similar('refugee')

[('refuge', 0.9909552931785583),
 ('refuges', 0.9901469349861145),
 ('refugee_status', 0.9680262207984924),
 ('danish_refugee', 0.9666343927383423),
 ('refugee_said', 0.9661538600921631),
 ('refugees', 0.9506486058235168),
 ('un_refugee', 0.9475336670875549),
 ('iraqi_refugees', 0.9409911632537842),
 ('norwegian_refugee', 0.9401996731758118),
 ('refugee_camp', 0.9324396252632141)]

In [70]:
model2.wv.most_similar('refugee')

[('refugeees', 0.9461631178855896),
 ('refugee_girl', 0.9091788530349731),
 ('jesuit_refugee', 0.9081748723983765),
 ('imperilled_refugee', 0.8926677703857422),
 ('renewed_refugee', 0.8882556557655334),
 ('refugee_forum', 0.8876516222953796),
 ('refugee_scandal', 0.886340320110321),
 ('refugee_pact', 0.8844466209411621),
 ('refugee_boy', 0.8822042346000671),
 ('anti_refugee', 0.8774494528770447)]

In [98]:
model2.wv.similarity('refugee','migrant')

0.6189612

In [112]:
pickle_file('tigray_ft_model', model2)

In [61]:
path = os.getcwd() + "/../../../" + fr"Dropbox (CBS)/Master thesis data/Candidate Data/greece/greece_ft_model.bin"
model2.save(path)


array([[-0.25162622,  0.58416   ,  0.34161314, ..., -0.9051944 ,
         0.07126287, -0.21723138],
       [-0.28473833,  0.5902117 ,  0.8460189 , ..., -0.33095843,
        -0.47118548,  0.47009197],
       [-0.28960198,  1.0041703 ,  0.23169023, ..., -0.64314526,
        -0.23541632,  0.28106663],
       ...,
       [-0.03950938, -0.24995442,  1.3851532 , ..., -0.31755453,
         0.48770684, -0.23471756],
       [ 0.06689067, -0.868989  ,  1.4205451 , ..., -0.2091172 ,
         0.09321782, -0.24408102],
       [ 0.11261851,  0.28545126,  1.4639302 , ...,  0.28329983,
        -0.5337196 , -1.0075337 ]], dtype=float32)