# Embeddings training

**Required files:**
 - event_df_clean = event specific dataframe with preprocessed text, use column 'text_coherent' for training
 - event_cands_merged = dataframe of candidates that are merged after 1st step


Import relevant packages for the following parts

In [74]:
#python libraries
import numpy as np
import pandas as pd
import os
import re
#import gensim

import time
from tqdm import tqdm

from collections import Counter, defaultdict

# self written modules
#import preprocessing

# storing python objects in the desired locations using pickle
import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)


## 1. Import the data

In [75]:
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_tigray_clean.csv" # location of Tigray dataset
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_clean.csv" # location of Greece dataset
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_rohingya_clean.csv" # location of Rohingya dataset
channel_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_channel_clean.csv" # location of Channel dataset

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
#event_df1 = read_event_df(tigray_url)
#event_df2 = read_event_df(rohingya_url)
#event_df3 = read_event_df(channel_url)
#event_df4 = read_event_df(greece_url)



## BERT embeddings

In [4]:
unique_tweets_df1 = preprocessing.fuzzy_duplicate_removal(event_df1)
unique_tweets_df2= preprocessing.fuzzy_duplicate_removal(event_df2)
unique_tweets_df3 = preprocessing.fuzzy_duplicate_removal(event_df3)
unique_tweets_df4 = preprocessing.fuzzy_duplicate_removal(event_df4)

Tweets at the start: 42853
Tweets after 100% duplicates removed: 42164
calculating similarities across documents...


955it [00:00, 9455.28it/s]

Similarity calculation completed in 149.39532446861267 seconds
removing fuzzy duplicates...


112872it [00:04, 26938.80it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


32831 tweets left after 70.0% similar tweets (by cosine similarity) removed
Tweets at the start: 29432
Tweets after 100% duplicates removed: 28820
calculating similarities across documents...


5806it [00:00, 29353.19it/s]

Similarity calculation completed in 59.32112240791321 seconds
removing fuzzy duplicates...


90075it [00:02, 34711.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


21154 tweets left after 70.0% similar tweets (by cosine similarity) removed
Tweets at the start: 173758
Tweets after 100% duplicates removed: 173339
calculating similarities across documents...


2947it [00:00, 29283.34it/s]

Similarity calculation completed in 2959.0516135692596 seconds
removing fuzzy duplicates...


240693it [00:06, 37092.65it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


162413 tweets left after 70.0% similar tweets (by cosine similarity) removed
Tweets at the start: 137462
Tweets after 100% duplicates removed: 135891
calculating similarities across documents...


2959it [00:00, 29297.49it/s]

Similarity calculation completed in 1541.3745939731598 seconds
removing fuzzy duplicates...


248357it [00:08, 29633.15it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


116533 tweets left after 70.0% similar tweets (by cosine similarity) removed


In [76]:
def train_embeddings(event_cands):
    from time import time
    from sentence_transformers import SentenceTransformer
    #sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    sbert_model = SentenceTransformer('paraphrase-mpnet-base-v2')

    bert_corpus = list(event_cands['entity'])

    print(f'there are {len(bert_corpus)} entities to be encoded')
    t0 = time()
    cands_embeddings = sbert_model.encode(bert_corpus)
    print(f'Training embeddings took {time()-t0} seconds')
    return cands_embeddings

In [73]:
#event_df = read_event_df(greece_url)
#unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
#tweet_sentences = [sent for tweet in event_df['text_alphanum'] for sent in sent_tokenize(tweet)]
event_cands = load_pickle('greece_ents')

document_embeddings = train_embeddings(event_cands)

pickle_file('greece_embeddings_ents', document_embeddings)

there are 2076 sentences to be encoded
Training embeddings took 46.14393377304077 seconds


In [74]:
#event_df = read_event_df(rohingya_url)
#unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
#tweet_sentences = [sent for tweet in event_df['text_alphanum'] for sent in sent_tokenize(tweet)]
event_cands = load_pickle('rohingya_ents')

document_embeddings = train_embeddings( event_cands)

pickle_file('rohingya_embeddings_ents', document_embeddings)

there are 939 sentences to be encoded
Training embeddings took 27.10666036605835 seconds


In [75]:
#event_df = read_event_df(tigray_url)
#unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
#tweet_sentences = [sent for tweet in event_df['text_alphanum'] for sent in sent_tokenize(tweet)]
event_cands = load_pickle('tigray_ents')

document_embeddings = train_embeddings(event_cands)

pickle_file('tigray_embeddings_ents', document_embeddings)

there are 1125 sentences to be encoded
Training embeddings took 36.669280767440796 seconds


In [77]:
#event_df = read_event_df(channel_url)
#unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
#tweet_sentences = [sent for tweet in event_df['text_alphanum'] for sent in sent_tokenize(tweet)]
event_cands = load_pickle('channel_ents')

document_embeddings = train_embeddings(event_cands)

pickle_file('channel_embeddings_ents', document_embeddings)

there are 2679 entities to be encoded
Training embeddings took 45.38644218444824 seconds


In [11]:
import winsound
duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)

## Create dataframe with similarities

In [91]:
from sklearn.metrics.pairwise import cosine_similarity

def create_sim_df(cand_df,cand_embeddings,sim_threshold=0.6):
    
    rows_list = []
    sims = cosine_similarity(cand_embeddings)
    
    for up_cand_id in tqdm(range(len(cand_df['entity']))):
        for low_cand_id in range(up_cand_id+1,len(cand_df)):
            dict1 = {}
            # get input row in dictionary format
            # key = col_name
            if sims[up_cand_id][low_cand_id]>sim_threshold:
                dict1.update({'text': cand_df['entity'][up_cand_id], 
                              'text_to_compare':cand_df['entity'][low_cand_id], 
                              'sim':sims[up_cand_id][low_cand_id]}) 
                rows_list.append(dict1)

    sim_df = pd.DataFrame(rows_list)
    return sim_df

def create_sim_df(event_name, entity_type = 'anns', sim_threshold=0.6):
    event_entities = load_pickle(f'{event_name}_{entity_type}')
    cand_embeddings = load_pickle(f'{event_name}_embeddings_{entity_type}')
    rows_list = []
    sims = cosine_similarity(cand_embeddings)
    
    for up_cand_id in tqdm(range(len(event_entities['entity']))):
        for low_cand_id in range(up_cand_id+1,len(event_entities)):
            dict1 = {}
            # get input row in dictionary format
            # key = col_name
            if sims[up_cand_id][low_cand_id]>sim_threshold:
                dict1.update({'text': event_entities['entity'][up_cand_id], 
                              'text_to_compare':event_entities['entity'][low_cand_id], 
                              'sim':sims[up_cand_id][low_cand_id]}) 
                rows_list.append(dict1)

    sim_df = pd.DataFrame(rows_list)
    return sim_df

In [92]:
sim_df = create_sim_df('rohingya')
pickle_file('rohingya_sim_df_anns',sim_df)

100%|██████████████████████████████████████████████████████████████████████████████| 5120/5120 [01:51<00:00, 45.76it/s]


In [93]:
sim_df = create_sim_df('rohingya',entity_type='ents')
pickle_file('rohingya_sim_df_ents',sim_df)

100%|██████████████████████████████████████████████████████████████████████████████| 9470/9470 [05:26<00:00, 29.02it/s]


In [95]:
sim_df = load_pickle('greece_sim_df')
event_cands = load_pickle('greece_cands_after1')

sim_df_anns = load_pickle('greece_sim_df_anns')
event_cands_anns = load_pickle('greece_anns')

sim_df_ents = load_pickle('greece_sim_df_ents')
event_cands_ents = load_pickle('greece_ents')

In [97]:
def make_sim_df_freq(sim_df, event_cands):
    #get frequency of of text column
    sim_df_freq_ = sim_df.merge(event_cands[['entity','freq']],left_on='text',right_on='entity')
    sim_df_freq_ = sim_df_freq_[sim_df_freq_.freq>5]
    #get frequency of of text_to_compare columns
    sim_df_freq = sim_df_freq_.merge(event_cands[['entity','freq']],left_on='text_to_compare',right_on='entity')
    sim_df_freq = sim_df_freq[sim_df_freq.freq_y>5]
    
    sim_df_freq.drop(['entity_x','entity_y'],axis=1,inplace=True)
    return sim_df_freq

sim_df_freq_ents = make_sim_df_freq(sim_df_ents,event_cands_ents)
sim_df_freq_anns = make_sim_df_freq(sim_df_anns,event_cands_anns)


def make_sim_df_freq(sim_df, event_cands):
    #get frequency of of text column
    sim_df_freq_ = sim_df.merge(event_cands[['cand_text','cand_freq']],left_on='text',right_on='cand_text')
    sim_df_freq_ = sim_df_freq_[sim_df_freq_.cand_freq>5]
    #get frequency of of text_to_compare columns
    sim_df_freq = sim_df_freq_.merge(event_cands[['cand_text','cand_freq']],left_on='text_to_compare',right_on='cand_text')
    sim_df_freq = sim_df_freq[sim_df_freq.cand_freq_y>5]
    
    sim_df_freq.drop(['cand_text_x','cand_text_y'],axis=1,inplace=True)
    return sim_df_freq

sim_df_freq = make_sim_df_freq(sim_df,event_cands)

In [116]:
sim_df_freq_anns.sort_values('cand_freq_x',ascending=False)['text'].unique()[:20]

array(['turkey', 'greece', 'europe', 'eu', 'syria', 'erdogan', 'idlib',
       'russia', 'uk', 'assad', 'germany', 'nato', 'turkish', 'us',
       'lesbos', 'syrians', 'syrian', 'iran', 'putin', 'bulgaria'],
      dtype=object)

In [156]:
def print_info(sim_df, entity,sim=0.7):
    merged = list(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)].sort_values('sim',ascending=False)['text_to_compare'])
    #text_freq = max(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)]['cand_freq_x'])
    #text_to_compare_freq = sum(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)]['cand_freq_y'])
    #print(f'{entity} mentions: {text_freq}. After merging: {text_freq+text_to_compare_freq}')
    #print(merged)
    return merged

In [103]:
sim_df_freq_ents['cand_freq_x'] = sim_df_freq_ents['freq_x']
sim_df_freq_ents['cand_freq_y'] = sim_df_freq_ents['freq_y']
sim_df_freq_anns['cand_freq_x'] = sim_df_freq_anns['freq_x']
sim_df_freq_anns['cand_freq_y'] = sim_df_freq_anns['freq_y']

In [144]:
entity = 'greece'
sim = 0.8

print_info(sim_df_freq_ents, entity,sim)
print_info(sim_df_freq_anns, entity,sim)
print_info(sim_df_freq,entity,sim)

greece mentions: 30547. After merging: 51301
['greece &', 'greeces', 'greek islands', 'greece border', 'greek island', 'greek coast', 'the greek islands', 'the greek island', 'greeces border', 'greek islanders', 'greek border', 'greek embassy', 'greeks', 'greek borders', 'greek government', 'greek', 'greek aegean', 'athens', 'the greek embassy', 'greek pm', 'the greek border', 'greece & europe', 'greek forces', 'greece & eu', 'hellenic', 'greek consulates', 'greek city times', 'greek navy', 'thessaloniki', 'greek army', 'the greek borders', 'greek coast guards', 'greek coastguard', 'greek - european', 'greek security forces']
greece mentions: 31752. After merging: 33191
['greeces', 'greece europe', 'greek islands', 'greece border', 'lesvos greece', 'greek island', 'lesbos greece', 'greek state', 'greek islanders', 'greek border', 'europe greece', 'greek embassy', 'greece news', 'greek aegean islands', 'greek government', 'greece army', 'greek', 'greek aegean', 'greek patriots', 'athens

In [None]:
entity = 'eu'
sim = 0.85

merged_entities = set()
for merged in print_info(sim_df_freq_ents, entity,sim):
    merged_entities.update(print_info(sim_df_freq_ents, merged, sim))
    
merged_entities