# Candidate alignment (merging) 

**Necessary files:**

 - event_cands = dataframe with candidates and their meta information
 - model = embeddings model trained on the event dataset (for the second merging step)
 

In [3]:
from merging_steps import merging_step1, merging_step2, merging_step3, merging_step4
from merging_steps import merge_indices, merge_transitively
import stanza  
import pandas as pd

from nltk.tokenize import word_tokenize
import pickle
import os
import re
from tqdm.notebook import tqdm
from collections import Counter, defaultdict


# functions pickle_file and load_pickle merely help with storing pickled files in the event folders on drive
def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)

### Merging step 1
  - In the first merging step, we merge two candidates if the head of each of their representative phrase is identical by string comparison. Therefore, we merge candidates if the cand_head column value is equal. We keep the first row from top and sum the cand_freq column to keep count of all merged candidates.
  - sort by frequency after grouping


In [4]:
# instantiate stanza pipeline to find head of each entity
en_nlp = stanza.Pipeline("en",  
                          tokenize_pretokenized=False,
                          ner_batch_size=4096,
                          processors = "tokenize,pos,lemma,depparse,ner",
                          verbose=False)

In [5]:
def get_tweet_tags(tagged_tweets):
    """
    Input: pre-tagged tweets
    Output: list of lists containing named entities
    """
    tweet_entities=[]
    excluded_tags = ['CARDINAL', 'DATE', 'QUANTITY', 'TIME', 'PERCENT', 'MONEY', 'ORDINAL']
    for tweet in tqdm(range(len(tagged_tweets))):
            tweet_ner= [ent.text for sent in tagged_tweets[tweet].sentences for ent in sent.ents if ent.type not in excluded_tags]
            tweet_entities.append(tweet_ner)
    return tweet_entities  

def get_ent_head(entity):
    if len(word_tokenize(entity))>1:
        entity_doc = en_nlp(entity)
        #the root of NP has value 0. Since head is only one and stored in a list, we pick item [0]
        return [word.text for tweet_ent in entity_doc.sentences for word in tweet_ent.words if word.head == 0][0]
    else:
        return entity

def process_entity(entity):
    # re.sub removes all non alpanumeric characters (and lower() lowercases)
    # " ".join and split removes the consecutive whitespaces created by replacing characters
    entity = re.sub('[^0-9a-zA-Z.!?]+', ' ', entity.lower())
    entity = re.sub('[.]+', ' . ', entity)
    entity = re.sub('[.]+', ' ? ', entity)
    entity = re.sub('[.]+', ' ! ', entity)
    return " ".join(entity.split())

def create_entities_df(tagged_tweets):
    
    event_entities = get_tweet_tags(tagged_tweets)
    
    list_of_entities = list()
    for tweet_entities in tqdm(event_entities):
        list_of_entities.append(tweet_entities)

    entity_list = [process_entity(entity) for tweet in list_of_entities for entity in tweet]
    
    #create the count dictionary which will be converted into a df
    counted_entities = Counter(entity_list)
    ent_df = pd.DataFrame(counted_entities.items(),columns=['entity','freq']).sort_values('freq',ascending=False)
    #filter out entities shorter than 2 characters and frequency less than 5
    ent_df = ent_df[ent_df.freq>=5]
    ent_df['len'] = ent_df.entity.apply(lambda x: len(x))
    ent_df = ent_df[ent_df['len']>=2]
    #find head of the entity
    tqdm.pandas()
    ent_df['head'] = ent_df.entity.progress_apply(get_ent_head)
    ent_df.reset_index(drop=True,inplace=True)
    return ent_df[['entity','freq','head']]

def merging_step1(event_name):
    print(f'loading {event_name} tagged tweets...')
    tagged_tweets = load_pickle(f'{event_name}_tagged_tweets')
    ent_df = create_entities_df(tagged_tweets)
    pickle_file(f'{event_name}_ents',ent_df)
    return ent_df


In [19]:
ent_df = merging_step1('rohingya')

loading rohingya tagged tweets...


  0%|          | 0/29432 [00:00<?, ?it/s]

  0%|          | 0/29432 [00:00<?, ?it/s]

  0%|          | 0/939 [00:00<?, ?it/s]

In [20]:
ent_df = merging_step1('tigray')

loading tigray tagged tweets...


  0%|          | 0/42853 [00:00<?, ?it/s]

  0%|          | 0/42853 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

In [21]:
ent_df = merging_step1('greece')

loading greece tagged tweets...


  0%|          | 0/137462 [00:00<?, ?it/s]

  0%|          | 0/137462 [00:00<?, ?it/s]

  0%|          | 0/2076 [00:00<?, ?it/s]

In [234]:
ent_df = merging_step1('channel')

loading channel tagged tweets...


  0%|          | 0/173758 [00:00<?, ?it/s]

  0%|          | 0/173758 [00:00<?, ?it/s]

  0%|          | 0/2679 [00:00<?, ?it/s]

#### If twitter annotations are used

In [13]:
from ast import literal_eval

greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_clean.csv" # for Greece
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_tigray_clean.csv" # for Tigray
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_rohingya_clean.csv" # for Rohingya
channel_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_channel_clean.csv" # for channel

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0,converters={"annotations": literal_eval})
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
#event_df = read_event_df(greece_url)

In [128]:
from collections import Counter

event_df = read_event_df(channel_url)
event_cands = list([ann for tweet in event_df['annotations']for ann in tweet])
counted_cands = Counter(event_cands)
#event_cands_merged = pd.Series(list(set(event_cands)))
#print(counted_cands)
event_cands_merged = pd.DataFrame(counted_cands.items(),columns=['entity','freq'])
event_cands_merged = event_cands_merged[event_cands_merged.entity!=''].sort_values('freq',ascending=False)
event_cands_merged = event_cands_merged[event_cands_merged.freq>=5]
event_cands_merged.reset_index(inplace=True, drop=True)
pickle_file('channel_anns',event_cands_merged)

loaded 173758 tweets!


#### If noun phrases candidate dataframe is used

In [32]:
def merging_step1(event_cands):
    event_cands_merged = event_cands.groupby(['cand_head'],sort=False,as_index=False).agg({
        'candidates': 'first', 
        'cand_text': 'first',
        'cand_type': 'first',
        'cand_freq':'sum'}).sort_values(by=['cand_freq'],ascending=False).reset_index(drop=True)
    print(f'there are {len(event_cands_merged)} candidates after 1st merging step')
    return event_cands_merged

## Merging step 2: Similarity using Sentence BERT embeddings

In [1]:
from sklearn.metrics.pairwise import cosine_similarity

def create_sim_df(event_name, entity_type = 'ents', sim_threshold=0.6):
    event_entities = load_pickle(f'{event_name}_{entity_type}')
    cand_embeddings = load_pickle(f'{event_name}_embeddings_{entity_type}')
    rows_list = []
    sims = cosine_similarity(cand_embeddings)
    
    #for up_cand_id in tqdm(range(len(event_entities['entity']))):
        #for low_cand_id in range(up_cand_id+1,len(event_entities)):
    for up_cand_id in tqdm(range(len(event_entities))):
        for low_cand_id in range(len(event_entities)):
            dict1 = {}
            # get input row in dictionary format
            # key = col_name
            if sims[up_cand_id][low_cand_id]>sim_threshold:
                dict1.update({'text': event_entities['entity'][up_cand_id], 
                              'text_to_compare':event_entities['entity'][low_cand_id], 
                              'sim':sims[up_cand_id][low_cand_id]}) 
                rows_list.append(dict1)

    sim_df = pd.DataFrame(rows_list)
    return sim_df

In [195]:
sim_df = create_sim_df('greece')
pickle_file('greece_sim_df_ents',sim_df)

  0%|          | 0/2076 [00:00<?, ?it/s]

In [214]:
sim_df = create_sim_df('rohingya')
pickle_file('rohingya_sim_df_ents',sim_df)

  0%|          | 0/939 [00:00<?, ?it/s]

In [67]:
sim_df = create_sim_df('tigray')
pickle_file('tigray_sim_df_ents',sim_df)

  0%|          | 0/1125 [00:00<?, ?it/s]

In [236]:
sim_df = create_sim_df('channel')
pickle_file('channel_sim_df_ents',sim_df)

  0%|          | 0/2679 [00:00<?, ?it/s]

In [77]:
sim_df[sim_df.text=='junta'].sort_values('sim',ascending=False)[:20]

                                        

Unnamed: 0,text,text_to_compare,sim
3026,junta,junta,1.0
3028,junta,juntas,0.891388
3027,junta,tplf junta,0.682311


In [41]:
def flatten(list_of_mixed_types):
    # helper function to convert a list of mixed types (strings and lists) into a list
    flat_list = []
    for element in list_of_mixed_types:
        if isinstance(element,list): 
            flat_list.extend(flatten(element))
        else: 
            flat_list.append(element)
    return flat_list

def align_ents(what_merged):
    
    merged_dict = defaultdict(list)
    merged_ent = set()
    
    for key,value in tqdm(what_merged.items()):
        merged_ent.update(what_merged[key])
        for item in value:
            merged_dict[key].extend(what_merged[key])
            if item in what_merged.keys():
                merged_dict[key].extend(what_merged[item])
                merged_dict[key] = list(set(merged_dict[key]))

        #print('-------loop------')
        for ent,merged in what_merged.items():
            for item2 in merged:
                if item2 in what_merged[key] and item2 in what_merged[ent] and key!=ent:
                    merged_dict[key].extend(what_merged[key] + what_merged[ent])
                    merged_dict[key].append(ent)
                    merged_ent.add(ent)
        merged_dict[key] = list(set(merged_dict[key]))  
        #merged_dict[key].pop(merged_dict[key].index(key))
    return merged_dict

def finalize_ents(merged_dict):
    final_ents = defaultdict()
    check_set = set()
    for key,value in merged_dict.items():
        if key not in check_set:
            final_ents[key] = merged_dict[key]
            check_set.update(merged_dict[key])
    
    for key in final_ents.keys():
        if key in final_ents[key]:
            final_ents[key].pop(final_ents[key].index(key))
            
    # add manual terms to the dict so (refugees and asyslum seekers) and (migrants and immigrants) are together
    final_ents['refugees'] = ['refugee','asylum seeker','asylum seekers']
    final_ents['migrants'] = ['migrant','immigrant','immigrants']
    return final_ents


def merging_step2(entities,sim_df,low_threshold=0.8, high_threshold =0.9, transitive=False): 
    
    what_merged = defaultdict(list)
    merged_ents = set()

    #merge the dataframes so we have information about frequencies of candidates in sim_df
    # use outer join to include the candidates that are not similar to any other candidate (thus not in sim_df)
    merged_df_ = pd.merge(sim_df,entities[entities.freq>=5], how='outer', left_on='text', right_on='entity')
    merged_df = pd.merge(merged_df_,entities[entities.freq>=5], how='outer', left_on='text_to_compare', right_on='entity')
    
    #blank fields in sim_df's text are the candidates that are dissimilar to the rest, pass the value from cand_text
    merged_df.entity_x.fillna(merged_df.entity_y, inplace=True) 
    merged_df.freq_x.fillna(merged_df.freq_y, inplace=True) 
    
    merged_df.drop(['text','text_to_compare'],axis=1,inplace=True)

    # and select only the rows above the lower threshold, so we do not have to filter by it in the loop
    merged_df_small = merged_df[merged_df.sim>low_threshold]

    print(f'finding merged entities...')
    # create an object to iterate over unique candidates
    unique_merged_df = merged_df.groupby(by=['entity_x'], sort = False, as_index=False).agg({'freq_x':'max'}).sort_values('freq_x',ascending=False)

    def get_merged_entities(entity,transitive=False):
        """
        Find which candidates merged into the candidate
        """
        lookup_df = pd.DataFrame({'entity_x': [], 'entity_y':[]})

        if entity not in merged_ents:
            if transitive == False:
                lookup_df = merged_df_small[merged_df_small.entity_x==entity]

                #what_merged[cand].extend(list(lookup_df['candidates']))
                merged_ents.update(list(lookup_df['entity_y']))
            else:
                lookup_df = merged_df_small[(merged_df_small.entity_x==entity) & (merged_df_small.sim>high_threshold)]

            #what_merged[cand].extend(list(lookup_df['candidates']))
            #merged_ents.update(list(lookup_df['entity_y']))
        
        #clean nans 
        lookup_df = lookup_df[~lookup_df['entity_y'].isnull()]
        return list(lookup_df['entity_y'])
    
    tqdm.pandas() 
    unique_merged_df['merged'] =  unique_merged_df.entity_x.progress_apply(get_merged_entities)
    #print(list(unique_merged_df['merged']))
    #convert the dataframe into a dictionary
    for entity,merged_list in tqdm(zip(unique_merged_df['entity_x'],unique_merged_df['merged'])):
        what_merged[entity] = merged_list.copy()
        if transitive:
            for merged_ent in merged_list:
                what_merged[entity].append(get_merged_entities(merged_ent,transitive=True))
        what_merged[entity] = list(set(flatten(what_merged[entity]) ) ) 
    
    return what_merged




In [42]:
def run_merging_step2(event_name,entity_type='ents',low_threshold=0.8, high_threshold = 0.9, transitive=True):
    sim_df = load_pickle(f'{event_name}_sim_df_{entity_type}')
    entities = load_pickle(f'{event_name}_{entity_type}')

    what_merged = merging_step2(entities,
                                sim_df,
                                low_threshold=low_threshold, 
                                high_threshold=high_threshold,
                                transitive=transitive)
    
    if transitive:
        what_merged = align_ents(what_merged)
    
    if event_name == 'channel' or event_name == 'greece':
        what_merged['germany'].pop(what_merged['germany'].index('austria'))
        what_merged['germany'].pop(what_merged['germany'].index('austrian'))
    
    if event_name == 'greece':
        what_merged['russia'].pop(what_merged['russia'].index('ukraine'))
            
    what_merged = finalize_ents(what_merged)
    #pickle_file(f'{event_name}_final_{entity_type}',final_ents)
    pickle_file(f'{event_name}_what_merged',what_merged)
    return  what_merged #final_ents,

In [78]:
merged_dict = run_merging_step2('tigray',low_threshold=0.7,high_threshold = 0.9, transitive=False)

finding merged entities...


  0%|          | 0/1125 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [215]:
merged_dict = run_merging_step2('rohingya',low_threshold=0.7,high_threshold = 0.9,transitive=False)

finding merged entities...


  0%|          | 0/939 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [252]:
merged_dict = run_merging_step2('greece',low_threshold=0.7, high_threshold = 0.9,transitive=False)

finding merged entities...


  0%|          | 0/2076 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [237]:
merged_dict = run_merging_step2('channel',low_threshold=0.7,high_threshold=0.85, transitive=False)

finding merged entities...


  0%|          | 0/2679 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [79]:
print(list(merged_dict.keys())[:20])

['eritrean', 'ethiopia', 'tigray', 'sudan', 'un', 'tplf', 'biden', 'antony blinken', 'abiy ahmed ali', 'amhara', 'un refugee agency', 'kamala harris', 'us', 'africa', 'eu', 'amnesty international', 'yemen', 'uk', 'israel', 'shimelba']


In [80]:
print(list(merged_dict['turkey']))

['turkish']


In [238]:
sim_df[sim_df.text=='english'].sort_values('sim',ascending=False)[:40]

Unnamed: 0,text,text_to_compare,sim
476,english,english,1.0
483,english,englishmen,0.771397
479,english,anglo saxon,0.698635
482,english,englishman,0.693618
480,english,anglo,0.654551
481,english,latin,0.630546
484,english,uk french,0.60946
478,english,english channel,0.609256
477,english,england,0.606441


### Find frequencies

In [197]:
from collections import defaultdict
from itertools import combinations

def get_frequencies(what_merged, event_df):
    """
    in part 1: get frequencies of each item in what_merged dictionary from the corpus
    
    in part 2: correct frequencies for repeating terms
    """
    #~~~~ part 1 ~~~~
    freq_dict = defaultdict(list)
    for key in tqdm(what_merged.keys()):
        freq_dict2 = dict()
        c = sum(event_df['text_alphanum'].apply(lambda x: x.count(f' {key} ')))
        freq_dict[key].append(c)
        for item in what_merged[key]:
            c2 = sum(event_df['text_alphanum'].apply(lambda x: x.count(f' {item} ')))
            freq_dict2[item]=c2
        freq_dict[key].append(freq_dict2)
    
    #~~~~ part 2 ~~~~
    
    # Instantiate dict to store individual count for entity and merged count
    freq_counts = defaultdict(dict)
    
    # Iterate over entities
    for ent in freq_dict.keys():
        
        #print(f"Ent: {ent}")
        adj_freq = 0
        
        # Store number of occurences of root entity
        root_freq = freq_dict[ent][0]
        
        # Instantiate empty list where entities are stored that have already been counted
        roots = []
        
        # Integrate main entity into dict of merged entities
        ent_dict = freq_dict[ent][1]
        ent_dict[ent] = freq_dict[ent][0]
        
        # Iterate over merged entities based on their length (in term of tokens) in ascending order
        for merged_ent in sorted(ent_dict.keys(), key = len):
            #print(f"Merged Ent: {merged_ent}")
            
            # Create all possible sub-combinations of entity while keeping the order constant
            # For example: "Greece Turkey Border" yields Greece, Turkey, Border, Greece Turkey, Turkey Border, Greece Turkey Border
            merged_ent_list = merged_ent.split(" ")
            all_slices = [merged_ent_list[s:e] for s, e in combinations(range(len(merged_ent_list)+1), 2)]
            all_slices_strings = [" ".join(l) for l in all_slices]
            
            # Iterate over all entities that have already been counted
            for root in roots:
                
                # Check if any of the substring of current entity has already been counted, if yes don't count it again
                if root in all_slices:
                    #print(f"{merged_ent} was removed to prevent double count. {root} has already been counted")
                    break
            
            # If entity has not been counted, count it and add entity to counted entities
            else:
                roots.append([merged_ent])
                #print(roots)
                adj_freq += ent_dict[merged_ent]
                #print(f"Frequency incremented by {ent_dict[merged_ent]}")
        
        # Update frequency dict
        freq_counts[ent]["Root"] = root_freq
        freq_counts[ent]["Overall"] = adj_freq


    return freq_dict,freq_counts

In [216]:
event_df = read_event_df(rohingya_url)
what_merged = load_pickle('rohingya_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)

loaded 29432 tweets!


  0%|          | 0/553 [00:00<?, ?it/s]

In [191]:
event_df = read_event_df(tigray_url)
what_merged = load_pickle('tigray_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)
#freq_dict

loaded 42853 tweets!


  0%|          | 0/654 [00:00<?, ?it/s]

Ent: eritrean
Ent: ethiopia
Ent: tigray
Ent: sudan
Ent: un
Ent: tplf
Ent: biden
Ent: antony blinken
Ent: abiy ahmed ali
Ent: amhara
Ent: un refugee agency
Ent: kamala harris
Ent: us
Ent: africa
Ent: eu
Ent: amnesty international
Ent: yemen
Ent: uk
Ent: israel
Ent: shimelba
Ent: justin trudeau
Ent: cnn
Ent: kenya
Ent: filippo grandi
Ent: hitsats
Ent: the united nations
Ent: hamdayet
Ent: mai kadra
Ent: u ? s ?
Ent: tegaru
Ent: marc garneau
Ent: somalia
Ent: canada
Ent: josep borrell
Ent: houthi
Ent: bbc news
Ent: shire
Ent: english
Ent: syria
Ent: axum
Ent: libya
Ent: egypt
Ent: geneva
Ent: michelle bachelet
Ent: oromo
Ent: isaias
Ent: ali
Ent: al jazeera
Ent: addis
Ent: ant nio guterres
Ent: mekelle
Ent: gedaref
Ent: uganda
Ent: nrc
Ent: sanaa
Ent: france
Ent: reuters
Ent: western
Ent: addis ababa
Ent: norway
Ent: covid 19
Ent: linda thomas
Ent: italy
Ent: mekele
Ent: djibouti
Ent: world bank
Ent: adi harush
Ent: chris coons
Ent: martin plaut
Ent: ethiopian human rights commission
Ent:

In [253]:
event_df = read_event_df(greece_url)
what_merged = load_pickle('greece_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)
#freq_dict

loaded 137462 tweets!


  0%|          | 0/1199 [00:00<?, ?it/s]

In [239]:
event_df = read_event_df(channel_url)
what_merged = load_pickle('channel_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)
#freq_dict

loaded 173758 tweets!


  0%|          | 0/1578 [00:00<?, ?it/s]

In [240]:
print(list(total_freq.keys())[:30])

['uk', 'france', 'eu', 'priti patel', 'english', 'farage', 'germany', 'bbc', 'calais', 'brexit', 'channel', 'nigel farage', 'dover', 'greece', 'us', 'turkey', 'spain', 'kent', 'italy', 'syria', 'boris', 'un', 'tory', 'london', 'africa', 'dublin', 'muslim', 'libya', 'boris johnson', 'navy']


In [250]:
word= 'brexit'
#print(dict(sorted(freq_dict.items(), key=lambda item: item[1][0],reverse=True))[word][0])
print(total_freq[word])
print(list(dict(sorted(freq_dict[word][1].items(), key=lambda item: item[1],reverse=True)).keys()))
#dict(sorted(freq_dict[word][1].items(), key=lambda item: item[1],reverse=True))

{'Root': 9861, 'Overall': 10973}
['brexit', 'brexiteers', 'brexiters', 'brexit party', 'brexiteer', 'brexit britain', 'the brexit party', 'brexiter', 'brexit uk', 'brexshit', 'brexity', 'brexit france', 'tory brexiteer', 'brexit covid', 'brexidiots', 'brexit england', 'brexiteer mp', 'brexitland']


In [256]:
freq_dict['russia']

[3773,
 {'russian': 1250,
  'ussr': 15,
  'russians': 234,
  'russian mod': 7,
  'russia turkey': 91,
  'soviets': 16,
  'syrian russian': 28,
  'putins': 79,
  'moscow': 136,
  'russias': 52,
  'russia iran': 111,
  'putin': 1415,
  'soviet union': 13,
  'rus': 19,
  'turkish russian': 19,
  'ukraine': 88,
  'soviet': 52,
  'russian syrian': 28,
  'russia': 3773}]

In [194]:
dict(sorted(total_freq.items(), key=lambda item: item[1]['Overall'],reverse=True))



{'refugees': {'Root': 21145, 'Overall': 32942},
 'tigray': {'Root': 26578, 'Overall': 29993},
 'un refugee agency': {'Root': 2761, 'Overall': 28798},
 'eritrean': {'Root': 13582, 'Overall': 20870},
 'ethiopia': {'Root': 9596, 'Overall': 16723},
 'un': {'Root': 7757, 'Overall': 15617},
 'tplf': {'Root': 7764, 'Overall': 7883},
 'sudan': {'Root': 6711, 'Overall': 7395},
 'genocide': {'Root': 5287, 'Overall': 5287},
 'who': {'Root': 5072, 'Overall': 5072},
 'displaced': {'Root': 4851, 'Overall': 4851},
 'the united nations': {'Root': 439, 'Overall': 4572},
 'abiy ahmed ali': {'Root': 1420, 'Overall': 4309},
 'united': {'Root': 3783, 'Overall': 3783},
 'western tigray zone': {'Root': 14, 'Overall': 3418},
 'abi adi': {'Root': 21, 'Overall': 3282},
 'agency': {'Root': 3251, 'Overall': 3251},
 'biden': {'Root': 3086, 'Overall': 3086},
 'abraha': {'Root': 19, 'Overall': 3001},
 'amhara': {'Root': 2626, 'Overall': 2930},
 'abyi': {'Root': 55, 'Overall': 2893},
 'eu': {'Root': 1069, 'Overall': 

In [115]:
freq_dict['sudan']

[6711,
 {'south sudanese': 44,
  'sudan refugee': 388,
  'sudans': 79,
  'sudan border town': 46,
  'sudanese': 567,
  'south sudan': 578,
  'sudanpmhamdok': 38,
  's sudan': 19,
  'eastern sudan': 180,
  'unhcr sudan': 47,
  'east sudan': 31}]

### To quickly check what merged and play with thresholds

In [3]:
def make_sim_df_freq(sim_df, event_cands):
    #get frequency of of text column
    sim_df_freq_ = sim_df.merge(event_cands[['entity','freq']],left_on='text',right_on='entity')
    sim_df_freq_ = sim_df_freq_[sim_df_freq_.freq>5]
    #get frequency of of text_to_compare columns
    sim_df_freq = sim_df_freq_.merge(event_cands[['entity','freq']],left_on='text_to_compare',right_on='entity')
    sim_df_freq = sim_df_freq[sim_df_freq.freq_y>5]
    
    sim_df_freq.drop(['entity_x','entity_y'],axis=1,inplace=True)
    return sim_df_freq

def print_info(sim_df, entity,sim=0.7):
    merged = list(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)].sort_values('sim',ascending=False)['text_to_compare'])
    #text_freq = max(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)]['cand_freq_x'])
    #text_to_compare_freq = sum(sim_df[(sim_df['sim']>sim) & (sim_df['text']==entity)]['cand_freq_y'])
    #print(f'{entity} mentions: {text_freq}. After merging: {text_freq+text_to_compare_freq}')
    #print(merged)
    return merged

In [6]:
sim_df_freq_anns.sort_values('freq_x',ascending=False)['text'].unique()[:20]

array(['turkey', 'greece', 'europe', 'eu', 'syria', 'erdogan', 'idlib',
       'russia', 'uk', 'assad', 'germany', 'nato', 'turkish', 'us',
       'lesbos', 'syrians', 'syrian', 'iran', 'putin', 'bulgaria'],
      dtype=object)

In [32]:
sim_df = load_pickle('greece_sim_df_anns')
entities = load_pickle('greece_anns')

sim_df_freq_anns= make_sim_df_freq(sim_df,entities)

NameError: name 'make_sim_df_freq' is not defined

In [105]:
sim_df

Unnamed: 0,text,text_to_compare,sim
0,turkey,erdogan,0.809029
1,turkey,turkish,0.880714
2,turkey,ankara,0.790253
3,turkey,erdoğan,0.809029
4,turkey,turks,0.850284
...,...,...,...
28447336,allah ekber,yahyah farroukh,0.795914
28447337,malcom,venezuala,0.673318
28447338,malcom,yahyah farroukh,0.691370
28447339,the island,venezuala,0.677538


In [92]:
entity = 'assad'
sim = 0.85

print_info(sim_df_freq_anns, entity,sim)

['mr assad',
 'al-assad',
 'abu emad',
 'erdo',
 'arash yousufi',
 'alqaeda',
 'essad',
 'haftar',
 'quatar',
 'zaatari',
 'afrin',
 'khaled',
 'naamat',
 'ahmet',
 'hatay',
 'afgans',
 'syriza',
 'iman',
 'ertogan',
 'bashar assad',
 'javid',
 'anadolu',
 'afd',
 'hayat tahrir',
 'belal khaled',
 'aq',
 'asad',
 'afgani',
 'ulay',
 'maaret',
 'majed',
 'yarmouk',
 'afp',
 'khaled heeba',
 'esad']

In [103]:
entity = 'turkey'
sim = 0.85

merged_entities = set()
merged_entities.update(print_info(sim_df_freq_anns, entity,sim))
for merged in print_info(sim_df_freq_anns, entity,sim):
    merged_entities.update(print_info(sim_df_freq_anns, merged, sim=0.9))
    
merged_entities

{'al turkey',
 'eu turkey',
 'eu-turkey',
 'istanbul turkey',
 'nw turkey',
 'republic of turkey',
 'turkey border',
 'turkey erdogan',
 'turkey news',
 'turkey turkey',
 'turkish',
 'turkish border',
 'turkish coast',
 'turkish gov',
 'turkish greek',
 'turkish republic',
 'turkish state',
 'turkish-eu',
 'turks'}

#### If noun phrases candidate dataframe is used

In [11]:
def merging_step2(event_cands, sim_df,low_threshold = 0.8, high_threshold = 0.9, method='misc'):

    what_merged = defaultdict(list)
    merged_cands = set()

    #merge the dataframes so we have information about frequencies of candidates in sim_df
    # use outer join to include the candidates that are not similar to any other candidate (thus not in sim_df)
    merged_df = pd.merge(sim_df,event_cands, how='outer', left_on='text_to_compare', right_on='cand_text')
    merged_df = merged_df[~merged_df['cand_freq'].isnull()] # if there are any extra records in sim_df, remove them
    
    #blank fields in sim_df's text are the candidates that are dissimilar to the rest, pass the value from cand_text
    merged_df.text.fillna(merged_df.cand_text, inplace=True) 

    #in general, short words tend to be similar to much more words than long ones, setting threshold for length can help
    merged_df['cand_len'] = merged_df['text'].apply(lambda cand: max([len(word) for word in cand.split()]))
    
    # and select only the rows above the lower threshold, so we do not have to filter by it in the loop
    merged_df_small = merged_df[merged_df.sim>low_threshold]
    
    print(f'finding merged candidates...')
    # create an object to iterate over unique candidates
    unique_merged_df = merged_df.groupby(by=['text'], sort = False, as_index=False).agg({'cand_len':'first'})
    
    def get_merged_candidates(cand):
        """
        Find which candidates merged into the candidate
        """
        lookup_df = pd.DataFrame({'candidates': [], 'text_to_compare':[]})
        if cand not in merged_cands:
            lookup_df_full = merged_df_small[merged_df_small.text==cand]
            if method == 'len':
                # use bound of 5 letters for different thresholds to be applied
                lookup_df_long = lookup_df_full[lookup_df_full.cand_len>5]
                lookup_df_short = lookup_df_full[(lookup_df_full.sim>high_threshold) & 
                                                  (lookup_df_full.cand_len<=5)]                  
            else:
                # apply misc to apply different similarities to match
                lookup_df_long = lookup_df_full[lookup_df_full.cand_type!='misc']
                lookup_df_short = lookup_df_full[(lookup_df_full.sim>high_threshold) & 
                                                  (lookup_df_full.cand_type=='misc')] 

            lookup_df = pd.concat([lookup_df_long, lookup_df_short],ignore_index=True)
            #what_merged[cand].extend(list(lookup_df['candidates']))
            merged_cands.update(list(lookup_df['text_to_compare']))

        return list(lookup_df['candidates'])
    
    tqdm.pandas() 
    unique_merged_df['merged'] =  unique_merged_df.text.progress_apply(get_merged_candidates) 
    
    #convert the dataframe into a dictionary
    for cand,merged_list in zip(unique_merged_df['text'],unique_merged_df['merged']):
        what_merged[cand] = merged_list
        for merged_ent in merged_list:
            what_merged[cand].append(get_merged_candidates(merged_ent)
    
    if method == 'len':
        merged_df_lowt = merged_df_small[merged_df_small['cand_len']>5]
        merged_df_hight = merged_df_small[((merged_df_small['sim']>high_threshold) | (merged_df_small['sim'].isnull())) & 
                                          (merged_df_small['cand_len']<=5)]
    else:
        merged_df_lowt = merged_df_small[merged_df_small['cand_type']!='misc']
        merged_df_hight = merged_df_small[((merged_df_small['sim']>high_threshold) | (merged_df_small['sim'].isnull())) & 
                                          (merged_df_small['cand_type']=='misc')]

    final_cands_lowt = merged_df_lowt.groupby(['text'],sort=False,as_index=False).agg({'cand_freq':'sum'})
    final_cands_hight = merged_df_hight.groupby(['text'],sort=False,as_index=False).agg({'cand_freq':'sum'})

    event_cands_final = pd.concat([final_cands_lowt, final_cands_hight],ignore_index=True) 
    event_cands_final = event_cands_final[~event_cands_final['text'].isin(merged_cands)]
    event_cands_final = event_cands_final.sort_values(by=['cand_freq'],ascending=False)
    event_cands_final.reset_index(drop=True, inplace=True)
    

    
    return event_cands_final, what_merged


In [None]:
#clean nans from the dictionary
for key in what_merged.keys():
    #print(what_merged[key])
    if what_merged[key]!= 'nan':
        what_merged[key] = [value for value in what_merged[key] if value != 'nan']  

In [55]:
what_merged['turkey']

[('Erdogan', 'Erdogan', 'person-ne'),
 ('Turks', 'Turks', 'person-ne'),
 ('Turkeys', 'Turkeys', 'misc'),
 ('Turkish', 'Turkish', 'person-ne'),
 ('Turkish soldiers', 'soldiers', 'misc'),
 ('the Turkish side', 'side', 'misc'),
 ('Turkish army', 'army', 'group-ne'),
 ('Turkish officials', 'officials', 'misc'),
 ('2 - Turkey', '2', 'misc'),
 ('Turkey hosts', 'hosts', 'misc'),
 ('the Turkish coast', 'coast', 'misc'),
 ('its agents in Turkey', 'agents', 'misc'),
 ('the defensive as Turkey', 'defensive', 'misc'),
 ('Turkish flags', 'flags', 'misc'),
 ('Turkish flag', 'flag', 'misc'),
 ('Ankaras', 'Ankaras', 'misc'),
 ('Turkish Presidency', 'Presidency', 'misc'),
 ('turkey anymore, there', 'anymore,', 'misc'),
 ('rnek 1 - Turkey', 'rnek', 'misc'),
 ('Turkish ambassador', 'ambassador', 'misc'),
 ('Turkish MFA', 'MFA', 'misc'),
 ('Turkish Airways', 'Airways', 'group-ne'),
 ('Turkeyborder', 'Turkeyborder', 'loc-ne'),
 ('Turkish columnist', 'columnist', 'misc'),
 ('Turkey opensborders', 'opensbord

In [37]:
#new_merged_df['merged'] =  new_merged_df[:100].text.progress_apply(merging_bert_len) 
merged_df[merged_df['text_to_compare']=='2 - Turkey']

Unnamed: 0,text,text_to_compare,sim,cand_head,candidates,cand_text,cand_type,cand_freq,cand_len


In [3]:
event_cands[event_cands.cand_text=='smarty']#[event_cands.cand_freq==30]

Unnamed: 0,cand_head,candidates,cand_text,cand_type,cand_freq
9695,smarty,"(Smarty, Smarty, misc)",smarty,misc,1


In [None]:
merged_df_long[merged_df_long.text=='smarty']#[event_cands_final.cand_freq==30]

In [208]:
what_merged['smarty']

[]

In [104]:
sim_df

Unnamed: 0,text,text_to_compare,sim
0,refugees,karen refugees,0.762601
1,refugees,a rohingya refugee camp,0.740026
2,refugees,refugee camps,0.881408
3,refugees,refugee,0.950323
4,refugees,shelters,0.707489
...,...,...,...
4282924,eao in mon state,brahmanical,0.722714
4282925,eao in mon state,congis,0.735037
4282926,naive optimism,many innocent buddists,0.704713
4282927,wokologics,brahmanical,0.768744


In [None]:
# second merging step for rohingya
sim_df = load_pickle('rohingya_sim_df')
event_cands = load_pickle('rohingya_cands_after1')

event_cands_final, what_merged = merging_step2(event_cands, sim_df)
pickle_file('rohingya_final_cands',event_cands_final)
pickle_file('rohingya_what_merged',what_merged)

In [None]:
# second merging step for greece
sim_df = load_pickle('greece_sim_df')
event_cands = load_pickle('greece_cands_after1')

event_cands_final, what_merged = merging_step2(event_cands, sim_df)
pickle_file('greece_final_cands2',event_cands_final)
pickle_file('greece_what_merged2',what_merged)

In [None]:
# second merging step for tigray
sim_df = load_pickle('tigray_sim_df')
event_cands = load_pickle('tigray_cands_after1')

event_cands_final, what_merged = merging_step2(event_cands, sim_df)
pickle_file('tigray_final_cands',event_cands_final)
pickle_file('tigray_what_merged',what_merged)

In [None]:
# second merging step for channel
sim_df = load_pickle('channel_sent_sim_df')
event_cands = load_pickle('channel_cands_after1')

event_cands_final, what_merged = merging_step2(event_cands, sim_df)
pickle_file('channel_final_cands',event_cands_final)
pickle_file('channel_what_merged',what_merged)