# Entity Extraction and Alignment


**Necessary files:**
 - event_df = df\_[event]\_clean.csv: one CSV file per event, holding the event dataframe of clean, unique tweets


In [13]:
#python libraries
import stanza
import numpy as np
import pandas as pd
import os
import re
from tqdm.notebook import tqdm
import time
from collections import Counter, defaultdict


# pickle functions for quick storage and loading of checkpoint files
import pickle

def pickle_file(file_name, file_to_dump):
    """Pickle ``file_to_dump`` into the Candidate Data checkpoint folder.

    The sub-folder is the part of ``file_name`` before the first underscore
    (``greece_ents`` ends up in ``.../Candidate Data/greece/greece_ents``).
    The location is resolved relative to the current working directory.
    """
    folder_name = file_name.split('_')[0]
    checkpoint_path = (
        os.getcwd() + "/../../../../"
        + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    )
    with open(checkpoint_path, 'wb') as out_handle:
        pickle.dump(file_to_dump, out_handle)

def load_pickle(file_name):
    """Return the object pickled under ``file_name`` in the Candidate Data folder.

    The sub-folder is the part of ``file_name`` before the first underscore and
    the location is resolved relative to the current working directory.

    NOTE: ``pickle.load`` can execute arbitrary code, so only load checkpoint
    files that were created by this project.
    """
    folder_name = file_name.split('_')[0]
    #folder_name = re.sub(r'[12]', '', folder_name)
    checkpoint_path = (
        os.getcwd() + "/../../../../"
        + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    )
    with open(checkpoint_path, "rb") as input_file:
        loaded_object = pickle.load(input_file)
    return loaded_object


# 1. Importing the data

In [6]:
def read_event_df(event_name):
    """Read the clean dataframe of one event from its CSV file.

    event_name: one of 'greece', 'channel', 'tigray', 'rohingya'
                (anything else trips the assertion below).
    Returns the dataframe with a fresh 0..n-1 index; the number of loaded
    rows is printed as a side effect.
    """
    known_events = ['greece','channel','tigray','rohingya']
    assert event_name in known_events

    # the data folder is located relative to the current working directory
    event_path = fr"Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_{event_name}_clean.csv"
    csv_location = os.getcwd() + "/../../../../" + event_path

    event_df = pd.read_csv(csv_location, index_col=0).reset_index(drop=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df: load the clean 'channel' event dataframe and preview its first rows
event_df = read_event_df('channel')
event_df.head()

loaded 173758 tweets!


Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,refugee,migrant,immigrant,asylum_seeker,other,text_coherent,retweet_count_sum,count,text_alphanum,text_stm
0,WordPress.com,CHANNEL MIGRANT CRISIS – TODAYS VIDEOS FROM DO...,en,1284639846930227200,2020-07-19 00:03:01+00:00,1039171425364520960,0,0,0,0,...,False,True,False,False,False,CHANNEL MIGRANT CRISIS TODAYS VIDEOS FROMDOVER.,0,1,channel migrant crisis todays videos fromdover.,channel crisis today video fromdover
1,Twitter Web App,“Chinese immorality [and] eccentricities … are...,en,1284640070855729163,2020-07-19 00:03:55+00:00,153438157,22,1,37,0,...,False,False,True,False,False,Chinese immorality [and] eccentricities are ab...,22,1,chinese immorality and eccentricities are abho...,chinese immorality eccentricity abhorrent arya...
2,Twitter for iPhone,@chrisgregson123 @VeuveK @CharlieHicks90 @Rudy...,en,1284640230499328000,2020-07-19 00:04:33+00:00,503070765,0,0,0,1,...,False,False,False,False,False,O / c Leavers voted for what they believed was...,0,1,o c leavers voted for what they believed was b...,leaver voted believed best england wale howeve...
3,Twitter for iPhone,@SkyNews It never will if uk keeps bring in hu...,en,1284640911788576770,2020-07-19 00:07:15+00:00,1276420769384402944,1,1,3,1,...,False,True,False,True,False,It never will if uk keeps bring in hundreds of...,1,1,it never will if uk keeps bring in hundreds of...,never keep bring hundred asylum seeker giving ...
4,Twitter for iPhone,How many illegal immigrants this week in #Dove...,en,1284641481576402945,2020-07-19 00:09:31+00:00,755084846783950848,0,0,0,0,...,False,False,True,False,False,How many illegal immigrants this week in dover...,0,1,how many illegal immigrants this week in dover...,many illegal week dover


# 2. Entity extraction: Tag tweets using stanza module to get NER and POS tags in tweets. 
Recommended to run on GPU to speed things up.

*The pipeline also performs dependency parsing, which is used at the end of the analysis.

In [3]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#stanza.download("en")
#stanza.install_corenlp()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# loading the English stanza pipeline once; it is reused by every tagging cell below
# processors requested: tokenization, POS tagging, lemmatization, dependency parsing and NER
en_nlp = stanza.Pipeline("en",  
                         tokenize_pretokenized=False,
                         ner_batch_size=4096,
                         processors = "tokenize,pos,lemma,depparse,ner")

2021-06-20 22:20:33 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2021-06-20 22:20:33 INFO: Use device: cpu
2021-06-20 22:20:33 INFO: Loading: tokenize
2021-06-20 22:20:33 INFO: Loading: pos
2021-06-20 22:20:33 INFO: Loading: lemma
2021-06-20 22:20:33 INFO: Loading: depparse
2021-06-20 22:20:34 INFO: Loading: ner
2021-06-20 22:20:36 INFO: Done loading processors!


Run the stanza pipeline on the parsing corpus for each event dataset

In [9]:
# Tag every 'greece' tweet with the stanza pipeline and checkpoint the result.
# The checkpoint is written once (the original cell dumped the same object twice).
event_name = 'greece'
event_df = read_event_df(event_name)
event_tagged_tweets = [en_nlp(tweet_batch) for tweet_batch in tqdm(list(event_df['parsing_corpus']))]
pickle_file(f'{event_name}_tagged_tweets',event_tagged_tweets)

In [None]:
# Tag every 'channel' tweet with the stanza pipeline and checkpoint the result.
# The checkpoint is written once (the original cell dumped the same object twice).
event_name = 'channel'
event_df = read_event_df(event_name)
event_tagged_tweets = [en_nlp(tweet_batch) for tweet_batch in tqdm(list(event_df['parsing_corpus']))]
pickle_file(f'{event_name}_tagged_tweets',event_tagged_tweets)

In [None]:
# Tag every 'tigray' tweet with the stanza pipeline and checkpoint the result.
# The checkpoint is written once (the original cell dumped the same object twice).
event_name = 'tigray'
event_df = read_event_df(event_name)
event_tagged_tweets = [en_nlp(tweet_batch) for tweet_batch in tqdm(list(event_df['parsing_corpus']))]
pickle_file(f'{event_name}_tagged_tweets',event_tagged_tweets)

In [8]:
# Tag every 'rohingya' tweet with the stanza pipeline and checkpoint the result.
# The checkpoint is written once (the original cell dumped the same object twice).
event_name = 'rohingya'
event_df = read_event_df(event_name)
event_tagged_tweets = [en_nlp(tweet_batch) for tweet_batch in tqdm(list(event_df['parsing_corpus']))]
pickle_file(f'{event_name}_tagged_tweets',event_tagged_tweets)

100%|██████████████████████████████████████████████████████████████████████████| 22966/22966 [4:58:17<00:00,  1.28it/s]


# 3. Entity alignment
## 3.1. Merging step 1

Pre-requisite files:

    - event_tagged_tweets = python object with tagged tweets
    
The first merging step is an overlap between extraction and alignment. First, we extract the entities from the tagged tweets objects. Next, we process them, remove the duplicates by counting their instances and store them in a data frame. The dataframe is sorted by frequency of the entities.


In [7]:
def get_tweet_tags(tagged_tweets):
    """
    Input: pre-tagged tweets (each exposes .sentences, each sentence exposes .ents)
    Output: list of lists containing named entities, one inner list per tweet

    Entities whose type is purely numeric or temporal (see excluded_tags) are left out.
    """
    excluded_tags = ['CARDINAL', 'DATE', 'QUANTITY', 'TIME', 'PERCENT', 'MONEY', 'ORDINAL']

    def _kept_entity_texts(tagged_tweet):
        # texts of all entities of one tweet, in sentence order, minus excluded types
        kept = []
        for sentence in tagged_tweet.sentences:
            for entity in sentence.ents:
                if entity.type in excluded_tags:
                    continue
                kept.append(entity.text)
        return kept

    tweet_entities = []
    for position in tqdm(range(len(tagged_tweets))):
        tweet_entities.append(_kept_entity_texts(tagged_tweets[position]))
    return tweet_entities

def get_ent_head(entity):
    """Return the syntactic head word of an entity string.

    Single-token entities are returned unchanged. Multi-token entities are parsed
    with the module-level stanza pipeline ``en_nlp`` and the text of the first word
    whose dependency head is 0 (the root) is returned. If the parse yields no root
    word, the entity itself is returned instead of raising IndexError.
    """
    # whitespace split replaces the original word_tokenize call, which was never
    # imported in this notebook and therefore raised NameError
    if len(entity.split()) <= 1:
        return entity

    entity_doc = en_nlp(entity)
    # the root of the NP has head value 0; keep the first one found
    root_words = [word.text
                  for sentence in entity_doc.sentences
                  for word in sentence.words
                  if word.head == 0]
    return root_words[0] if root_words else entity

def process_entity(entity):
    """Normalise an entity string.

    The text is lowercased, every run of characters outside 0-9/a-z/A-Z is
    replaced by a single space, and leading/trailing/repeated whitespace is
    collapsed.
    """
    lowered = entity.lower()
    spaced = re.sub('[^0-9a-zA-Z]+', ' ', lowered)
    # splitting and re-joining squeezes the whitespace introduced by the substitution
    tokens = spaced.split()
    return " ".join(tokens)

def create_entities_df(tagged_tweets):
    """Build the entity frequency table for one event.

    Steps:
    1. extract the named entities of every tagged tweet (get_tweet_tags)
    2. normalise each entity string (process_entity) and count identical strings
    3. keep entities that occur at least 5 times and are at least 2 characters long
    4. attach the head word of every remaining entity (get_ent_head)

    Returns a dataframe with columns 'entity', 'freq' and 'head', sorted by
    descending frequency and indexed 0..n-1.
    """
    event_entities = get_tweet_tags(tagged_tweets)

    # the per-tweet grouping is not needed beyond this point, so flatten while normalising
    # (the original copied the list element by element first, which only added a progress bar)
    entity_list = [process_entity(entity)
                   for tweet_entities in event_entities
                   for entity in tweet_entities]

    #create the count dictionary which will be converted into a df
    counted_entities = Counter(entity_list)
    ent_df = pd.DataFrame(list(counted_entities.items()),
                          columns=['entity','freq']).sort_values('freq',ascending=False)

    #filter out entities shorter than 2 characters and frequency less than 5
    keep_mask = (ent_df['freq'] >= 5) & (ent_df['entity'].str.len() >= 2)
    # .copy() so the 'head' column is added to an independent frame, not a slice
    ent_df = ent_df[keep_mask].copy()

    #find head of the entity
    tqdm.pandas()
    ent_df['head'] = ent_df.entity.progress_apply(get_ent_head)
    ent_df.reset_index(drop=True,inplace=True)
    return ent_df[['entity','freq','head']]

def merging_step1(event_name):
    """Run the first merging step for one event.

    Loads the '<event_name>_tagged_tweets' checkpoint, builds the entity
    frequency table from it, stores the table as '<event_name>_ents' and
    returns it.
    """
    print(f'loading {event_name} tagged tweets...')
    ent_df = create_entities_df(load_pickle(f'{event_name}_tagged_tweets'))
    pickle_file(f'{event_name}_ents',ent_df)
    return ent_df


In [None]:
# build and checkpoint ('rohingya_ents') the entity frequency table of the rohingya event
ent_df = merging_step1('rohingya')

In [None]:
# build and checkpoint ('tigray_ents') the entity frequency table of the tigray event
ent_df = merging_step1('tigray')

In [None]:
# build and checkpoint ('greece_ents') the entity frequency table of the greece event
ent_df = merging_step1('greece')

In [None]:
# build and checkpoint ('channel_ents') the entity frequency table of the channel event
ent_df = merging_step1('channel')

## 3.2. Merging step 2: Similarity using Sentence BERT embeddings

### 3.2.1. Encode entities with S-BERT embeddings

Pre-requisite files:
 
    - event_ents = df with event entities after 1st merging step to be encoded


In [None]:
def train_embeddings(event_ents):
    """Encode every entity string with a pre-trained Sentence-BERT model.

    event_ents: dataframe with an 'entity' column.
    Returns the result of SentenceTransformer.encode for the list of entities.
    Despite the name nothing is trained here; the model is only used for encoding.
    The number of entities and the elapsed time are printed.
    """
    from time import time
    from sentence_transformers import SentenceTransformer

    # alternative checkpoint tried earlier: 'bert-base-nli-mean-tokens'
    encoder = SentenceTransformer('paraphrase-mpnet-base-v2')

    bert_corpus = event_ents['entity'].tolist()
    print(f'there are {len(bert_corpus)} entities to be encoded')

    t0 = time()
    ent_embeddings = encoder.encode(bert_corpus)
    print(f'Training embeddings took {time()-t0} seconds')
    return ent_embeddings

In [None]:

# load the greece entity table produced by merging step 1
event_ents = load_pickle('greece_ents')

# encode the entity strings with S-BERT
document_embeddings = train_embeddings(event_ents)

# checkpoint the embeddings for the similarity step
pickle_file('greece_embeddings_ents', document_embeddings)

In [None]:
# load the rohingya entity table produced by merging step 1
event_ents = load_pickle('rohingya_ents')

# encode the entity strings with S-BERT
document_embeddings = train_embeddings( event_ents)

# checkpoint the embeddings for the similarity step
pickle_file('rohingya_embeddings_ents', document_embeddings)

In [None]:
# load the tigray entity table produced by merging step 1
event_ents = load_pickle('tigray_ents')

# encode the entity strings with S-BERT
document_embeddings = train_embeddings(event_ents)

# checkpoint the embeddings for the similarity step
pickle_file('tigray_embeddings_ents', document_embeddings)

In [None]:
# load the channel entity table produced by merging step 1
event_ents = load_pickle('channel_ents')

# encode the entity strings with S-BERT
document_embeddings = train_embeddings(event_ents)

# checkpoint the embeddings for the similarity step
pickle_file('channel_embeddings_ents', document_embeddings)

In [None]:
#sound warning after the code is done running
duration = 1000  # milliseconds
freq = 440  # Hz
try:
    # winsound only exists on Windows; importing it elsewhere raises ImportError
    import winsound
except ImportError:
    # fall back to the terminal bell so the cell does not break a full notebook run
    print('\a')
else:
    winsound.Beep(freq, duration)

### 3.2.2 Creating a similarity dataframe
The similarity dataframe enables faster lookup and experimentation with optimal thresholds when aligning entities.

Pre-requisite files:

    - [event_name]_ents
    - [event_name]_embeddings_ents

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def create_sim_df(event_name, sim_threshold=0.6):
    """
    1. loads necessary files for the selected event 
    2. calculates pairwise cosine similarity of the entity embeddings 
    3. stores the pairs and their similarity in the similarity dataframe

    Every ordered pair (including an entity paired with itself) whose similarity is
    strictly above ``sim_threshold`` becomes one row with the columns
    'text', 'text_to_compare' and 'sim'. Rows are ordered by the position of 'text'
    and then of 'text_to_compare' in the entity table. When no pair qualifies, an
    empty dataframe that still has the three columns is returned.

    Raises ValueError when the entity table and the embeddings differ in length,
    which indicates checkpoints from different runs.
    """
    event_entities = load_pickle(f'{event_name}_ents')
    ent_embeddings = load_pickle(f'{event_name}_embeddings_ents')

    #calculate pairwise similarities
    sims = cosine_similarity(ent_embeddings)
    if sims.shape[0] != len(event_entities):
        raise ValueError(
            f'{len(event_entities)} entities but {sims.shape[0]} embeddings for event {event_name!r}'
        )

    # select all pairs above the threshold at once instead of looping over the
    # n x n matrix in Python; np.nonzero returns the pairs in row-major order,
    # which is the order the nested loops produced
    entity_names = event_entities['entity'].to_numpy()
    up_ent_ids, low_ent_ids = np.nonzero(sims > sim_threshold)

    sim_df = pd.DataFrame({'text': entity_names[up_ent_ids],
                           'text_to_compare': entity_names[low_ent_ids],
                           'sim': sims[up_ent_ids, low_ent_ids]})
    return sim_df

In [None]:
# similarity dataframe for greece (default threshold 0.6), checkpointed as 'greece_sim_df_ents'
sim_df = create_sim_df('greece')
pickle_file('greece_sim_df_ents',sim_df)

In [None]:
# similarity dataframe for rohingya (default threshold 0.6), checkpointed as 'rohingya_sim_df_ents'
sim_df = create_sim_df('rohingya')
pickle_file('rohingya_sim_df_ents',sim_df)

In [None]:
# similarity dataframe for tigray (default threshold 0.6), checkpointed as 'tigray_sim_df_ents'
sim_df = create_sim_df('tigray')
pickle_file('tigray_sim_df_ents',sim_df)

In [None]:
# similarity dataframe for channel (default threshold 0.6), checkpointed as 'channel_sim_df_ents'
sim_df = create_sim_df('channel')
pickle_file('channel_sim_df_ents',sim_df)

In [None]:
# sim_df enables lookup of the most similar terms: here the 20 entries closest to 'greece'
# (the first hit is the entity paired with itself)
sim_df = load_pickle('greece_sim_df_ents')
sim_df[sim_df.text=='greece'].sort_values('sim',ascending=False)[:20]


### 3.2.3. Aligning the entities

Pre-requisite files:

    - sim_df = df with all similar entity pairs
    - entities = df of all entities (including dissimilar) with frequencies

In [10]:
def flatten(list_of_mixed_types):
    """Flatten arbitrarily nested lists into one flat list, keeping element order.

    Only ``list`` instances are expanded; every other element (strings, tuples, ...)
    is kept as a single item.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(list_of_mixed_types))

def align_transitively(what_merged):
    """
    function to transitively merge the entities

    Input:  what_merged = mapping of core entity -> list of entities merged into it
    Output: defaultdict(list) with the same keys, where each list is extended by
            - the merged lists of any of its own items that are keys themselves, and
            - every other key (plus that key's merged list) that shares at least one
              merged item with it.
    The extension is done in a single pass over the keys, reading only from the
    original ``what_merged``. Lists are de-duplicated through ``set``, so the order
    of the items in the returned lists is not the input order.
    """
    merged_dict = defaultdict(list)
    # bookkeeping of entities seen as merged; it is filled but not returned
    merged_ent = set()
    
    for key,value in tqdm(what_merged.items()):
        merged_ent.update(what_merged[key])
        for item in value:
            # start from the key's own merged list (repeated per item, de-duplicated below)
            merged_dict[key].extend(what_merged[key])
            # if a merged item is a core entity itself, pull in its merged list as well
            if item in what_merged.keys():
                merged_dict[key].extend(what_merged[item])
                merged_dict[key] = list(set(merged_dict[key]))

        #print('-------loop------')
        # scan all other core entities: one that shares a merged item with `key`
        # is absorbed together with its merged list
        for ent,merged in what_merged.items():
            for item2 in merged:
                if item2 in what_merged[key] and item2 in what_merged[ent] and key!=ent:
                    merged_dict[key].extend(what_merged[key] + what_merged[ent])
                    merged_dict[key].append(ent)
                    merged_ent.add(ent)
        # final de-duplication; this also creates an empty entry for keys without items
        merged_dict[key] = list(set(merged_dict[key]))  
        #merged_dict[key].pop(merged_dict[key].index(key))
    return merged_dict

def finalize_ents(merged_dict):
    """Reduce the merge dictionary to its core entities.

    Keys are visited in order. A key is kept only if it has not appeared in the
    merged list of a previously kept key. The first occurrence of the key inside
    its own merged list is removed. Finally two manual groups ('refugees' and
    'migrants') are set, replacing any entry already stored under those keys.

    The lists of ``merged_dict`` are copied, so the caller's dictionary is left
    unchanged (the original removed the self-reference from the caller's lists).

    Returns a ``defaultdict`` without default factory (it behaves like a plain dict).
    """
    final_ents = defaultdict()
    check_set = set()
    for key, merged in merged_dict.items():
        if key in check_set:
            # already absorbed by an earlier core entity
            continue
        check_set.update(merged)
        cleaned = list(merged)
        if key in cleaned:
            cleaned.remove(key)
        final_ents[key] = cleaned

    # add manual terms to the dict so (refugees and asylum seekers) and (migrants and immigrants) are together
    final_ents['refugees'] = ['refugee','asylum seeker','asylum seekers']
    final_ents['migrants'] = ['migrant','immigrant','immigrants']
    return final_ents


def merging_step2(entities,sim_df,low_threshold=0.8, high_threshold =0.9, transitive=False): 
    """
    Greedy, frequency-ordered merge of similar entities.

    inputs:
    entities = entity df with entity frequencies (columns 'entity', 'freq', ...)
    sim_df = similarity dataframe with all pairs of entities that are at least [threshold] similar
             (columns 'text', 'text_to_compare', 'sim')
    low_threshold = determines entity merge in the first merging round 
    high_threshold = determines entity merge in the second and subsequent merging rounds if transitive merge is enabled
    transitive = True if enabled
    returns:
    what_merged = dictionary with core entities as keys and lists of their merged entities as values

    Entities are visited from most to least frequent. An entity that was already
    merged into a more frequent one gets an empty list.
    """
    what_merged = defaultdict(list)
    merged_ents = set()

    #merge the dataframes so we have information about frequencies of entities in sim_df
    # use outer join to include the entities that are not similar to any other entity (thus not in sim_df)
    frequent_entities = entities[entities.freq>=5]
    merged_df_ = pd.merge(sim_df,frequent_entities, how='outer', left_on='text', right_on='entity')
    merged_df = pd.merge(merged_df_,frequent_entities, how='outer', left_on='text_to_compare', right_on='entity')
    
    #blank fields in sim_df's text are the entities that are dissimilar to the rest, pass the value from entity text
    # assign the filled columns back: an inplace fillna on `merged_df.entity_x` acts on a
    # temporary Series and does not update merged_df when pandas Copy-on-Write is active
    merged_df['entity_x'] = merged_df['entity_x'].fillna(merged_df['entity_y'])
    merged_df['freq_x'] = merged_df['freq_x'].fillna(merged_df['freq_y'])
    
    merged_df = merged_df.drop(['text','text_to_compare'],axis=1)

    # and select only the rows above the lower threshold, so we do not have to filter by it in the loop
    merged_df_small = merged_df[merged_df.sim>low_threshold]

    print(f'finding merged entities...')
    # create an object to iterate over unique entities
    unique_merged_df = merged_df.groupby(by=['entity_x'], sort = False, as_index=False).agg({'freq_x':'max'}).sort_values('freq_x',ascending=False)

    def get_merged_entities(entity,transitive = False):
        """
        Store merged entities as list

        First round (transitive False): entities similar to `entity` above low_threshold;
        all of them are registered in merged_ents so they cannot become core entities later.
        Transitive round: entities similar to `entity` above high_threshold.
        An entity that is already in merged_ents yields an empty list in both rounds.
        """
        lookup_df = pd.DataFrame({'entity_x': [], 'entity_y':[]})

        if entity not in merged_ents:
            if not transitive:
                lookup_df = merged_df_small[merged_df_small.entity_x==entity]
                merged_ents.update(list(lookup_df['entity_y']))
            else:
                lookup_df = merged_df_small[(merged_df_small.entity_x==entity) & (merged_df_small.sim>high_threshold)]
        
        #clean nans 
        lookup_df = lookup_df[~lookup_df['entity_y'].isnull()]
        return list(lookup_df['entity_y'])
    
    tqdm.pandas() 
    #note that in the first round the transitiveness is False
    unique_merged_df['merged'] =  unique_merged_df.entity_x.progress_apply(get_merged_entities)

    #convert the dataframe into a dictionary
    for entity,merged_list in tqdm(zip(unique_merged_df['entity_x'],unique_merged_df['merged'])):
        what_merged[entity] = merged_list.copy()
        if transitive:
            # NOTE(review): every item of merged_list was added to merged_ents in the first
            # round, so these lookups appear to always return an empty list - confirm whether
            # the merged_ents guard was meant to apply to the first round only
            for merged_ent in merged_list:
                what_merged[entity].append(get_merged_entities(merged_ent,transitive=True))
        # flatten the nested lists appended above and drop duplicates (order is not preserved)
        what_merged[entity] = list(set(flatten(what_merged[entity]) ) ) 
    
    return what_merged




In [11]:
def run_merging_step2(event_name,entity_type='ents',low_threshold=0.8, high_threshold = 0.9, transitive=True):
    """Run the second merging step for one event and checkpoint the result.

    Loads '<event_name>_sim_df_<entity_type>' and '<event_name>_<entity_type>',
    merges similar entities (merging_step2), optionally aligns them transitively,
    applies the manual corrections below, reduces the result to core entities
    (finalize_ents) and stores it as '<event_name>_what_merged'.

    Returns the final dictionary of core entity -> merged entities.
    """
    sim_df = load_pickle(f'{event_name}_sim_df_{entity_type}')
    entities = load_pickle(f'{event_name}_{entity_type}')

    what_merged = merging_step2(entities,
                                sim_df,
                                low_threshold=low_threshold, 
                                high_threshold=high_threshold,
                                transitive=transitive)
    
    if transitive:
        what_merged = align_transitively(what_merged)
    
    # manual corrections: (core entity, entity that must not be merged into it)
    manual_splits = []
    if event_name == 'channel' or event_name == 'greece':
        manual_splits += [('germany','austria'), ('germany','austrian')]
    if event_name == 'greece':
        manual_splits.append(('russia','ukraine'))

    # whether these pairs were merged at all depends on the thresholds, so only remove
    # what is actually there (an unconditional .index() raises ValueError otherwise)
    for core, unwanted in manual_splits:
        if core in what_merged and unwanted in what_merged[core]:
            what_merged[core].remove(unwanted)
            
    what_merged = finalize_ents(what_merged)
    #pickle_file(f'{event_name}_final_{entity_type}',final_ents)
    pickle_file(f'{event_name}_what_merged',what_merged)
    return  what_merged #final_ents,

In [None]:
# align tigray entities: merge pairs with similarity above 0.7; transitive merging is off,
# so high_threshold is not used in this run
merged_dict = run_merging_step2('tigray',low_threshold=0.7,high_threshold = 0.9, transitive=False)

In [None]:
# align rohingya entities: merge pairs with similarity above 0.7; transitive merging is off,
# so high_threshold is not used in this run
merged_dict = run_merging_step2('rohingya',low_threshold=0.7,high_threshold = 0.9,transitive=False)

In [14]:
# align greece entities: merge pairs with similarity above 0.7; transitive merging is off,
# so high_threshold is not used in this run
merged_dict = run_merging_step2('greece',low_threshold=0.7, high_threshold = 0.9,transitive=False)

finding merged entities...


  0%|          | 0/2076 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [None]:
# align channel entities: merge pairs with similarity above 0.7; transitive merging is off,
# so high_threshold is not used in this run
merged_dict = run_merging_step2('channel',low_threshold=0.7,high_threshold=0.85, transitive=False)

In [19]:
# first 20 core entities of whichever event was aligned last in this kernel
print(list(merged_dict.keys())[:20])

['turkey', 'greece', 'eu', 'syria', 'erdogan', 'russia', 'idlib', 'uk', 'assad', 'germany', 'nato', 'us', 'un', 'muslim', 'lesbos', 'iran', 'afghanistan', 'italy', 'france', 'bulgaria']


Check whether the similar pairs from sim_df correspond to entities in merged_dict

In [16]:
# entities merged into the core entity 'syria'
merged_dict['syria']

['north syrian',
 'syrian war',
 'syrias',
 'syrians',
 'russian syrian',
 'northern syria',
 'ne syria',
 'the syrian arab',
 'syria war',
 'syrian',
 'nw syria',
 'the syrian war',
 'syrian arab republic',
 'damascus',
 'turkish syrian',
 'the syrian army',
 'the syrian crisis',
 'syrian coalition',
 'the syrian arab republic',
 'syrian army',
 'northwest syria',
 'syrian arab',
 'north syria',
 'syrian regime',
 'the syrian civil war',
 'n syria']

In [20]:
# 30 most similar entries for 'syria'; relies on the sim_df currently loaded in the kernel
sim_df[sim_df.text=='syria'].sort_values('sim',ascending=False)[:30]

Unnamed: 0,text,text_to_compare,sim
241,syria,syria,1.0
249,syria,syrias,0.931801
286,syria,n syria,0.913211
242,syria,syrian,0.873278
243,syria,syrians,0.835501
265,syria,syrian arab republic,0.828813
266,syria,nw syria,0.822142
254,syria,north syria,0.812446
274,syria,ne syria,0.810139
250,syria,northern syria,0.805941


### 3.2.4. Find entity frequencies

Pre-requisite files:
    
    - [event_name]_what_merged = dictionary of merged entities
    - event_df = event dataset
    
This part is purely illustrative: it shows how much entity alignment increases the counts of analyzable entities.

In [None]:
from collections import defaultdict
from itertools import combinations

def get_frequencies(what_merged, event_df):
    """
    in part 1: get frequencies of each item in what_merged dictionary from the corpus
    
    in part 2: correct frequencies for repeating terms
    """
    #~~~~ part 1 ~~~~
    freq_dict = defaultdict(list)
    for key in tqdm(what_merged.keys()):
        freq_dict2 = dict()
        c = sum(event_df['text_alphanum'].apply(lambda x: x.count(f' {key} ')))
        freq_dict[key].append(c)
        for item in what_merged[key]:
            c2 = sum(event_df['text_alphanum'].apply(lambda x: x.count(f' {item} ')))
            freq_dict2[item]=c2
        freq_dict[key].append(freq_dict2)
    
    #~~~~ part 2 ~~~~
    
    # Instantiate dict to store individual count for entity and merged count
    freq_counts = defaultdict(dict)
    
    # Iterate over entities
    for ent in freq_dict.keys():
        
        #print(f"Ent: {ent}")
        adj_freq = 0
        
        # Store number of occurences of root entity
        root_freq = freq_dict[ent][0]
        
        # Instantiate empty list where entities are stored that have already been counted
        roots = []
        
        # Integrate main entity into dict of merged entities
        ent_dict = freq_dict[ent][1]
        ent_dict[ent] = freq_dict[ent][0]
        
        # Iterate over merged entities based on their length (in term of tokens) in ascending order
        for merged_ent in sorted(ent_dict.keys(), key = len):
            #print(f"Merged Ent: {merged_ent}")
            
            # Create all possible sub-combinations of entity while keeping the order constant
            # For example: "Greece Turkey Border" yields Greece, Turkey, Border, Greece Turkey, Turkey Border, Greece Turkey Border
            merged_ent_list = merged_ent.split(" ")
            all_slices = [merged_ent_list[s:e] for s, e in combinations(range(len(merged_ent_list)+1), 2)]
            all_slices_strings = [" ".join(l) for l in all_slices]
            
            # Iterate over all entities that have already been counted
            for root in roots:
                
                # Check if any of the substring of current entity has already been counted, if yes don't count it again
                if root in all_slices:
                    #print(f"{merged_ent} was removed to prevent double count. {root} has already been counted")
                    break
            
            # If entity has not been counted, count it and add entity to counted entities
            else:
                roots.append([merged_ent])
                #print(roots)
                adj_freq += ent_dict[merged_ent]
                #print(f"Frequency incremented by {ent_dict[merged_ent]}")
        
        # Update frequency dict
        freq_counts[ent]["Root"] = root_freq
        freq_counts[ent]["Overall"] = adj_freq


    return freq_dict,freq_counts

In [None]:
# read_event_df expects the event name; `rohingya_url` is not defined anywhere in the notebook
event_df = read_event_df('rohingya')
what_merged = load_pickle('rohingya_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)

In [None]:
# read_event_df expects the event name; `tigray_url` is not defined anywhere in the notebook
event_df = read_event_df('tigray')
what_merged = load_pickle('tigray_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)

In [None]:
# read_event_df expects the event name; `greece_url` is not defined anywhere in the notebook
event_df = read_event_df('greece')
what_merged = load_pickle('greece_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)
#freq_dict

In [None]:
# read_event_df expects the event name; `channel_url` is not defined anywhere in the notebook
event_df = read_event_df('channel')
what_merged = load_pickle('channel_what_merged')

freq_dict,total_freq = get_frequencies(what_merged, event_df)
#freq_dict

In [None]:
# first 30 core entities of the frequency table computed last in this kernel
print(list(total_freq.keys())[:30])

In [None]:
# inspect one core entity: its Root/Overall counts, then its merged entities
# ordered from most to least frequent
word= 'brexit'

print(total_freq[word])
print(list(dict(sorted(freq_dict[word][1].items(), key=lambda item: item[1],reverse=True)).keys()))


In [None]:
# all core entities ranked by their adjusted ("Overall") frequency, highest first
dict(sorted(total_freq.items(), key=lambda item: item[1]['Overall'],reverse=True))