# Topic modelling using BERTopic

## Libraries/data required

In [1]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Load the cleaned article summaries; the "date" column is parsed into datetimes
df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])
print(df.shape)  # (number of rows, number of columns)

# Collect every article summary in a plain Python list (the documents to model)
docs = df["summary"].to_list()

# Preview the first 5 rows of the dataframe
df.head()

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [3]:
# Fit and save a new model only when no saved model exists yet; otherwise reuse the saved one
if not os.path.exists('southsudan_model'):
    # Initialize the BERTopic model
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True)

    # Fit the model to the list of article summaries
    bertopic.fit_transform(docs)

    # Persist the trained model as "southsudan_model" so later runs can skip the fit
    bertopic.save("southsudan_model")
else:
    bertopic = BERTopic.load('southsudan_model')

Batches: 100%|██████████| 579/579 [14:03<00:00,  1.46s/it]
2023-10-19 13:09:26,026 - BERTopic - Transformed documents to Embeddings
2023-10-19 13:10:11,221 - BERTopic - Reduced dimensionality
2023-10-19 13:11:36,742 - BERTopic - Clustered reduced embeddings


In [4]:
#Due to the modularity of the model, there is a lot of randomness that hinders the reproducibility of the model.
#To counter this, you can for example set the random state in the dimensionality reduction step via the following lines,
#or explore a different approach.

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [5]:
#bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [6]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve the top_n topics most relevant to the provided (list of) keyword(s)


    Parameters:
        bertopic_model: a (fitted) BERTopic model object; only its
                        find_topics(keyword, top_n=...) method is used here

        keywords:   a string containing one or multiple keywords to match against,

                    This can also be a list in the form of ['keyword(s)', 'keyword(s)', ...]

                    In this case a maximum of top_n topics will be found per list element
                    and the combined result is subsetted to the top_n most relevant topics.

                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s)
                    and not the relevancy to the combined list of keywords.

                    In other words, topics that appear in the output might be significantly related to a
                    particular element in the list of keywords but not so to any other element,

                    while topics that do not appear in the output might be significantly related to the
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!

        top_n: an integer indicating the number of desired relevant topics to be retrieved


        Return: a list of at most top_n (topic_id, relevancy) tuples, sorted on descending relevancy.
                A topic matched by several keywords appears once, with its greatest relevancy.
    '''

    # A single string is treated as one query, not as an iterable of characters
    if isinstance(keywords, str):
        keywords = [keywords]

    relevant_topics = []  # Collected (topic_id, relevancy) pairs over all keywords

    for keyword in keywords:
        # Find the top_n topics related to the current keyword(s)
        topic_ids, relevancies = bertopic_model.find_topics(keyword, top_n=top_n)
        relevant_topics.extend(zip(topic_ids, relevancies))

    # Sort on ASCENDING relevancy so that, when the pairs are turned into a dict,
    # the last (= greatest) relevancy wins for topics that occur more than once
    relevant_topics.sort(key=lambda pair: pair[1])
    unique_topics = list(dict(relevant_topics).items())

    # Now sort the unique topics on DESCENDING relevancy
    unique_topics.sort(key=lambda pair: pair[1], reverse=True)

    # Honour the requested top_n (previously a hard-coded 10 was used here)
    return unique_topics[:top_n]

In [7]:
# Rank the 10 topics closest to the keywords 'hunger' and 'food insecurity'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['hunger', 'food insecurity'],
    top_n=10,
)

# Keep the topic IDs on their own, without the relevancy scores
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["hunger"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

23 0.4998878
83 0.46617252
58 0.4534978
123 0.4528453
34 0.36217391
195 0.32818058
182 0.2928917
161 0.28540993
131 0.28012794
4 0.2799514


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23,116,23_food_famine_hunger_million,"[food, famine, hunger, million, insecurity, cr...",[The article discusses the famine crisis affec...
83,43,83_fao_food_kits_million,"[fao, food, kits, million, farmers, seeds, liv...",[The article discusses FAO's efforts to provid...
58,57,58_wfp_food_programme_assistance,"[wfp, food, programme, assistance, world, mill...",[The article discusses the European Commission...
123,30,123_malnutrition_children_nutrition_unicef,"[malnutrition, children, nutrition, unicef, br...",[The article discusses the high rates of acute...
34,91,34_agriculture_agricultural_farmers_food,"[agriculture, agricultural, farmers, food, far...",[The article discusses the need for cooperatio...
195,18,195_children_unicef_million_malnutrition,"[children, unicef, million, malnutrition, fund...",[The article discusses the deteriorating human...
182,19,182_tons_metric_wfp_food,"[tons, metric, wfp, food, corridor, sorghum, h...",[The article discusses the arrival of 18 truck...
161,22,161_livestock_animal_cattle_animals,"[livestock, animal, cattle, animals, diseases,...",[The article discusses how South Sudan leads i...
131,28,131_prices_price_inflation_beverages,"[prices, price, inflation, beverages, consumer...",[The article discusses a decrease in inflation...
4,215,4_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[The article discusses the commitment of the U...


In [8]:
# Rank the 10 topics closest to the keywords 'refugees' and 'displaced'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['refugees', 'displaced'],
    top_n=10,
)

# Keep the topic IDs on their own, without the relevancy scores
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["refugees"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

16 0.6878126
231 0.65658164
13 0.6533308
127 0.6432601
87 0.621728
228 0.5975042
33 0.5760368
212 0.5379883
156 0.51203954
136 0.5112951


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16,135,16_refugees_uganda_refugee_district,"[refugees, uganda, refugee, district, adjumani...",[The article discusses the World Refugee Counc...
231,12,231_israeli_israel_immigrants_migrants,"[israeli, israel, immigrants, migrants, asylum...",[The article discusses an incident where Egypt...
13,142,13_refugees_unhcr_yida_refugee,"[refugees, unhcr, yida, refugee, nile, camp, a...",[The article discusses the shortage of humanit...
127,28,127_kakuma_refugee_camp_refugees,"[kakuma, refugee, camp, refugees, kenya, camps...",[The article discusses the influx of refugees ...
87,41,87_refugees_unhcr_refugee_funding,"[refugees, unhcr, refugee, funding, ethiopia, ...",[The article discusses the urgent need for fun...
228,13,228_darfur_chad_darfuri_tissi,"[darfur, chad, darfuri, tissi, displaced, retu...",[The article discusses the influx of Chadian r...
33,92,33_displaced_idps_bases_people,"[displaced, idps, bases, people, internally, u...",[The article discusses the expansion of operat...
212,14,212_bentiu_flooding_dire_base,"[bentiu, flooding, dire, base, sanitation, dis...",[The article discusses the horrific living con...
156,24,156_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, peacekeeping, m...",[The article discusses new fighting in South S...
136,27,136_kenyans_evacuation_kenyan_flight,"[kenyans, evacuation, kenyan, flight, national...",[The article discusses the evacuation of Kenya...


In [9]:
# Rank the 10 topics closest to the keyword 'humanitarian'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['humanitarian'],
    top_n=10,
)

# Keep the topic IDs on their own, without the relevancy scores
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["humanitarian"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

111

 0.64607096
4 0.64320934
234 0.6348435
49 0.6089697
33 0.6074257
87 0.60171133
156 0.601251
212 0.59630156
195 0.59358394
95 0.5844182


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111,34,111_humanitarian_pibor_jonglei_affected,"[humanitarian, pibor, jonglei, affected, aid, ...",[The article discusses the return of humanitar...
4,215,4_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[The article discusses the commitment of the U...
234,12,234_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, upda...",[The article discusses a press conference with...
49,63,49_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, killing, ...",[The article discusses the release of ten aid ...
33,92,33_displaced_idps_bases_people,"[displaced, idps, bases, people, internally, u...",[The article discusses the expansion of operat...
87,41,87_refugees_unhcr_refugee_funding,"[refugees, unhcr, refugee, funding, ethiopia, ...",[The article discusses the urgent need for fun...
156,24,156_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, peacekeeping, m...",[The article discusses new fighting in South S...
212,14,212_bentiu_flooding_dire_base,"[bentiu, flooding, dire, base, sanitation, dis...",[The article discusses the horrific living con...
195,18,195_children_unicef_million_malnutrition,"[children, unicef, million, malnutrition, fund...",[The article discusses the deteriorating human...
95,39,95_red_cross_icrc_crescent,"[red, cross, icrc, crescent, ifrc, medical, ca...",[The article discusses how the Governor of Jon...


In [10]:
# Rank the 10 topics closest to the keywords 'conflict', 'fighting', and 'murder'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['conflict', 'fighting', 'murder'],
    top_n=10,
)

# Keep the topic IDs on their own, without the relevancy scores
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["conflict"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

215 0.43773508
110 0.40162688
76 0.40162563
108 0.38666812
197 0.38523126
140 0.3822374
1 0.38093996
26 0.38051602
151 0.38031748
213 0.38017347


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
215,14,215_truce_positions_army_upper,"[truce, positions, army, upper, attacking, nil...",[The article discusses accusations from the So...
110,34,110_her_death_murder_sister,"[her, death, murder, sister, shot, was, kenyan...",[The article discusses the call made by Bishop...
76,47,76_kordofan_border_blue_accusations,"[kordofan, border, blue, accusations, nile, bo...",[The article discusses the decision by the Sud...
108,35,108_jonglei_conference_communities_peace,"[jonglei, conference, communities, peace, stat...",[The article discusses the Greater Akobo peace...
197,17,197_kordofan_army_khartoum_supporting,"[kordofan, army, khartoum, supporting, denial,...",[The article discusses South Sudan's spokesper...
140,26,140_reconciliation_healing_process_national,"[reconciliation, healing, process, national, c...",[The article discusses South Sudan's governmen...
1,394,1_igad_talks_intergovernmental_authority,"[igad, talks, intergovernmental, authority, ad...",[The article discusses the upcoming meeting of...
26,104,26_peace_peacebuilding_civil_society,"[peace, peacebuilding, civil, society, violenc...",[The article discusses the importance of civil...
151,25,151_disarmament_jonglei_guns_communities,"[disarmament, jonglei, guns, communities, murl...",[The article discusses clashes between the Sou...
213,14,213_ethiopias_somalia_ethiopia_horn,"[ethiopias, somalia, ethiopia, horn, eritrea, ...",[The article discusses accusations of interfer...


In [12]:
# Re-read the untouched article data so the export starts from the original columns
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])

# Combine article summaries with the newly created features.
# The flags are attached by row index instead of by merging on the free-text "summary"
# column: a merge on "summary" duplicates rows whenever two articles share the same
# summary text, whereas an index join always keeps exactly one output row per article.
# NOTE(review): assumes `df` still has the row order/index it had when it was read
# from this same CSV (only columns were added to it) - confirm if rows are ever filtered.
topic_flags = df[["hunger", "refugees", "humanitarian", "conflict"]]
df = original_df.join(topic_flags, how="left")

df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

In [13]:
# Total number of articles
print(len(df))

# Number of articles for which all four topic flags are False
topic_columns = ["hunger", "refugees", "humanitarian", "conflict"]
in_no_category = df[topic_columns].eq(False).all(axis=1)
print(len(df[in_no_category]))

18520
16495


There are a lot of articles that do not get sorted into any of the four categories. So, feel free to change or expand this approach!