# Topic modelling using BERTopic

## Libraries/data required

In [1]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os
from pathlib import Path

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Read the data and perform preprocessing
df = pd.read_csv("C://Users//20203697//Desktop//DC3//JBG060-DC3-Group-12//data//articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe

print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [3]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    # NOTE why is fit_transform called? is this inplace?
    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

In [4]:
#Due to the modularity of the model, there is a lot of randomness that hinders reproducibiity of the model.
#To fight this, set random state in the dimensionality reduction step via the following lines

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [5]:
bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

KeyboardInterrupt: 

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [6]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]
                    
                    In this case a maximum of top_n topics will be found per list element 
                    and subsetted to the top_n most relevant topics.
                    
                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s) 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of the top_n (or less) topics most relevant to the (list of) provided keyword(s)
    '''
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

In [7]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['hunger', 'food insecurity'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["hunger"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

69 0.5281395
182 0.4919537
67 0.4784918
111 0.45269877
85 0.4358937
253 0.4260673
19 0.3588685
165 0.30877727
260 0.3016836
196 0.3007723


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
69,48,69_food_hunger_insecurity_million,"[food, hunger, insecurity, million, levels, cr...",[The article discusses South Sudan's Agricultu...
182,19,182_famine_hunger_starvation_million,"[famine, hunger, starvation, million, somalia,...",[The article discusses the potential for a fam...
67,48,67_fao_food_kits_million,"[fao, food, kits, million, fishing, seeds, liv...",[The article discusses FAO's efforts to provid...
111,32,111_malnutrition_children_nutrition_unicef,"[malnutrition, children, nutrition, unicef, br...",[The article discusses the high rates of acute...
85,40,85_wfp_food_assistance_world,"[wfp, food, assistance, world, million, progra...",[The article discusses the U.S. government's a...
253,11,253_airdrops_wfp_food_maban,"[airdrops, wfp, food, maban, replenish, airdro...",[The article discusses how the World Food Prog...
19,111,19_agriculture_agricultural_food_farmers,"[agriculture, agricultural, food, farmers, far...",[The article discusses the need for cooperatio...
165,23,165_wfp_tons_metric_food,"[wfp, tons, metric, food, sorghum, refugees, h...",[The article discusses the successful delivery...
260,10,260_supplies_unicef_children_sanitation,"[supplies, unicef, children, sanitation, emerg...",[The article discusses the joint emergency res...
196,18,196_prices_price_poverty_inflation,"[prices, price, poverty, inflation, ssp, trade...",[The article discusses the issue of poverty an...


In [9]:
# Get the top 10 topics related to the keywords 'refugees' and 'displaced'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['refugees', 'displaced'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["refugees"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

14 0.68303037
221 0.6466793
249 0.64660645
130 0.6294658
95 0.6261976
143 0.6171924
188 0.59226924
34 0.58013654
227 0.54412246
177 0.5347856


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,124,14_refugees_uganda_refugee_district,"[refugees, uganda, refugee, district, adjumani...",[The article discusses the launch of a regiona...
221,13,221_refugees_ethiopia_number_gambella,"[refugees, ethiopia, number, gambella, camps, ...",[The article discusses the continued rise in t...
249,11,249_israeli_israel_migrants_immigrants,"[israeli, israel, migrants, immigrants, asylum...",[The article discusses an incident where Egypt...
130,28,130_kakuma_camp_refugee_refugees,"[kakuma, camp, refugee, refugees, kenya, camps...",[The article discusses the South Sudan Youth f...
95,37,95_yida_refugees_unhcr_camp,"[yida, refugees, unhcr, camp, arrivals, refuge...",[The article discusses the ongoing war in Sout...
143,26,143_refugees_unhcr_funding_million,"[refugees, unhcr, funding, million, refugee, u...",[The article discusses the efforts made by Sud...
188,19,188_refugees_water_blue_nile,"[refugees, water, blue, nile, unhcr, clean, re...",[The article discusses the worsening humanitar...
34,84,34_displaced_idps_internally_people,"[displaced, idps, internally, people, bases, c...",[The article discusses the high number of inte...
227,12,227_bentiu_base_drinking_overcrowded,"[bentiu, base, drinking, overcrowded, relief, ...",[The article discusses the horrific living con...
177,20,177_civilians_unmiss_bases_refuge,"[civilians, unmiss, bases, refuge, un, displac...",[The article discusses new fighting in South S...


In [10]:
# Get the top 10 topics related to the keyword 'humanitarian'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['humanitarian'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["humanitarian"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

92 0.6493029
2 0.64495003
213 0.63255084
54 0.6100116
143 0.6087105
227 0.6083819
34 0.5986478
177 0.59742427
260 0.5967566
97 0.581107


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,39,92_humanitarian_pibor_jonglei_affected,"[humanitarian, pibor, jonglei, affected, aid, ...",[The article discusses the aid distribution op...
2,227,2_aid_million_humanitarian_assistance,"[aid, million, humanitarian, assistance, fundi...",[The article discusses the US President author...
213,14,213_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[The article discusses a press briefing with t...
54,65,54_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, maban, ki...",[The article discusses the disappearance of si...
143,26,143_refugees_unhcr_funding_million,"[refugees, unhcr, funding, million, refugee, u...",[The article discusses the efforts made by Sud...
227,12,227_bentiu_base_drinking_overcrowded,"[bentiu, base, drinking, overcrowded, relief, ...",[The article discusses the horrific living con...
34,84,34_displaced_idps_internally_people,"[displaced, idps, internally, people, bases, c...",[The article discusses the high number of inte...
177,20,177_civilians_unmiss_bases_refuge,"[civilians, unmiss, bases, refuge, un, displac...",[The article discusses new fighting in South S...
260,10,260_supplies_unicef_children_sanitation,"[supplies, unicef, children, sanitation, emerg...",[The article discusses the joint emergency res...
97,37,97_red_cross_icrc_crescent,"[red, cross, icrc, crescent, ifrc, medical, co...",[The article discusses how the Governor of Jon...


In [11]:
# Get the top 10 topics related to the keywords 'conflict', 'fighting', and 'murder'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['conflict', 'fighting', 'murder'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["conflict"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

169 0.43673888
228 0.4269666
31 0.4068361
118 0.39172322
101 0.3874699
258 0.38515255
102 0.3801176
78 0.3794209
220 0.37818405
119 0.37547877


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
169,21,169_positions_rebels_upper_nassir,"[positions, rebels, upper, nassir, army, truce...",[The article discusses renewed clashes between...
228,12,228_ceasefire_akasoba_addis_ababa,"[ceasefire, akasoba, addis, ababa, violence, c...",[The article discusses the ongoing civil confl...
31,92,31_kordofan_border_blue_nafie,"[kordofan, border, blue, nafie, accusations, d...",[The article discusses the upcoming presidenti...
118,31,118_her_sister_family_death,"[her, sister, family, death, murder, veronika,...",[The article discusses the call made by Bishop...
101,35,101_jonglei_conference_state_peace,"[jonglei, conference, state, peace, communitie...",[The article discusses the Greater Akobo peace...
258,10,258_supporting_denial_rebels_rebel,"[supporting, denial, rebels, rebel, makuei, sp...",[The article discusses the denial by South Sud...
102,35,102_talks_ababa_addis_parties,"[talks, ababa, addis, parties, ethiopia, peace...",[The article discusses the latest round of pea...
78,44,78_abraham_isaiah_awuol_murder,"[abraham, isaiah, awuol, murder, chan, assassi...",[The article discusses the assassination of So...
220,13,220_ethiopias_somalia_horn_eritrea,"[ethiopias, somalia, horn, eritrea, ethiopia, ...",[The article discusses the Ethiopian governmen...
119,31,119_ceasefire_agreement_signing_ababa,"[ceasefire, agreement, signing, ababa, addis, ...",[The article discusses the signing of a ceasef...


In [17]:
# Get the top 10 topics related to the keyword 'climate'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['climate'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["climate"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

29 0.24781422
182 0.24057579
223 0.2018032
193 0.20152958
69 0.19328366
227 0.18134616
89 0.1809628
246 0.1805672
220 0.17768724
101 0.1748128


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29,92,29_flooding_floods_affected_flood,"[flooding, floods, affected, flood, rains, wat...",[The article discusses the catastrophic humani...
182,19,182_famine_hunger_starvation_million,"[famine, hunger, starvation, million, somalia,...",[The article discusses the potential for a fam...
223,13,223_fragile_index_ranked_most,"[fragile, index, ranked, most, fsi, ffp, world...",[The article discusses the release of the Frag...
193,18,193_icglr_great_angola_lakes,"[icglr, great, angola, lakes, burundi, republi...",[The article discusses a meeting of the chiefs...
69,48,69_food_hunger_insecurity_million,"[food, hunger, insecurity, million, levels, cr...",[The article discusses South Sudan's Agricultu...
227,12,227_bentiu_base_drinking_overcrowded,"[bentiu, base, drinking, overcrowded, relief, ...",[The article discusses the horrific living con...
89,40,89_basin_water_irrigation_nile,"[basin, water, irrigation, nile, egypt, resour...",[The article discusses the findings from a new...
246,11,246_mental_health_depression_anxiety,"[mental, health, depression, anxiety, stress, ...",[The article discusses how mental illnesses ar...
220,13,220_ethiopias_somalia_horn_eritrea,"[ethiopias, somalia, horn, eritrea, ethiopia, ...",[The article discusses the Ethiopian governmen...
101,35,101_jonglei_conference_state_peace,"[jonglei, conference, state, peace, communitie...",[The article discusses the Greater Akobo peace...


In [27]:
# Get the top 10 topics related to the keyword 'flood'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['flood'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["flood"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

29 0.533038
131 0.35181606
227 0.34954697
71 0.3189929
188 0.28356898
194 0.2753489
36 0.2605583
89 0.2529
226 0.24744715
10 0.24186808


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29,92,29_flooding_floods_affected_flood,"[flooding, floods, affected, flood, rains, wat...",[The article discusses the catastrophic humani...
131,28,131_water_project_supply_drinking,"[water, project, supply, drinking, clean, sani...",[The article discusses the South Sudan governm...
227,12,227_bentiu_base_drinking_overcrowded,"[bentiu, base, drinking, overcrowded, relief, ...",[The article discusses the horrific living con...
71,47,71_dam_egypt_renaissance_grand,"[dam, egypt, renaissance, grand, gerd, ethiopi...",[The article discusses the ongoing dispute amo...
188,19,188_refugees_water_blue_nile,"[refugees, water, blue, nile, unhcr, clean, re...",[The article discusses the worsening humanitar...
194,18,194_fire_brigade_hotel_damage,"[fire, brigade, hotel, damage, destroyed, occu...",[The article discusses a fire that occurred at...
36,84,36_cholera_outbreak_cases_hygiene,"[cholera, outbreak, cases, hygiene, health, sp...",[The article discusses the outbreak of cholera...
89,40,89_basin_water_irrigation_nile,"[basin, water, irrigation, nile, egypt, resour...",[The article discusses the findings from a new...
226,12,226_village_county_murle_attack,"[village, county, murle, attack, cattle, twic,...",[The article discusses the killing of four peo...
10,158,10_lakes_rumbek_dhuol_governor,"[lakes, rumbek, dhuol, governor, chut, matur, ...",[The article discusses the death of Colonel Yo...


In [32]:
# Get the top 10 topics related to the keyword 'drought'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['drought'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["drought"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

131 0.41914517
29 0.41300887
182 0.32987046
227 0.31524625
89 0.30321428
188 0.2780032
71 0.25939792
36 0.2560544
176 0.25591892
69 0.2557764


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
131,28,131_water_project_supply_drinking,"[water, project, supply, drinking, clean, sani...",[The article discusses the South Sudan governm...
29,92,29_flooding_floods_affected_flood,"[flooding, floods, affected, flood, rains, wat...",[The article discusses the catastrophic humani...
182,19,182_famine_hunger_starvation_million,"[famine, hunger, starvation, million, somalia,...",[The article discusses the potential for a fam...
227,12,227_bentiu_base_drinking_overcrowded,"[bentiu, base, drinking, overcrowded, relief, ...",[The article discusses the horrific living con...
89,40,89_basin_water_irrigation_nile,"[basin, water, irrigation, nile, egypt, resour...",[The article discusses the findings from a new...
188,19,188_refugees_water_blue_nile,"[refugees, water, blue, nile, unhcr, clean, re...",[The article discusses the worsening humanitar...
71,47,71_dam_egypt_renaissance_grand,"[dam, egypt, renaissance, grand, gerd, ethiopi...",[The article discusses the ongoing dispute amo...
36,84,36_cholera_outbreak_cases_hygiene,"[cholera, outbreak, cases, hygiene, health, sp...",[The article discusses the outbreak of cholera...
176,20,176_electricity_power_grid_solar,"[electricity, power, grid, solar, distribution...",[The article discusses AfDB's approval of a $1...
69,48,69_food_hunger_insecurity_million,"[food, hunger, insecurity, million, levels, cr...",[The article discusses South Sudan's Agricultu...


In [35]:
# Get the top 10 topics related to the keyword 'war'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['war'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["war"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

228 0.37526667
160 0.37088418
173 0.36752376
169 0.3633321
124 0.36224234
31 0.35796228
18 0.35751647
51 0.34704417
220 0.3466307
119 0.3442785


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
228,12,228_ceasefire_akasoba_addis_ababa,"[ceasefire, akasoba, addis, ababa, violence, c...",[The article discusses the ongoing civil confl...
160,23,160_disarmament_jonglei_guns_murle,"[disarmament, jonglei, guns, murle, collected,...",[The article discusses clashes between the Sou...
173,21,173_oil_fields_riek_machar,"[oil, fields, riek, machar, machars, loyal, re...",[The article discusses the major offensive lau...
169,21,169_positions_rebels_upper_nassir,"[positions, rebels, upper, nassir, army, truce...",[The article discusses renewed clashes between...
124,30,124_dinka_conflict_ethnic_machar,"[dinka, conflict, ethnic, machar, riek, nuer, ...",[The article discusses the call from internall...
31,92,31_kordofan_border_blue_nafie,"[kordofan, border, blue, nafie, accusations, d...",[The article discusses the upcoming presidenti...
18,111,18_peace_peacebuilding_society_civil,"[peace, peacebuilding, society, civil, violenc...",[The article discusses the debate over civil s...
51,68,51_lra_kony_lords_resistance,"[lra, kony, lords, resistance, joseph, central...",[The article discusses the end of the six-year...
220,13,220_ethiopias_somalia_horn_eritrea,"[ethiopias, somalia, horn, eritrea, ethiopia, ...",[The article discusses the Ethiopian governmen...
119,31,119_ceasefire_agreement_signing_ababa,"[ceasefire, agreement, signing, ababa, addis, ...",[The article discusses the signing of a ceasef...


In [39]:
# Get the top 10 topics related to the keyword 'fuel'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['fuel'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["fuel"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

232 0.4664012
235 0.29718953
138 0.27914804
48 0.24188197
117 0.22917321
120 0.22770771
47 0.22561127
194 0.21550217
59 0.21280524
115 0.200598


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
232,12,232_fuel_petrol_petroleum_shortage,"[fuel, petrol, petroleum, shortage, trucks, ha...",[The article discusses how the ministry of pet...
235,12,235_exploration_oil_barrels_refinery,"[exploration, oil, barrels, refinery, energy, ...",[The article discusses the signing of an explo...
138,27,138_production_oil_barrels_petroleum,"[production, oil, barrels, petroleum, day, com...",[The article discusses South Sudan's plans to ...
48,72,48_oil_pipeline_crude_decision,"[oil, pipeline, crude, decision, shut, product...",[The article discusses South Sudan's decision ...
117,31,117_revenue_oil_sector_transparency,"[revenue, oil, sector, transparency, petroleum...",[The article discusses a conference on Oil Rev...
120,30,120_pipeline_lamu_port_oil,"[pipeline, lamu, port, oil, construction, keny...",[The article discusses South Sudan's plans to ...
47,73,47_oil_production_countries_cooperation,"[oil, production, countries, cooperation, agre...",[The article discusses oil talks between Sudan...
194,18,194_fire_brigade_hotel_damage,"[fire, brigade, hotel, damage, destroyed, occu...",[The article discusses a fire that occurred at...
59,59,59_fees_barrel_oil_transit,"[fees, barrel, oil, transit, per, fee, through...",[The article discusses the disagreement betwee...
115,31,115_drivers_highway_truck_transporters,"[drivers, highway, truck, transporters, trucks...",[The article discusses how Kenyan truck driver...


In [36]:
original_df = pd.read_csv("C://Users//20203697//Desktop//DC3//JBG060-DC3-Group-12//data//articles_summary_cleaned.csv", parse_dates=["date"])

# Combine article summaries with the newly created features
df = original_df.merge(
    df[["summary", "hunger", "refugees", "humanitarian", "conflict", "climate", "flood", "drought", "war", "fuel"]],
    how="left",
    left_on="summary",
    right_on="summary",
)

df.to_csv("C://Users//20203697//Desktop//DC3//JBG060-DC3-Group-12//data//articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

In [38]:
print(len(df))
print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False) & (df["climate"] == False) & (df["flood"] == False) & (df["drought"]==False) & (df["war"] == False)]))

18520
16286


There are a lot of articles that do not get sorted into either of the categories. So, feel free to change or expand this approach!