# Topic modelling using BERTopic

## Libraries/data required

In [2]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import WordNetLemmatizer
import re


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [3]:
# Read the data and perform preprocessing

df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Sentiment analysis

In [4]:
# Read data into dataframe
df_senti = pd.read_csv("C://Users//20203697//Desktop//DC3//JBG060-DC3-Group-12//data//articles_summary_cleaned.csv", parse_dates=["date"]) 

### Preprocessing functions

In [5]:
# Preprocessing functions
def preprocess_articles(data,name):
    # Proprocessing the data
    data[name]=data[name].str.lower()
    # Code to remove the Hashtags from the text
    data[name]=data[name].apply(lambda x:re.sub(r'\B#\S+','',x))
    # Code to remove the links from the text
    data[name]=data[name].apply(lambda x:re.sub(r"http\S+", "", x))
    # Code to remove the Special characters from the text 
    data[name]=data[name].apply(lambda x:' '.join(re.findall(r'\w+', x)))
    # Code to substitute the multiple spaces with single spaces
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
    # Code to remove all the single characters in the text
    data[name]=data[name].apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

def rem_stopwords_tokenize(data,name):
      
    def getting(sen):
        example_sent = sen
        
        filtered_sentence = [] 

        stop_words = set(stopwords.words('english')) 

        word_tokens = word_tokenize(example_sent) 
        
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 
        
        return filtered_sentence
    # Using "getting(sen)" function to append edited sentence to data
    x=[]
    for i in data[name].values:
        x.append(getting(i))
    data[name]=x

lemmatizer = WordNetLemmatizer()
def Lemmatization(data,name):
    def getting2(sen):
        example = sen
        output_sentence =[]
        word_tokens2 = word_tokenize(example)
        lemmatized_output = [lemmatizer.lemmatize(w) for w in word_tokens2]
        
        # Remove characters which have length less than 2  
        without_single_chr = [word for word in lemmatized_output if len(word) > 2]
        # Remove numbers
        cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]
        
        return cleaned_data_title
    # Using "getting2(sen)" function to append edited sentence to data
    x=[]
    for i in data[name].values:
        x.append(getting2(i))
    data[name]=x

def make_sentences(data,name):
    data[name]=data[name].apply(lambda x:' '.join([i+' ' for i in x]))
    # Removing double spaces if created
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

# Functions from https://www.kaggle.com/code/yommnamohamed/sentiment-analysis-using-sentiwordnet

### Preprocessing

In [6]:
# Copy article paragraphs to edit and make articles without stopwords
Edited_article = df_senti['summary'].copy()
df_senti['Article_without_stopwords'] = Edited_article
preprocess_articles(df_senti,'Article_without_stopwords')
rem_stopwords_tokenize(df_senti,'Article_without_stopwords')
make_sentences(df_senti, 'Article_without_stopwords')

# Copy article paragraphs without stopwords and make articles with lemmatized words
final_Edit = df_senti['Article_without_stopwords'].copy()
df_senti["After_lemmatization"] = final_Edit
# Using the Lemmatization function to lemmatize the data
Lemmatization(df_senti,'After_lemmatization')
# Converting all the texts back to sentences
make_sentences(df_senti,'After_lemmatization')


In [7]:
df_senti

Unnamed: 0,summary,date,location_article,lat,lng,Article_without_stopwords,After_lemmatization
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,article discusses passing new constitution rep...,article discus passing new constitution republ...
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,article discusses military actions taken khart...,article discus military action taken khartoum ...
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,article discusses signing offramework agreemen...,article discus signing offramework agreement s...
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,article discusses upcoming independence south ...,article discus upcoming independence south sud...
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,article discusses need south sudan attract dir...,article discus need south sudan attract direct...
...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,article discusses successful delivery twins mi...,article discus successful delivery twin midwif...
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,article discusses bombing forced evacuation 60...,article discus bombing forced evacuation major...
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,article discusses prime minister abiy ahmed ca...,article discus prime minister abiy ahmed calle...
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,article discusses collapse ofcommercial buildi...,article discus collapse ofcommercial building ...


## Get sentiment scores

In [8]:
pos=neg=obj=count=0

postagging = []

for article in df_senti['After_lemmatization']:
    lists = word_tokenize(article)
    postagging.append(nltk.pos_tag(lists))

df_senti['pos_tags'] = postagging

def penn_to_wn(tag):    
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None


# Returns list of pos-neg and objective score. But returns empty list if not present in senti wordnet.
def get_sentiment(word,tag):
    wn_tag = penn_to_wn(tag)
    
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    #Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    #Synset is a special kind of a simple interface that is present in NLTK to look up words in WordNet. 
    #Synset instances are the groupings of synonymous words that express the same concept. 
    #Some of the words have only one Synset and some have several.
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]
senti_score = []

for pos_val in df_senti['pos_tags']:
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for score in senti_val:
        try:
            pos = pos + score[1]  #positive score is stored at 2nd position
            neg = neg + score[2]  #negative score is stored at 3rd position
        except:
            continue
    senti_score.append(pos - neg)
    pos=neg=0    
    
df_senti['senti_score'] = senti_score
print(df_senti['senti_score'])


0        1.500
1       -2.125
2        1.750
3        0.500
4        0.500
         ...  
18515    4.750
18516    0.625
18517    0.750
18518   -0.875
18519    0.625
Name: senti_score, Length: 18520, dtype: float64


In [9]:
# Save sentiment of articles to csv
df_senti.to_csv("data/articles_sentiment.csv")

## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [10]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

Batches: 100%|██████████| 579/579 [13:44<00:00,  1.42s/it]
2023-09-28 11:35:31,073 - BERTopic - Transformed documents to Embeddings
2023-09-28 11:36:07,694 - BERTopic - Reduced dimensionality
2023-09-28 11:37:28,391 - BERTopic - Clustered reduced embeddings


In [11]:
#Due to the modularity of the model, there is a lot of randomness that hinders reproducibiity of the model.
#To fight this, you can for example set random state in the dimensionality reduction step via the following lines 
#or explore a different approach

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [12]:
# bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [13]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]
                    
                    In this case a maximum of top_n topics will be found per list element 
                    and subsetted to the top_n most relevant topics.
                    
                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s) 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of the top_n (or less) topics most relevant to the (list of) provided keyword(s)
    '''
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

In [14]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['hunger', 'food insecurity'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["hunger"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

21 0.5058255
70 0.4754068
109 0.45269877
223 0.4301224
53 0.42781878
20 0.36943847
217 0.34289846
143 0.3093038
206 0.2891382
149 0.2843095


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21,107,21_food_famine_hunger_million,"[food, famine, hunger, million, crisis, insecu...",[The article discusses how extreme hunger is a...
70,48,70_fao_food_seeds_kits,"[fao, food, seeds, kits, fishing, livelihood, ...",[The article discusses FAO's efforts to provid...
109,32,109_malnutrition_children_nutrition_unicef,"[malnutrition, children, nutrition, unicef, br...",[The article discusses the high rates of acute...
223,13,223_wfp_airdrops_food_maban,"[wfp, airdrops, food, maban, yida, drops, metr...",[The article discusses the United Nations Worl...
53,57,53_wfp_food_programme_assistance,"[wfp, food, programme, assistance, world, mill...",[The article discusses how USAID has provided ...
20,107,20_agriculture_agricultural_farmers_food,"[agriculture, agricultural, farmers, food, pro...",[The article discusses the celebration of the ...
217,14,217_children_unicef_million_malnutrition,"[children, unicef, million, malnutrition, cris...",[The article discusses how hundreds of thousan...
143,23,143_prices_traders_price_flour,"[prices, traders, price, flour, market, povert...",[The article discusses the issue of poverty an...
206,15,206_tons_metric_corridor_food,"[tons, metric, corridor, food, sorghum, humani...",[The article discusses the arrival of 18 truck...
149,23,149_livestock_animal_cattle_animals,"[livestock, animal, cattle, animals, diseases,...",[The article discusses how South Sudan leads i...


In [15]:
# Get the top 10 topics related to the keywords 'refugees' and 'displaced'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['refugees', 'displaced'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["refugees"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

1 0.68447477
241 0.64660645
22 0.5758951
218 0.55605793
74 0.5270381
204 0.5249783
38 0.51201504
164 0.50009775
232 0.49630988
41 0.49117184


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,508,1_refugees_refugee_unhcr_camp,"[refugees, refugee, unhcr, camp, camps, uganda...",[The article discusses the launch of a regiona...
241,11,241_israeli_israel_migrants_immigrants,"[israeli, israel, migrants, immigrants, asylum...",[The article discusses an incident where Egypt...
22,104,22_displaced_idps_people_un,"[displaced, idps, people, un, internally, mala...",[The article discusses the continuing clashes ...
218,14,218_bentiu_flooding_dire_base,"[bentiu, flooding, dire, base, sanitation, dri...",[The article discusses the dire situation in a...
74,45,74_darfur_unamid_300000_chad,"[darfur, unamid, 300000, chad, displaced, huma...",[The article discusses a news report by the Ne...
204,15,204_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, poc, displaced,...","[The article discusses the situation in Juba, ..."
38,76,38_returnees_kosti_iom_repatriation,"[returnees, kosti, iom, repatriation, stranded...",[The article discusses the arrival of the last...
164,20,164_kenyans_evacuation_kenyan_evacuated,"[kenyans, evacuation, kenyan, evacuated, citiz...",[The article discusses the evacuation of Kenya...
232,12,232_malakal_poc_msf_base,"[malakal, poc, msf, base, site, shilluk, prote...",[The article discusses the ongoing relocation ...
41,70,41_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, killing, ...",[The article discusses the release of ten aid ...


In [16]:
# Get the top 10 topics related to the keyword 'humanitarian'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['humanitarian'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["humanitarian"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

6 0.64275354
108 0.64151883
215 0.6407988
22 0.61294293
41 0.61222845
218 0.6093367
249 0.5988955
204 0.59769297
1 0.58603036
92 0.58243316


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,181,6_million_humanitarian_aid_assistance,"[million, humanitarian, aid, assistance, emerg...",[The article discusses the commitment of the U...
108,33,108_humanitarian_jonglei_pibor_affected,"[humanitarian, jonglei, pibor, affected, aid, ...",[The article discusses the return of humanitar...
215,14,215_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, upda...",[The article discusses a press conference with...
22,104,22_displaced_idps_people_un,"[displaced, idps, people, un, internally, mala...",[The article discusses the continuing clashes ...
41,70,41_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, killing, ...",[The article discusses the release of ten aid ...
218,14,218_bentiu_flooding_dire_base,"[bentiu, flooding, dire, base, sanitation, dri...",[The article discusses the dire situation in a...
249,10,249_ukraine_million_path_faith,"[ukraine, million, path, faith, peace, aid, ne...",[The article discusses the commitment of warri...
204,15,204_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, poc, displaced,...","[The article discusses the situation in Juba, ..."
1,508,1_refugees_refugee_unhcr_camp,"[refugees, refugee, unhcr, camp, camps, uganda...",[The article discusses the launch of a regiona...
92,39,92_red_cross_icrc_crescent,"[red, cross, icrc, crescent, ifrc, medical, ca...",[The article discusses how the Governor of Jon...


In [17]:
# Get the top 10 topics related to the keywords 'conflict', 'fighting', and 'murder'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['conflict', 'fighting', 'murder'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["conflict"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

198 0.44643044
133 0.44176638
63 0.40496123
65 0.39442405
216 0.3852977
122 0.37921542
87 0.3762852
90 0.3736592
205 0.3733245
182 0.37059677


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
198,16,198_positions_truce_rebels_army,"[positions, truce, rebels, army, attacks, oppo...",[The article discusses how South Sudan has acc...
133,25,133_her_murder_shot_sister,"[her, murder, shot, sister, veronika, she, bur...",[The article discusses the murder of Kenyan bu...
63,52,63_border_kordofan_blue_both,"[border, kordofan, blue, both, negotiations, h...",[The article discusses fighting in Southern Ko...
65,50,65_jonglei_conference_state_peace,"[jonglei, conference, state, peace, communitie...",[The article discusses a political reconciliat...
216,14,216_supporting_army_denial_kordofan,"[supporting, army, denial, kordofan, rebels, w...",[The article discusses South Sudan's spokesper...
122,27,122_disarmament_jonglei_guns_exercise,"[disarmament, jonglei, guns, exercise, murle, ...",[The article discusses clashes between the Sou...
87,40,87_abraham_isaiah_murder_assassination,"[abraham, isaiah, murder, assassination, awuol...",[The article discusses the assassination of So...
90,39,90_nuer_dinka_ethnic_conflict,"[nuer, dinka, ethnic, conflict, machar, riek, ...",[The article discusses the call from internall...
205,15,205_kiir_salva_kiirs_conflict,"[kiir, salva, kiirs, conflict, riek, machar, p...",[The article discusses actor George Clooney's ...
182,18,182_ethiopias_ethiopia_somalia_horn,"[ethiopias, ethiopia, somalia, horn, eritrea, ...",[The article discusses accusations of interfer...


In [18]:
# Combine article summaries with the newly created features and sentiment

df_senti_topic = pd.merge(df, df_senti[['summary', 'senti_score']], on = ['summary'])
df_senti_topic.to_csv("data/articles_senti_topics.csv", index=False) # Save DataFrame to articles_senti_topics.csv
df_senti_topic

Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,humanitarian,conflict,senti_score
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,False,False,False,False,1.500
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False,-2.125
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False,1.750
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False,0.500
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,False,False,False,False,0.500
...,...,...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,False,False,False,False,4.750
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,False,False,False,False,0.625
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,False,False,False,True,0.750
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,False,False,False,True,-0.875


In [19]:
print(len(df))
print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False)]))

18520
16633


There are a lot of articles that do not get sorted into either of the categories. So, feel free to change or expand this approach!