# Topic modelling using BERTopic

## Libraries/data required

In [162]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import WordNetLemmatizer
import re


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\20203697\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [163]:
# Read the data and perform preprocessing

df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Sentiment analysis

In [164]:
# Read data into dataframe
df_senti = pd.read_csv("C://Users//20203697//Desktop//DC3//JBG060-DC3-Group-12//data//articles_summary_cleaned.csv", parse_dates=["date"]) 


### Preprocessing functions

In [165]:
# Preprocessing functions
def preprocess_articles(data,name):
    # Proprocessing the data
    data[name]=data[name].str.lower()
    # Code to remove the Hashtags from the text
    data[name]=data[name].apply(lambda x:re.sub(r'\B#\S+','',x))
    # Code to remove the links from the text
    data[name]=data[name].apply(lambda x:re.sub(r"http\S+", "", x))
    # Code to remove the Special characters from the text 
    data[name]=data[name].apply(lambda x:' '.join(re.findall(r'\w+', x)))
    # Code to substitute the multiple spaces with single spaces
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
    # Code to remove all the single characters in the text
    data[name]=data[name].apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

def rem_stopwords_tokenize(data,name):
      
    def getting(sen):
        example_sent = sen
        
        filtered_sentence = [] 

        stop_words = set(stopwords.words('english')) 

        word_tokens = word_tokenize(example_sent) 
        
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 
        
        return filtered_sentence
    # Using "getting(sen)" function to append edited sentence to data
    x=[]
    for i in data[name].values:
        x.append(getting(i))
    data[name]=x

lemmatizer = WordNetLemmatizer()
def Lemmatization(data,name):
    def getting2(sen):
        example = sen
        output_sentence =[]
        word_tokens2 = word_tokenize(example)
        lemmatized_output = [lemmatizer.lemmatize(w) for w in word_tokens2]
        
        # Remove characters which have length less than 2  
        without_single_chr = [word for word in lemmatized_output if len(word) > 2]
        # Remove numbers
        cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]
        
        return cleaned_data_title
    # Using "getting2(sen)" function to append edited sentence to data
    x=[]
    for i in data[name].values:
        x.append(getting2(i))
    data[name]=x

def make_sentences(data,name):
    data[name]=data[name].apply(lambda x:' '.join([i+' ' for i in x]))
    # Removing double spaces if created
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

# Functions from https://www.kaggle.com/code/yommnamohamed/sentiment-analysis-using-sentiwordnet

### Preprocessing

In [166]:
# Copy article paragraphs to edit and make articles without stopwords
Edited_article = df_senti['summary'].copy()
df_senti['Article_without_stopwords'] = Edited_article
preprocess_articles(df_senti,'Article_without_stopwords')
rem_stopwords_tokenize(df_senti,'Article_without_stopwords')
make_sentences(df_senti, 'Article_without_stopwords')

# Copy article paragraphs without stopwords and make articles with lemmatized words
final_Edit = df_senti['Article_without_stopwords'].copy()
df_senti["After_lemmatization"] = final_Edit
# Using the Lemmatization function to lemmatize the data
Lemmatization(df_senti,'After_lemmatization')
# Converting all the texts back to sentences
make_sentences(df_senti,'After_lemmatization')


In [167]:
df_senti

Unnamed: 0,summary,date,location_article,lat,lng,Article_without_stopwords,After_lemmatization
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,article discusses passing new constitution rep...,article discus passing new constitution republ...
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,article discusses military actions taken khart...,article discus military action taken khartoum ...
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,article discusses signing offramework agreemen...,article discus signing offramework agreement s...
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,article discusses upcoming independence south ...,article discus upcoming independence south sud...
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,article discusses need south sudan attract dir...,article discus need south sudan attract direct...
...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,article discusses successful delivery twins mi...,article discus successful delivery twin midwif...
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,article discusses bombing forced evacuation 60...,article discus bombing forced evacuation major...
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,article discusses prime minister abiy ahmed ca...,article discus prime minister abiy ahmed calle...
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,article discusses collapse ofcommercial buildi...,article discus collapse ofcommercial building ...


## Get sentiment scores

In [168]:
pos=neg=obj=count=0

postagging = []

for article in df_senti['After_lemmatization']:
    lists = word_tokenize(article)
    postagging.append(nltk.pos_tag(lists))

df_senti['pos_tags'] = postagging

def penn_to_wn(tag):    
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None


# Returns list of pos-neg and objective score. But returns empty list if not present in senti wordnet.
def get_sentiment(word,tag):
    wn_tag = penn_to_wn(tag)
    
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    #Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    #Synset is a special kind of a simple interface that is present in NLTK to look up words in WordNet. 
    #Synset instances are the groupings of synonymous words that express the same concept. 
    #Some of the words have only one Synset and some have several.
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]
senti_score = []

for pos_val in df_senti['pos_tags']:
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for score in senti_val:
        try:
            pos = pos + score[1]  #positive score is stored at 2nd position
            neg = neg + score[2]  #negative score is stored at 3rd position
        except:
            continue
    senti_score.append(pos - neg)
    pos=neg=0    
    
df_senti['senti_score'] = senti_score
print(df_senti['senti_score'])


0        1.500
1       -2.125
2        1.750
3        0.500
4        0.500
         ...  
18515    4.750
18516    0.625
18517    0.750
18518   -0.875
18519    0.625
Name: senti_score, Length: 18520, dtype: float64


In [169]:
df_senti.to_csv("data/articles_sentiment.csv")
df_senti[['summary', 'senti_score']]

Unnamed: 0,summary,senti_score
0,The article discusses the passing of the new C...,1.500
1,The article discusses the military actions tak...,-2.125
2,The article discusses the signing of a Framewo...,1.750
3,The article discusses the upcoming independenc...,0.500
4,The article discusses the need for South Sudan...,0.500
...,...,...
18515,The article discusses the successful delivery ...,4.750
18516,The article discusses the bombing and forced e...,0.625
18517,The article discusses how Prime Minister Abiy ...,0.750
18518,The article discusses the collapse of a commer...,-0.875


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [170]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

In [171]:
#Due to the modularity of the model, there is a lot of randomness that hinders reproducibiity of the model.
#To fight this, you can for example set random state in the dimensionality reduction step via the following lines 
#or explore a different approach

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [172]:
# bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [173]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]
                    
                    In this case a maximum of top_n topics will be found per list element 
                    and subsetted to the top_n most relevant topics.
                    
                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s) 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of the top_n (or less) topics most relevant to the (list of) provided keyword(s)
    '''
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

In [174]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['hunger', 'food insecurity'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["hunger"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

20 0.5041958
75 0.48639947
118 0.45192146
235 0.43502936
80 0.4321677
19 0.3603521
138 0.3305999
164 0.32937902
173 0.3139181
161 0.28486472


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,107,20_food_famine_hunger_million,"[food, famine, hunger, million, insecurity, cr...",[The article discusses the unprecedented level...
75,43,75_fao_food_kits_livelihood,"[fao, food, kits, livelihood, seeds, million, ...",[The article discusses the dire food insecurit...
118,29,118_malnutrition_children_unicef_nutrition,"[malnutrition, children, unicef, nutrition, br...",[The article discusses the launch of a mass mo...
235,12,235_wfp_airdrops_food_maban,"[wfp, airdrops, food, maban, yida, drops, metr...",[The article discusses the UN World Food Progr...
80,42,80_wfp_food_assistance_world,"[wfp, food, assistance, world, programme, wfps...",[The article discusses how USAID has provided ...
19,107,19_agriculture_agricultural_farmers_food,"[agriculture, agricultural, farmers, food, pro...",[The article discusses the need for cooperatio...
138,26,138_wfp_tons_food_metric,"[wfp, tons, food, metric, refugees, humanitari...",[The article discusses the opening of a new co...
164,22,164_children_unicef_million_malnutrition,"[children, unicef, million, malnutrition, cris...",[The article discusses the warning from the UN...
173,20,173_prices_price_traders_poverty,"[prices, price, traders, poverty, market, ssp,...",[The article discusses the issue of poverty an...
161,22,161_livestock_animal_cattle_fisheries,"[livestock, animal, cattle, fisheries, disease...",[The article discusses the prevalence of East ...


In [175]:
# Get the top 10 topics related to the keywords 'refugees' and 'displaced'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['refugees', 'displaced'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["refugees"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

12 0.68977296
38 0.6648527
243 0.65658164
123 0.6229372
171 0.6137617
151 0.58424604
27 0.57889175
205 0.5399327
254 0.52582836
41 0.51459694


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,132,12_refugees_uganda_refugee_district,"[refugees, uganda, refugee, district, adjumani...",[The article discusses the high levels of disp...
38,79,38_refugees_unhcr_yida_camp,"[refugees, unhcr, yida, camp, refugee, arrival...",[The article discusses renewed fighting betwee...
243,12,243_israeli_israel_immigrants_migrants,"[israeli, israel, immigrants, migrants, asylum...",[The article discusses an incident where Egypt...
123,28,123_refugees_unhcr_refugee_funding,"[refugees, unhcr, refugee, funding, million, a...",[The article discusses the urgent need for fun...
171,20,171_kakuma_camp_refugee_kenya,"[kakuma, camp, refugee, kenya, refugees, centr...",[The article discusses the UNFPA-supported Kak...
151,24,151_refugees_nile_water_unhcr,"[refugees, nile, water, unhcr, blue, refugee, ...",[The article discusses the worsening condition...
27,96,27_displaced_idps_internally_people,"[displaced, idps, internally, people, bases, d...",[The article discusses the high number of inte...
205,16,205_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, refuge, displac...",[The article discusses how hundreds of civilia...
254,10,254_bentiu_drinking_rainy_base,"[bentiu, drinking, rainy, base, water, positio...",[The article discusses the alarming number of ...
41,76,41_returnees_kosti_iom_repatriation,"[returnees, kosti, iom, repatriation, migratio...",[The article discusses the expected arrival of...


In [176]:
# Get the top 10 topics related to the keyword 'humanitarian'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['humanitarian'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["humanitarian"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

105 0.64876795
4 0.6443301
212 0.6376411
50 0.61242604
123 0.60754377
27 0.60434043
254 0.59778774
205 0.5931656
164 0.58671415
86 0.58494776


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,35,105_humanitarian_pibor_jonglei_relief,"[humanitarian, pibor, jonglei, relief, affecte...",[The article discusses the aid distribution op...
4,222,4_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, emerg...",[The article discusses the commitment of the U...
212,15,212_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[The article discusses a press conference with...
50,65,50_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, killing, ...",[The article discusses the disappearance of si...
123,28,123_refugees_unhcr_refugee_funding,"[refugees, unhcr, refugee, funding, million, a...",[The article discusses the urgent need for fun...
27,96,27_displaced_idps_internally_people,"[displaced, idps, internally, people, bases, d...",[The article discusses the high number of inte...
254,10,254_bentiu_drinking_rainy_base,"[bentiu, drinking, rainy, base, water, positio...",[The article discusses the alarming number of ...
205,16,205_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, refuge, displac...",[The article discusses how hundreds of civilia...
164,22,164_children_unicef_million_malnutrition,"[children, unicef, million, malnutrition, cris...",[The article discusses the warning from the UN...
86,40,86_red_cross_icrc_crescent,"[red, cross, icrc, crescent, ifrc, medical, ca...",[The article discusses how the Governor of Jon...


In [177]:
# Get the top 10 topics related to the keywords 'conflict', 'fighting', and 'murder'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['conflict', 'fighting', 'murder'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["conflict"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

160 0.45053893
259 0.42847627
83 0.41210556
210 0.38839823
116 0.38701126
121 0.37882677
82 0.37860534
93 0.37774906
133 0.3762521
145 0.3740555


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
160,22,160_her_murder_shot_sister,"[her, murder, shot, sister, veronika, killed, ...",[The article discusses the call made by Bishop...
259,10,259_positions_violating_upper_army,"[positions, violating, upper, army, opposition...",[The article discusses accusations from the So...
83,41,83_kordofan_blue_border_between,"[kordofan, blue, border, between, negotiations...",[The article discusses the increasing military...
210,15,210_supporting_army_kordofan_khartoum,"[supporting, army, kordofan, khartoum, war, to...",[The article discusses South Sudan's spokesper...
116,30,116_talks_addis_ababa_ethiopia,"[talks, addis, ababa, ethiopia, peace, parties...",[The article discusses the latest round of pea...
121,28,121_conflict_dinka_machar_riek,"[conflict, dinka, machar, riek, nuer, displace...",[The article discusses the ongoing conflict in...
82,41,82_abraham_isaiah_murder_awuol,"[abraham, isaiah, murder, awuol, assassination...",[The article discusses the assassination of So...
93,38,93_jonglei_conference_communities_peace,"[jonglei, conference, communities, peace, stat...",[The article discusses the Greater Akobo peace...
133,27,133_ceasefire_signing_agreement_ababa,"[ceasefire, signing, agreement, ababa, addis, ...",[The article discusses the signing of a ceasef...
145,25,145_bahr_ghazal_el_spla,"[bahr, ghazal, el, spla, northern, army, attac...",[The article discusses the South Sudan cabinet...


In [178]:
df

Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,humanitarian,conflict
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,False,False,False,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,False,False,False,False
...,...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,False,False,False,False
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,False,False,False,False
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,False,False,False,False
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,False,False,False,True


In [181]:
# Combine article summaries with the newly created features

df_senti_topic = pd.merge(df, df_senti[['summary', 'senti_score']], on = ['summary'])
df_senti_topic.to_csv("data/articles_senti_topics.csv", index=False) # Save DataFrame to articles_senti_topics.csv
df_senti_topic

Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,humanitarian,conflict,senti_score
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,False,False,False,False,1.500
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False,-2.125
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False,1.750
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False,0.500
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,False,False,False,False,0.500
...,...,...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,False,False,False,False,4.750
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,False,False,False,False,0.625
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,False,False,False,False,0.750
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,False,False,False,True,-0.875


In [180]:
print(len(df))
print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False)]))

18520
16943


There are a lot of articles that do not get sorted into either of the categories. So, feel free to change or expand this approach!