# Topic modelling using BERTopic

## Libraries/data required

In [2]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

In [3]:
# Read the data and perform preprocessing

df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [4]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

In [5]:
#Due to the modularity of the model, there is a lot of randomness that hinders reproducibiity of the model.
#To fight this, you can for example set random state in the dimensionality reduction step via the following lines 
#or explore a different approach

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [6]:
# bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [7]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]
                    
                    In this case a maximum of top_n topics will be found per list element 
                    and subsetted to the top_n most relevant topics.
                    
                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s) 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of the top_n (or less) topics most relevant to the (list of) provided keyword(s)
    '''
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

There are a lot of articles that do not get sorted into either of the categories. So, feel free to change or expand this approach!

Top 10 categories for food related categories : food insecurity previously 370 now 553


In [8]:
import pandas as pd
df_newtopics = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
df_newtopics

Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250
...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970


In [9]:
food_crisis_keywords = [
    "Food crisis", "Hunger", "Starvation", "Malnutrition", 
    "Famine", "Food scarcity", "Food insecurity", "Undernourished", 
    "Food shortage", "Hunger strike", "Food aid", "Emergency relief", 
    "Nutritional deficiency", "Food distribution", "Food bank", 
    "Food desert", "Agricultural collapse", "Rising food prices", 
    "Food rationing"
]

In [10]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=food_crisis_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["hunger"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['hunger']==True].count()
df_newtopics



85 0.630256
17 0.6161678
80 0.5772756
181 0.57381785
115 0.5703357
168 0.55105084
27 0.51323843
228 0.46964934
226 0.46925715
3 0.46828657


Unnamed: 0,summary,date,location_article,lat,lng,hunger
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,False
...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,False
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,False
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,False
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,False


Top 10 categories for conflict and violence result previously 223 now 396

In [11]:
conflict_and_violence_keywords = [
    "Conflict", "Violence", "War", "Battle", "Skirmish", "Attack", 
    "Terrorism", "Militant", "Insurgency", "Riot", "Clashes", "Strife",
    "Aggression", "Hostilities", "Bombing", "Shooting", "Casualties", 
    "Armed", "Rebellion", "Assault", "Combat", "Siege", "Civil war", 
    "Ethnic tension", "Genocide", "Massacre", "Hostage", "Guerrilla", 
    "Coup", "Uprising"
]


In [12]:
# Get the top 10 topics related to the keywords conflict and violence

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=conflict_and_violence_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["conflict"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['conflict']==True].count()


196 0.5245111
70 0.5057998
172 0.5053438
195 0.5013187
106 0.48698074
53 0.47766948
116 0.47572848
218 0.4688604
60 0.4616946
187 0.45715407


summary             329
date                329
location_article    329
lat                 329
lng                 329
hunger              329
conflict            329
dtype: int64

Top 10 categories for humanitarian 634 then 643 now

In [13]:
humanitarian_aid_keywords = [
    "Food aid", "Emergency relief", "Aid convoy", "Donations", 
    "Non-governmental organizations", "Charities", "Intervention", 
    "Supplies", "Medical aid", "Humanitarian", "Help"
]

In [14]:
# Get the top 10 topics related to the keywords humanitarian aid

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=humanitarian_aid_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["humanitarian"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['humanitarian']==True].count()
#list(df_newtopics['summary'][df_newtopics['humanitarian']==True])


86 0.6475625
3 0.64314044
208 0.64294755
85 0.630256
53 0.6127999
28 0.60199976
228 0.5950663
114 0.5948111
186 0.5924746
2 0.5801154


summary             819
date                819
location_article    819
lat                 819
lng                 819
hunger              819
conflict            819
humanitarian        819
dtype: int64

Top 10 categories for forced displacements / refugees then now 493

In [15]:
forced_displacements_keywords = [
    "Refugees", "Internally displaced", "Eviction", "Migration",
    "Relocation", "Camp", "Asylum seekers", "Exodus", 
    "Population displacement", "Border crossing"
]

In [16]:
# Get the top 10 topics related to the keywords refugees

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=forced_displacements_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["refugees"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
#df_newtopics[df_newtopics['refugees']==True].count()



15 0.6876225
2 0.6605958
241 0.64660645
138 0.6259512
239 0.5960009
28 0.57727945
186 0.54077786
228 0.5366763
47 0.5160629
130 0.5128032


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15,137,15_refugees_uganda_refugee_district,"[refugees, uganda, refugee, district, adjumani...",[The article discusses the launch of a regiona...
2,285,2_refugees_unhcr_refugee_nile,"[refugees, unhcr, refugee, nile, yida, camp, c...",[The article discusses the increasing number o...
241,11,241_israeli_israel_migrants_immigrants,"[israeli, israel, migrants, immigrants, asylum...",[The article discusses an incident where Egypt...
138,27,138_kakuma_refugee_camp_kenya,"[kakuma, refugee, camp, kenya, refugees, camps...",[The article discusses the influx of refugees ...
239,12,239_darfur_chad_darfuri_tissi,"[darfur, chad, darfuri, tissi, displaced, retu...",[The article discusses the influx of Chadian r...
28,100,28_displaced_idps_people_internally,"[displaced, idps, people, internally, bases, u...",[The article discusses violent activities amon...
186,18,186_civilians_unmiss_un_bases,"[civilians, unmiss, un, bases, refuge, displac...",[The article discusses new fighting in South S...
228,12,228_bentiu_base_flooding_drinking,"[bentiu, base, flooding, drinking, sanitation,...",[The article discusses the horrific living con...
47,77,47_returnees_kosti_iom_repatriation,"[returnees, kosti, iom, repatriation, stranded...",[The article discusses the arrival of the last...
130,28,130_kenyans_evacuation_kenyan_nationals,"[kenyans, evacuation, kenyan, nationals, fligh...",[The article discusses the evacuation of Kenya...


## Additional new categories
Top 10 categories for political instability NOW: 515

In [17]:
political_instability_keywords = [
    "Coup", "Unrest", "Regime change", "Sanctions", "Governance", 
    "Corruption", "Protests", "Civil unrest", "Election", "Political crisis"
]

In [18]:
# Get the top 10 topics related to the keywords political instability

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=political_instability_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["politics"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['politics']==True].count()


34 0.6067066
8 0.58838606
76 0.5493268
20 0.52059174
70 0.5166007
108 0.51253843
195 0.5011233
72 0.4988699
131 0.49361986
116 0.49328315


summary             618
date                618
location_article    618
lat                 618
lng                 618
hunger              618
conflict            618
humanitarian        618
refugees            618
politics            618
dtype: int64

Top 10 categories for economic issues NOW 476

In [19]:
economic_issues_keywords = [
    "Recession", "Inflation", "Unemployment", "Economic collapse", 
    "Debt", "Trade barrier", "Currency devaluation", "Financial crisis",
    "Poverty", "Economic sanctions"
]


In [20]:
# Get the top 10 topics related to the keywords economic issues

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=economic_issues_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["economics"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['economics']==True].count()


51 0.5655854
34 0.56004345
17 0.4726956
102 0.45058596
160 0.44329643
115 0.42803964
181 0.42509422
43 0.41583616
194 0.41398948
80 0.41186297


summary             543
date                543
location_article    543
lat                 543
lng                 543
hunger              543
conflict            543
humanitarian        543
refugees            543
politics            543
economics           543
dtype: int64

Top 10 categories for production shortage NOW 489

In [21]:
production_shortage_keywords = [
    "Crop failure", "Supply chain disruption", "Agriculture decline",
    "Harvest", "Irrigation problems", "Yield", "Production halt",
    "Shortfall", "Agriculture technology", "Infrastructure"
]

In [22]:
# Get the top 10 topics related to the keywords production shortage

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=production_shortage_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["production"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['production']==True].count()


27 0.55947334
246 0.46831745
80 0.4261831
129 0.42516935
179 0.4201357
17 0.4173216
119 0.40326
63 0.3929451
181 0.39060515
219 0.38318223


summary             450
date                450
location_article    450
lat                 450
lng                 450
hunger              450
conflict            450
humanitarian        450
refugees            450
politics            450
economics           450
production          450
dtype: int64

Top 10 categories for Land Related Issues NOW 474

In [23]:
land_related_issues_keywords = [
    "Land rights", "Land grab", "Land degradation", "Land reform", 
    "Deforestation", "Afforestation", "Land dispute", "Territorial disputes",
    "Land tenure", "Land use change"
]

In [24]:
# Get the top 10 topics related to the keywords land related issues
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=land_related_issues_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["land"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['land']==True].count()


136 0.5959922
112 0.46228904
234 0.45230207
195 0.4332926
89 0.43215275
67 0.4220714
99 0.40830228
78 0.3916825
68 0.38830143
44 0.3851289


summary             394
date                394
location_article    394
lat                 394
lng                 394
hunger              394
conflict            394
humanitarian        394
refugees            394
politics            394
economics           394
production          394
land                394
dtype: int64

Top 10 categories for pests and diseases NOW 462

In [25]:
pests_and_diseases_keywords = [
    "Pest infestation", "Crop disease", "Blights", "Locust", 
    "Weevils", "Molds", "Fungus", "Agricultural pests", "Pathogens", 
    "Invasive species"
]

In [26]:
# Get the top 10 topics related to the keywords pests and diseases

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=pests_and_diseases_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["pests"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['pests']==True].count()


191 0.4810661
40 0.41487882
27 0.40696007
80 0.3913884
144 0.38963416
41 0.3808151
17 0.37353253
66 0.37348843
123 0.36794478
179 0.36504847


summary             590
date                590
location_article    590
lat                 590
lng                 590
hunger              590
conflict            590
humanitarian        590
refugees            590
politics            590
economics           590
production          590
land                590
pests               590
dtype: int64

In [27]:
environmental_issues_keywords = [
    "Drought", "Climate change", "Flood", "Desertification", 
    "Natural disasters", "Water scarcity", "Heatwave", "Sea level rise", 
    "Erosion", "Pollution"
]

In [28]:
# Get the top 10 topics related to the keywords enviromental issues

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=environmental_issues_keywords, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["environment"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['environment']==True].count()

relevant_topics
# topic_terms = bertopic.get_topic(128)
# topic_terms

129 0.5480695
44 0.53715414
228 0.4821716
193 0.41341066
86 0.41012728
28 0.38149956
134 0.3732643
17 0.37106913
145 0.36905771
245 0.36735898


[(129, 0.5480695),
 (44, 0.53715414),
 (228, 0.4821716),
 (193, 0.41341066),
 (86, 0.41012728),
 (28, 0.38149956),
 (134, 0.3732643),
 (17, 0.37106913),
 (145, 0.36905771),
 (245, 0.36735898)]

### Big Diseases 447


In [29]:
major_dieseases = [
    "covid",
    "cholera",
    "malaria",
    "hiv",
    "aids",
    "ebola",
    "tyfus",
    "odra",
    "hepatitis",
    "monkeypox",
]


In [30]:
# Get the top 10 topics related to the keywords enviromental issues

relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=major_dieseases, top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df_newtopics["major"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
df_newtopics[df_newtopics['major']==True].count()


# topic_terms = bertopic.get_topic(111)
# topic_terms

144 0.7132701
123 0.6793604
40 0.6528835
111 0.6517547
41 0.55072457
191 0.5271292
211 0.42946506
66 0.41168702
46 0.35681075
65 0.35555172


summary             474
date                474
location_article    474
lat                 474
lng                 474
hunger              474
conflict            474
humanitarian        474
refugees            474
politics            474
economics           474
production          474
land                474
pests               474
environment         474
major               474
dtype: int64

## NOW TOTAL 3526 WITH AT LEAST ONE CATEGORY

In [31]:

#df_newtopics.to_csv("data/articles_newtopics.csv", index=False) # Save DataFrame to articles_newtopics.csv

In [32]:
filtered_df = df_newtopics[df_newtopics.applymap(lambda x: x == True).any(axis=1)]
print(filtered_df)

                                                 summary       date  \
11     The article discusses the South Sudan governme... 2011-07-03   
18     The article discusses the potential for full-s... 2011-07-07   
45     The article discusses the ongoing conflict and... 2011-07-02   
48     The article discusses the decision of Western ... 2011-06-30   
80     The article discusses the difficulties faced b... 2011-07-05   
...                                                  ...        ...   
18493  The article discusses the denial by the govern... 2023-02-10   
18496  The article discusses how conflict, political ... 2023-01-30   
18506  The article discusses how armed raids in Great... 2022-12-30   
18516  The article discusses the bombing and forced e... 2023-04-26   
18519  The article discusses the establishment of a m... 2023-04-24   

                        location_article        lat        lng  hunger  \
11                                  Juba   4.859363  31.571250   False   

### Implementing Sentiment

In [33]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from transformers import pipeline

# 1. Topic Modeling
vectorizer = CountVectorizer(stop_words='english')
data_vectorized = vectorizer.fit_transform(df_newtopics['summary'])

# Create a gensim dictionary and ensure summaries are strings and then tokenize them
df_newtopics['tokens'] = df_newtopics['summary'].apply(lambda x: str(x).split())
dictionary = Dictionary(df_newtopics['tokens'])

# Build the corpus using the tokens
corpus = [dictionary.doc2bow(tokens) for tokens in df_newtopics['tokens']]

# Train the LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=30)

def get_main_topic(tokens):
    bow = dictionary.doc2bow(tokens)
    main_topic = max(lda_model[bow], key=lambda tup: tup[1])
    return main_topic[0]

df_newtopics['topic'] = df_newtopics['tokens'].apply(get_main_topic)

# 2. Sentiment Analysis
analyzer = pipeline("sentiment-analysis")

def get_sentiment_score(text):
    result = analyzer(text)[0]
    if result['label'] == 'POSITIVE':
        return result['score']
    else:
        return -result['score']

df_newtopics['sentiment'] = df_newtopics['summary'].apply(get_sentiment_score)

# 3. Trend Analysis
monthly_sentiments = df_newtopics.groupby([df_newtopics['date'].dt.to_period('M'), 'topic'])['sentiment'].mean().unstack()

print(monthly_sentiments)


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 629/629 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 268M/268M [00:10<00:00, 26.5MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 48.5kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.36MB/s]


KeyboardInterrupt: 

# The code below is specific for the sentiment analysis above therefore it requires some modifications

In [None]:

# monthly_sentiments = df_newtopics.groupby([df_newtopics['date'].dt.to_period('M'), 'topic'])['sentiment'].mean().unstack()
# monthly_sentiments[2].iloc[0]=monthly_sentiments[2].mean()
# monthly_sentiments[15].iloc[0]=monthly_sentiments[15].mean()
# monthly_sentiments[19].iloc[0]=monthly_sentiments[19].mean()
# monthly_sentiments[25].iloc[0]=monthly_sentiments[25].mean()
# monthly_sentiments[27].iloc[0]=monthly_sentiments[27].mean()
# monthly_sentiments[28].iloc[0]=monthly_sentiments[28].mean()
# monthly_sentiments[29].iloc[0]=monthly_sentiments[29].mean()
# monthly_sentiments=monthly_sentiments.ffill()


# monthly_sentiments['1_diff']=monthly_sentiments[1].diff()
# monthly_sentiments['2_diff']=monthly_sentiments[2].diff()
# monthly_sentiments['6_diff']=monthly_sentiments[6].diff()
# monthly_sentiments['14_diff']=monthly_sentiments[14].diff()

# monthly_sentiments['15_diff']=monthly_sentiments[15].diff()
# monthly_sentiments['19_diff']=monthly_sentiments[19].diff()
# monthly_sentiments['25_diff']=monthly_sentiments[25].diff()
# monthly_sentiments['27_diff']=monthly_sentiments[27].diff()
# monthly_sentiments['28_diff']=monthly_sentiments[28].diff()
# monthly_sentiments['29_diff']=monthly_sentiments[29].diff()

# #monthly_sentiments=monthly_sentiments.drop(['2_diff'],axis=1)

# monthly_sentiments


topic,1,2,6,14,15,19,25,27,28,29,1_diff,2_diff,6_diff,14_diff,15_diff,19_diff,25_diff,27_diff,28_diff,29_diff
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2011-06,0.688175,0.800203,0.231677,0.605170,-0.317350,-0.254239,0.370362,-0.450529,-0.414348,-0.551050,,,,,,,,,,
2011-07,0.613761,0.800203,-0.316726,0.657156,-0.328491,0.267233,0.370362,-0.473375,-0.414348,-0.979257,-0.074414,0.0,-0.548403,0.051985,-0.011141,0.521472,0.000000,-0.022846,0.0,-0.428207
2011-08,0.729675,0.800203,-0.523801,0.463213,-0.357604,0.036319,0.370362,-0.980567,-0.414348,-0.139213,0.115913,0.0,-0.207075,-0.193943,-0.029113,-0.230914,0.000000,-0.507192,0.0,0.840044
2011-09,0.585312,0.800203,0.234096,0.419113,-0.554267,0.466481,0.370362,-0.980567,-0.414348,-0.327074,-0.144363,0.0,0.757897,-0.044099,-0.196663,0.430162,0.000000,0.000000,0.0,-0.187861
2011-10,0.835639,0.800203,-0.197548,0.403372,0.002666,0.080131,-0.744252,-0.191858,-0.414348,-0.630267,0.250327,0.0,-0.431644,-0.015742,0.556933,-0.386351,-1.114613,0.788709,0.0,-0.303192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12,0.986017,0.800203,-0.987557,0.688767,-0.922577,-0.997625,0.868274,-0.994578,-0.996758,0.608692,-0.010003,0.0,0.000000,0.488800,0.069210,0.000000,0.000000,0.000000,0.0,1.568919
2023-01,0.358064,0.800203,-0.987557,0.333655,0.935065,-0.996539,0.868274,-0.994578,-0.996758,0.608692,-0.627953,0.0,0.000000,-0.355113,1.857642,0.001086,0.000000,0.000000,0.0,0.000000
2023-02,0.000399,0.800203,-0.224990,0.491596,0.941952,-0.996539,0.868274,-0.994578,-0.996758,-0.741725,-0.357666,0.0,0.762566,0.157941,0.006886,0.000000,0.000000,0.000000,0.0,-1.350417
2023-03,0.934713,0.800203,-0.224990,0.612692,-0.986989,-0.943442,0.868274,-0.994578,-0.996758,-0.741725,0.934315,0.0,0.000000,0.121096,-1.928941,0.053098,0.000000,0.000000,0.0,0.000000


In [None]:
#monthly_sentiments.to_csv("data/monthly_sentiment.csv", index=False) # Save DataFrame to monthly_sentiment.csv

In [None]:

# df_newtopics_copy = df_newtopics.copy()
# df_newtopics_copy['yyyy-mm'] = pd.to_datetime(df_newtopics_copy['date']).dt.strftime('%Y-%m')


In [None]:
# diff_list=[]
# for i in range(len(df_newtopics_copy)):
#     diff_list.append(monthly_sentiments[df_newtopics_copy['topic'][i]][df_newtopics_copy['yyyy-mm'][i]])
# df_newtopics_copy['difference']=diff_list

# This is the final sentiment csv

In [None]:
#df_newtopics_copy.to_csv("data/topic_sentiment.csv", index=False) # Save DataFrame to monthly_sentiment.csv
