In [2]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

In [3]:
# nltk resources download
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
data=pd.read_csv(r'csvs/scraped_data.csv')

In [5]:
# Step 1: Preprocess Text
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = nltk.word_tokenize(str(text).lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and non-alphabetic words
    return tokens

# Apply preprocessing to the 'Subtext' column
data['Processed_Text'] = data['Subtext'].apply(preprocess_text)

In [6]:
# Step 2: Extract Bigrams and Trigrams
def get_ngrams(tokenized_text, n=2):
    return list(ngrams(tokenized_text, n))

data['Bigrams'] = data['Processed_Text'].apply(lambda x: get_ngrams(x, 2))
data['Trigrams'] = data['Processed_Text'].apply(lambda x: get_ngrams(x, 3))

# Flatten lists and count occurrences
bigrams_flat = Counter([item for sublist in data['Bigrams'] for item in sublist])
trigrams_flat = Counter([item for sublist in data['Trigrams'] for item in sublist])

# Extract unique words, bigrams, trigrams
unique_terms = set([word for doc in data['Processed_Text'] for word in doc])
unique_bigrams = set(bigrams_flat.keys())
unique_trigrams = set(trigrams_flat.keys())

In [7]:
data

Unnamed: 0,Title,Date,Subtext,Processed_Text,Bigrams,Trigrams
0,Intergovernmental Group of Twenty-Four on Inte...,"October 22, 2024",1. The G-24 expresses its deep concern over th...,"[expresses, deep, concern, humanitarian, crise...","[(expresses, deep), (deep, concern), (concern,...","[(expresses, deep, concern), (deep, concern, h..."
1,Intergovernmental Group of Twenty-Four on Inte...,"April 16, 2024",1. The G‑24 recognizes the profound human suff...,"[recognizes, profound, human, suffering, diffe...","[(recognizes, profound), (profound, human), (h...","[(recognizes, profound, human), (profound, hum..."
2,Intergovernmental Group of Twenty-Four on Inte...,"October 10, 2023",1. We express our condolenceson...,"[express, condolenceson, human, suffering, exp...","[(express, condolenceson), (condolenceson, hum...","[(express, condolenceson, human), (condolences..."
3,Intergovernmental Group of Twenty-Four on Inte...,"April 11, 2023",Ministers of the Intergovernmental Group of Tw...,"[ministers, intergovernmental, group, internat...","[(ministers, intergovernmental), (intergovernm...","[(ministers, intergovernmental, group), (inter..."
4,Intergovernmental Group of Twenty-Four on Inte...,"October 11, 2022",1. Multiple compounding crises have severely d...,"[multiple, compounding, crises, severely, dark...","[(multiple, compounding), (compounding, crises...","[(multiple, compounding, crises), (compounding..."
...,...,...,...,...,...,...
168,Communiqué of the Interim Committee of the Boa...,"September 21, 1997","In the advanced economies as a group, growth ...","[advanced, economies, group, growth, low, infl...","[(advanced, economies), (economies, group), (g...","[(advanced, economies, group), (economies, gro..."
169,Group of Twenty Four Communiqué,"September 20, 1997",Ministers of the Intergovernmental Group of Tw...,"[ministers, intergovernmental, group, internat...","[(ministers, intergovernmental), (intergovernm...","[(ministers, intergovernmental, group), (inter..."
170,Communiqué of the Ministers and Governors of t...,"April 28, 1997","Washington, D.C. 1. The Ministers and Central ...","[washington, ministers, central, bank, governo...","[(washington, ministers), (ministers, central)...","[(washington, ministers, central), (ministers,..."
171,Interim Committee Communiqué,"April 28, 1997",The Committee welcomed the generally favorable...,"[committee, welcomed, generally, favorable, pr...","[(committee, welcomed), (welcomed, generally),...","[(committee, welcomed, generally), (welcomed, ..."


In [8]:
# Step 3: Create Document-Term Matrix (DTM)
data['Subtext'] = data['Subtext'].fillna("")  # Replace NaN with an empty string
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
dtm = vectorizer.fit_transform(data['Subtext'])

# Convert DTM to DataFrame
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out(), index=data['Title'])
dtm_df.head()

Unnamed: 0_level_0,0006,0006 april,0006 april 19,04,04 08,04 08 attendance,04 13,04 13 attendance,04 16,04 16 attendance,...,álvarez pallete chairman,álvaro,álvaro baltodano,álvaro baltodano executive,åsbrink,åsbrink minister,åsbrink minister finance,óscar,óscar arias,óscar arias president
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Intergovernmental Group of Twenty-Four on International Monetary Affairs and Development,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Intergovernmental Group of Twenty-Four on International Monetary Affairs and Development,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Intergovernmental Group of Twenty-Four on International Monetary Affairs and Development,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Intergovernmental Group of Twenty-Four on International Monetary Affairs and Development,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Intergovernmental Group of Twenty-Four on International Monetary Affairs and Development,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Step 4: Manual Mapping to Topics

manual_mappings = {
    "climate change": ["climate", "carbon tax", "sustainability"],
    "economic growth": ["growth", "debt reform", "financial regulations"],
    # Add remaining 131 topics here...
}

# Step 3: Count Topic Appearances in Each Document
def count_topic_appearances(text_tokens, lexicon):
    topic_counts = {topic: sum(text_tokens.count(term) for term in terms) for topic, terms in lexicon.items()}
    return topic_counts

data['Topic_Counts'] = data['Processed_Text'].apply(lambda x: count_topic_appearances(x, manual_mappings))

# Convert dictionary column into separate columns
topic_counts_df = pd.DataFrame(data['Topic_Counts'].tolist(), index=data.index)

# Step 4: Compute Total Topic Count Per Document
data['Total_Topic_Count'] = topic_counts_df.sum(axis=1)

# Step 5: Compute Topic Distribution (Normalized Frequency)
topic_distribution = topic_counts_df.div(data['Total_Topic_Count'], axis=0).fillna(0)

# Step 6: Merge Topic Distribution with Original Data
data = pd.concat([data, topic_distribution], axis=1)

# Print final dataframe with topic distribution
print("\nTopic Distribution:\n", data[['Title', 'Date'] + list(topic_distribution.columns)].head())



Topic Distribution:
                                                Title              Date  \
0  Intergovernmental Group of Twenty-Four on Inte...  October 22, 2024   
1  Intergovernmental Group of Twenty-Four on Inte...    April 16, 2024   
2  Intergovernmental Group of Twenty-Four on Inte...  October 10, 2023   
3  Intergovernmental Group of Twenty-Four on Inte...    April 11, 2023   
4  Intergovernmental Group of Twenty-Four on Inte...  October 11, 2022   

   climate change  climate change  economic growth  economic growth  
0        0.529412        0.529412         0.470588         0.470588  
1        0.736842        0.736842         0.263158         0.263158  
2        0.687500        0.687500         0.312500         0.312500  
3        0.000000        0.000000         0.000000         0.000000  
4        0.794118        0.794118         0.205882         0.205882  


In [14]:
# Step 5: Alternative LDA Topic Modeling using Sklearn
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda.fit_transform(dtm)

# Function to display top words per topic
def display_topics(model, feature_names, num_words=5):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
        print(f"Topic {topic_idx+1}: {', '.join(top_words)}")

print("\nLDA Topics :")
display_topics(lda, vectorizer.get_feature_names_out(), num_words=5)


LDA Topics :
Topic 1: minister, finance, fiscal, budget, minister finance
Topic 2: countries, imf, development, global, support
Topic 3: financial, countries, imf, bank, fund
Topic 4: committee, countries, fund, finance, minister
Topic 5: ministers, countries, financial, fund, committee
