In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

In [2]:
# nltk resources download
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Husnain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
data=pd.read_csv(r'csvs\Cleaned_constituency_data.csv')

In [4]:
# Step 1: Preprocess Text
stop_words = set(stopwords.words('english'))
stop_words.add('monetary')
stop_words.add('fund')

def preprocess_text(text):
    tokens = nltk.word_tokenize(str(text).lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and non-alphabetic words
    return tokens

# Apply preprocessing to the 'Extracted_Text' column
data['Processed_Text'] = data['Extracted_Text'].apply(preprocess_text)

In [5]:
data

Unnamed: 0,Year,Title,Extracted_Text,Region/Authority,Constituency,Processed_Text
0,2024,"IMFC Statement by Christine Lagarde, President...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,European Central Bank,OBS,"[international, financial, committee, fiftieth..."
1,2024,"IMFC Statement by HE Haitham Al Ghais, Secreta...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Organization of the Petroleum Exporting Countries,OBS,"[international, financial, committee, fiftieth..."
2,2024,"IMFC Statement by Ayman Al-Sayari, Governor of...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Saudi Arabia,SA,"[international, financial, committee, fiftieth..."
3,2024,"IMFC Statement by Antoine Armand, Minister of ...",INTERNATIONAL MONETARY AND FINANCIAL COMMITTE...,France,FF,"[international, financial, committee, f, iftie..."
4,2024,"IMFC Statement by Luis Caputo, Minister of Eco...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Argentina,AG,"[international, financial, committee, fiftieth..."
...,...,...,...,...,...,...
559,2004,IMFC Statement by the Honorable Domenico Sinis...,\r\n International Monetary and \r\nFinancial...,Italy,IT,"[international, financial, committee, tenth, m..."
560,2004,"IMFC Statement by the Honorable John W. Snow, ...",\r\n International Monetary and \r\nFinancial...,United States,US,"[international, financial, committee, tenth, m..."
561,2004,IMFC Statement by H.E. Sadakazu Tanigaki Minis...,\r\n International Monetary and \r\nFinancial...,Japan,JA,"[international, financial, committee, tenth, m..."
562,2004,"IMFC Statement By James D. Wolfensohn, Preside...",\r\n International Monetary and \r\nFinancial...,World Bank,OBS,"[international, financial, committee, tenth, m..."


In [6]:
# Step 2: Extract Bigrams and Trigrams
def get_ngrams(tokenized_text, n=2):
    return list(ngrams(tokenized_text, n))

data['Bigrams'] = data['Processed_Text'].apply(lambda x: get_ngrams(x, 2))
data['Trigrams'] = data['Processed_Text'].apply(lambda x: get_ngrams(x, 3))

# Flatten lists and count occurrences
bigrams_flat = Counter([item for sublist in data['Bigrams'] for item in sublist])
trigrams_flat = Counter([item for sublist in data['Trigrams'] for item in sublist])

# Extract unique words, bigrams, trigrams
unique_terms = set([word for doc in data['Processed_Text'] for word in doc])
unique_bigrams = set(bigrams_flat.keys())
unique_trigrams = set(trigrams_flat.keys())

In [7]:
data

Unnamed: 0,Year,Title,Extracted_Text,Region/Authority,Constituency,Processed_Text,Bigrams,Trigrams
0,2024,"IMFC Statement by Christine Lagarde, President...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,European Central Bank,OBS,"[international, financial, committee, fiftieth...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
1,2024,"IMFC Statement by HE Haitham Al Ghais, Secreta...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Organization of the Petroleum Exporting Countries,OBS,"[international, financial, committee, fiftieth...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
2,2024,"IMFC Statement by Ayman Al-Sayari, Governor of...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Saudi Arabia,SA,"[international, financial, committee, fiftieth...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
3,2024,"IMFC Statement by Antoine Armand, Minister of ...",INTERNATIONAL MONETARY AND FINANCIAL COMMITTE...,France,FF,"[international, financial, committee, f, iftie...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
4,2024,"IMFC Statement by Luis Caputo, Minister of Eco...",\r\n \r\n INTERNATIONAL MONETARY AND FINANCIA...,Argentina,AG,"[international, financial, committee, fiftieth...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
...,...,...,...,...,...,...,...,...
559,2004,IMFC Statement by the Honorable Domenico Sinis...,\r\n International Monetary and \r\nFinancial...,Italy,IT,"[international, financial, committee, tenth, m...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
560,2004,"IMFC Statement by the Honorable John W. Snow, ...",\r\n International Monetary and \r\nFinancial...,United States,US,"[international, financial, committee, tenth, m...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
561,2004,IMFC Statement by H.E. Sadakazu Tanigaki Minis...,\r\n International Monetary and \r\nFinancial...,Japan,JA,"[international, financial, committee, tenth, m...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."
562,2004,"IMFC Statement By James D. Wolfensohn, Preside...",\r\n International Monetary and \r\nFinancial...,World Bank,OBS,"[international, financial, committee, tenth, m...","[(international, financial), (financial, commi...","[(international, financial, committee), (finan..."


In [9]:
# Flatten and count bigram occurrences
bigram_counter = Counter(bigram for row in data['Bigrams'] for bigram in row)

# Convert to a sorted list (optional, sorted by frequency in descending order)
sorted_bigrams = sorted(bigram_counter.items(), key=lambda x: x[1], reverse=True)

# Display the results
sorted_bigrams

[(('international', 'financial'), 1282),
 (('global', 'economy'), 1119),
 (('th', 'e'), 1072),
 (('financial', 'committee'), 962),
 (('per', 'cent'), 891),
 (('developing', 'countries'), 863),
 (('global', 'financial'), 809),
 (('financial', 'stability'), 796),
 (('euro', 'area'), 790),
 (('structural', 'reforms'), 676),
 (('advanced', 'economies'), 623),
 (('financial', 'sector'), 607),
 (('exchange', 'rate'), 592),
 (('financial', 'system'), 581),
 (('global', 'economic'), 572),
 (('economic', 'growth'), 559),
 (('emerging', 'market'), 556),
 (('financial', 'markets'), 553),
 (('look', 'forward'), 525),
 (('meeting', 'october'), 520),
 (('world', 'bank'), 493),
 (('global', 'growth'), 488),
 (('debt', 'sustainability'), 487),
 (('percent', 'gdp'), 480),
 (('climate', 'change'), 468),
 (('central', 'bank'), 454),
 (('member', 'states'), 449),
 (('world', 'economy'), 447),
 (('united', 'states'), 413),
 (('economic', 'activity'), 396),
 (('fiscal', 'consolidation'), 393),
 (('interest'

In [11]:
# Flatten and count trigram occurrences
trigram_counter = Counter(trigram for row in data['Trigrams'] for trigram in row)

# Convert to a sorted list (optional, sorted by frequency in descending order)
sorted_trigrams = sorted(trigram_counter.items(), key=lambda x: x[1], reverse=True)

# Display the results
sorted_trigrams

[(('international', 'financial', 'committee'), 894),
 (('emerging', 'market', 'economies'), 266),
 (('meeting', 'october', 'statement'), 253),
 (('october', 'imfc', 'statement'), 245),
 (('financial', 'safety', 'net'), 244),
 (('global', 'financial', 'safety'), 241),
 (('meeting', 'october', 'imfc'), 229),
 (('financial', 'committee', 'meeting'), 227),
 (('general', 'review', 'quotas'), 220),
 (('eu', 'member', 'states'), 208),
 (('committee', 'meeting', 'october'), 174),
 (('imf', 'world', 'bank'), 171),
 (('low', 'income', 'countries'), 156),
 (('poverty', 'reduction', 'growth'), 133),
 (('emerging', 'market', 'developing'), 133),
 (('global', 'financial', 'crisis'), 133),
 (('reduction', 'growth', 'trust'), 114),
 (('global', 'policy', 'agenda'), 112),
 (('financial', 'committee', 'washington'), 109),
 (('international', 'financial', 'system'), 105),
 (('current', 'account', 'deficit'), 105),
 (('global', 'financial', 'system'), 98),
 (('global', 'financial', 'stability'), 96),
 (('

In [8]:
# Step 3: Create Document-Term Matrix (DTM)
data['Extracted_Text'] = data['Extracted_Text'].fillna("")  # Replace NaN with an empty string
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
dtm = vectorizer.fit_transform(data['Extracted_Text'])

# Convert DTM to DataFrame
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out(), index=data['Title'])
dtm_df.head()

Unnamed: 0_level_0,00,00 04,00 04 20042005,00 04 20042005min,00 04 stock,00 cent,00 cent 00,00 cent august,00 indicated,00 indicated result,...,ﬁts ai,ﬁts ai addiɵon,ﬁts search,ﬁts search security,ﬁx,ﬁx foundaɵons,ﬁx foundaɵons economy,ﬂows,ﬂows eﬃcacy,ﬂows eﬃcacy global
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"IMFC Statement by Christine Lagarde, President of the ECB",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"IMFC Statement by HE Haitham Al Ghais, Secretary General, OPEC",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"IMFC Statement by Ayman Al-Sayari, Governor of the Saudi Central Bank (SAMA)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"IMFC Statement by Antoine Armand, Minister of the Economy, Finance and Industry, France",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"IMFC Statement by Luis Caputo, Minister of Economy, Argentina",1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
def read_file_to_list(file_path):
    """
    Reads a text file and returns a list of lines with leading and trailing whitespace removed.

    :param file_path: Path to the text file
    :return: List of stripped lines
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            lines = [line.strip().replace('_',' ').lower() for line in content.splitlines()]
        return lines
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [18]:
# Step 4: Manual Mapping to Topics
manual_mappings = {
    "Climate Change": [
    "bio-fuel", "biofuel_production", "carbon", "carbon_pricing", "carbon_tax", 
    "climate_change", "climate_change_mitigation", "climate-related", "climatic", 
    "conserve_energy", "cop", "drought", "emission", "energy_conservation", 
    "energy_intensity", "energy-efficient", "energy-saving", "extreme_weather", 
    "global_warming", "low-carbon", "niño", "tackling_climate_change"
],
    "Economic Growth":  [
    "grown faster", "growth", "growth-", "growth-an", "growth-and", 
    "growth-boosting", "growth-critical", "growth-enhancing", 
    "growth-friendly", "growth-friendly_manner", "growth-inducing", 
    "growth-oriented", "growth-promoting", "growth-supporting", 
    "growth-supportive", "private_sector-led_growth", "pro-growth"
],
    "Debt " : [
    "bond issuance", "bonded debt", "debt", "debt-", "debt/gdp_ratio", 
    "debt_overhang", "debt_service-to-export_ratio", "debt_servicing", 
    "debt sustainability", "debt-to-gdp_ratio", "debt-management", 
    "debt-related", "debt-service", "debt-services", "debt-sustainability", 
    "debt-to-gdp", "debt-to-gdp-ratio", "debt-was", "dis-indebtedness", 
    "enhanced_hipe_initiative", "heavily_indebted", "high_debt", 
    "highly_indebted", "highly-indebted", "hipe-to-hipc_debt", "indebted", 
    "indebtedness", "nonconcessional_borrowing", "non-concessional_borrowing", 
    "non-paris_club", "non-paris_club_creditor", "non-paris_club_official", 
    "paris_club", "paris_conference", "paris_declaration", 
    "public-debt-to-gdp_ratio", "sovereign_bond", "sovereign_bond_issuance", 
    "sovereign_default", "syndicated_loan", "upper-credit_tranche"
],
    "Reform" : [
    "bankruptcy code", "bankruptcy law", "cut_red_tape", "deregulating", 
    "deregulation", "insolvency_law", "reduce_red_tape", "reform", 
    "reform_fatigue", "structural_reform_agenda", "structural_reforms", 
    "unfinished_reform_agenda"
]
,

}

In [21]:
# Step 3: Count Topic Appearances in Each Document
def count_topic_appearances(text_tokens, bigrams, trigrams, lexicon):
    topic_counts = {topic: 0 for topic in lexicon.keys()}

    for topic, terms in lexicon.items():
        for term in terms:
            term_parts = term.split("_")  # Split term into words
            
            if len(term_parts) == 1:  # Single-word terms
                topic_counts[topic] += text_tokens.count(term)

            elif len(term_parts) == 2:  # Bigram terms
                topic_counts[topic] += bigrams.count(tuple(term_parts))

            elif len(term_parts) == 3:  # Trigram terms
                topic_counts[topic] += trigrams.count(tuple(term_parts))
    
    return topic_counts

# Apply function
data['Topic_Counts'] = data.apply(lambda row: count_topic_appearances(row['Processed_Text'], row['Bigrams'], row['Trigrams'], manual_mappings), axis=1)

# Convert dictionary column into separate columns
topic_counts_df = pd.DataFrame(data['Topic_Counts'].tolist(), index=data.index)

# Step 4: Compute Total Topic Count Per Document
data['Total_Topic_Count'] = topic_counts_df.sum(axis=1)

# Step 5: Compute Topic Distribution (Normalized Frequency)
topic_distribution = topic_counts_df.div(data['Total_Topic_Count'], axis=0).fillna(0)

# Step 6: Merge Topic Distribution with Original Data
data = pd.concat([data, topic_distribution], axis=1)
data[['Title', 'Year'] + list(topic_distribution.columns)]


Unnamed: 0,Title,Year,Climate Change,Economic Growth,Debt,Reform
0,"IMFC Statement by Christine Lagarde, President...",2024,0.133333,0.800000,0.066667,0.000000
1,"IMFC Statement by HE Haitham Al Ghais, Secreta...",2024,0.000000,1.000000,0.000000,0.000000
2,"IMFC Statement by Ayman Al-Sayari, Governor of...",2024,0.000000,0.444444,0.481481,0.074074
3,"IMFC Statement by Antoine Armand, Minister of ...",2024,0.222222,0.111111,0.444444,0.222222
4,"IMFC Statement by Luis Caputo, Minister of Eco...",2024,0.044118,0.455882,0.411765,0.088235
...,...,...,...,...,...,...
559,IMFC Statement by the Honorable Domenico Sinis...,2004,0.000000,0.547170,0.339623,0.113208
560,"IMFC Statement by the Honorable John W. Snow, ...",2004,0.000000,0.461538,0.384615,0.153846
561,IMFC Statement by H.E. Sadakazu Tanigaki Minis...,2004,0.000000,0.250000,0.375000,0.375000
562,"IMFC Statement By James D. Wolfensohn, Preside...",2004,0.000000,0.620690,0.241379,0.137931


In [29]:
data[['Title', 'Year'] + list(topic_distribution.columns)]

Unnamed: 0,Title,Year,Climate Change,Economic Growth,Environment,Human Rights
0,"IMFC Statement by Christine Lagarde, President...",2024,0.067568,0.162162,0.581081,0.189189
1,"IMFC Statement by HE Haitham Al Ghais, Secreta...",2024,0.000000,0.526316,0.368421,0.105263
2,"IMFC Statement by Ayman Al-Sayari, Governor of...",2024,0.019608,0.117647,0.441176,0.421569
3,"IMFC Statement by Antoine Armand, Minister of ...",2024,0.058824,0.011765,0.647059,0.282353
4,"IMFC Statement by Luis Caputo, Minister of Eco...",2024,0.041985,0.118321,0.564885,0.274809
...,...,...,...,...,...,...
559,IMFC Statement by the Honorable Domenico Sinis...,2004,0.043478,0.180124,0.503106,0.273292
560,"IMFC Statement by the Honorable John W. Snow, ...",2004,0.026316,0.157895,0.644737,0.171053
561,IMFC Statement by H.E. Sadakazu Tanigaki Minis...,2004,0.021739,0.043478,0.717391,0.217391
562,"IMFC Statement By James D. Wolfensohn, Preside...",2004,0.041322,0.148760,0.561983,0.247934


In [12]:
# Step 5: Alternative LDA Topic Modeling using Sklearn
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda.fit_transform(dtm)

# Function to display top words per topic
def display_topics(model, feature_names, num_words=5):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
        print(f"Topic {topic_idx+1}: {', '.join(top_words)}")

print("\nLDA Topics :")
display_topics(lda, vectorizer.get_feature_names_out(), num_words=5)


LDA Topics :
Topic 1: growth, financial, imf, global, countries
Topic 2: growth, countries, financial, global, fund
Topic 3: financial, imf, growth, global, countries
Topic 4: financial, countries, fund, growth, global
Topic 5: financial, countries, global, growth, imf
