In [27]:
from pathlib import Path
import pandas as pd
from glob import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [23]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('words')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swastik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/swastik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/swastik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/swastik/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

## Importing the data
This code loops from the folders of TXT and takes every .txt file that does not start with ._ which contain the speeches per country per year. These are then added to a dataframe, storing the year, country code and the speech

In [24]:
path = Path("/home/swastik/Downloads/UNGDC_1946-2024(1)/TXT")

name_text = []
for folder in path.iterdir():
    if folder.is_dir():
        files = [f for f in folder.glob("*.txt") if not f.name.startswith("._")]

        for file in files:
            name = file.name
            text = file.read_text(encoding="utf-8")
            
            name_text.append({
                        "country": name[:3],
                        "year": name[-8:-4],
                        "speech": text
                    })
    
df = pd.DataFrame(name_text)

df.head(5)

Unnamed: 0,country,year,speech
0,YEM,2013,"Allow \nme, at the outset to extend sincere my..."
1,NER,2013,It is a great \nhonour for me to take the floo...
2,SWZ,2013,It is a great pleasure for me to have \nthis o...
3,MNG,2013,Allow me to first congratulate \nMr. John Ashe...
4,SGP,2013,I warmly \ncongratulate Mr. John Ashe on his e...


In [7]:
df.tail(5)

Unnamed: 0,country,year,speech
3283,DEU,2019,"Here in New York over the past few days, we ha..."
3284,MKD,2019,My country has been a Member of the United Nat...
3285,LAO,2019,"At the outset, I would like to congratulate Pr..."
3286,GUY,2019,I bring to President Tijjani Muhammad-Bande fr...
3287,GRD,2019,I extend cordial greetings from the Government...


## Punctuation - Stopwords - Tokenizing
The following code removes all punctuation from the texts, it also tokenizes the string (returns a list of each word separately as a string) and removes stopwords from it and non alphabetical tokens

https://www.geeksforgeeks.org/nlp/removing-stop-words-nltk-python/ 

https://www.geeksforgeeks.org/python/python-remove-punctuation-from-string/

We also remove words that are related to countries as can be found in the CSV file

https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations

In [38]:
countries_df = pd.read_csv("List_of_adjectival_and_demonymic_forms_for_countries_and_nations_1.csv")
countries_flat = countries_df.values.ravel().tolist()
countries = []

def split_small_capital(text):
    split = re.sub(r'([a-z])([A-Z])', r'\1 \2', text).split()
    countries.extend(split)

for country_adj in countries_flat: split_small_capital(country_adj)

countries = [country.lower()for country in countries]

In [None]:

stop_words = set(stopwords.words('english'))
english_vocab = set(words.words())

df_tokenize = df.copy()

def punc_stop_token_english(speech: str):
    # Remove punctuation
    speech_no_punctuation = re.sub(r'[^\w\s]', '', speech)
    
    # Tokenize and lowercase
    tokens = word_tokenize(speech_no_punctuation.lower())
    
    # POS tagging
    pos_tags = pos_tag(tokens)
    
    # Keep only words that:
    # - are alphabetic
    # - are not stopwords
    # - are not proper nouns (NNP, NNPS)
    # - are in English vocabulary
    new_speech = [
        word for word, tag in pos_tags
        if (word.isalpha()) 
        and (word not in stop_words)
        and (word not in countries)
        and (tag not in ['NNP', 'NNPS'])
        and (word in english_vocab)
    ]
    
    return new_speech

# Apply to your dataframe
df_tokenize['speech_token'] = df_tokenize['speech'].apply(punc_stop_token_english)
df_tokenize.head(5)

Unnamed: 0,country,year,speech,speech_token
0,YEM,2013,"Allow \nme, at the outset to extend sincere my...","[allow, outset, extend, sincere, ambassador, p..."
1,NER,2013,It is a great \nhonour for me to take the floo...,"[great, take, floor, session, general, assembl..."
2,SWZ,2013,It is a great pleasure for me to have \nthis o...,"[great, pleasure, opportunity, join, fellow, s..."
3,MNG,2013,Allow me to first congratulate \nMr. John Ashe...,"[allow, first, congratulate, assuming, preside..."
4,SGP,2013,I warmly \ncongratulate Mr. John Ashe on his e...,"[warmly, congratulate, election, president, se..."


In [29]:
# --- Bigram frequency check (after tokenization) ---
from collections import Counter
from nltk import ngrams

all_bigrams = []
for tokens in df_tokenize['speech_token']:   # your tokens are in this column
    if isinstance(tokens, list):
        all_bigrams.extend(list(ngrams(tokens, 2)))

bigram_counts = Counter(all_bigrams)

print("Top 20 most common bigrams:")
for (w1, w2), cnt in bigram_counts.most_common(20):
    print(f"{w1} {w2:<20} {cnt}")

Top 20 most common bigrams:
climate change               8947
sustainable development          7608
general assembly             7281
international community            7275
security council              6496
peace security             5437
per cent                 3957
would like                 3841
international law                  2820
international peace                2012
middle east                 1945
rule law                  1902
let us                   1807
small island               1805
assembly session              1714
charter united               1597
millennium development          1553
economic social               1551
development agenda               1484
president general              1394


## Polarization score

In [30]:
polarization_df = pd.read_csv("political-polarization-score.csv")
polarization_df.head()

polarization_df.columns = polarization_df.columns.str.lower()

bounds = [ -3, -1, 1, 3]

labels = [
    "Stable",
    "Neutral",
    "Polarized"
]

polarization_df["polarization label"] = pd.cut(
    polarization_df["political polarization score (central estimate)"],
    bins=bounds,
    labels=labels,
    include_lowest=True,
    right=False
)
polarization_df = polarization_df.rename(columns={"code": "country"})


polarization_df

Unnamed: 0,entity,country,year,political polarization score (central estimate),polarization label
0,Afghanistan,AFG,1992,2.775,Polarized
1,Afghanistan,AFG,1993,2.775,Polarized
2,Afghanistan,AFG,1994,2.775,Polarized
3,Afghanistan,AFG,1995,2.775,Polarized
4,Afghanistan,AFG,1996,2.775,Polarized
...,...,...,...,...,...
22674,Zimbabwe,ZWE,2020,2.499,Polarized
22675,Zimbabwe,ZWE,2021,2.066,Polarized
22676,Zimbabwe,ZWE,2022,1.551,Polarized
22677,Zimbabwe,ZWE,2023,1.984,Polarized


In [31]:
for df in (df_tokenize, polarization_df):
    df['country'] = df['country'].astype(str).str.strip()
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

bad_tf = df_tokenize[df_tokenize['year'].isna()]
bad_pol = polarization_df[polarization_df['year'].isna()]

merged_df = df_tokenize.merge(
    polarization_df,
    how='right',
    on=['country', 'year']
)
merged_df = merged_df.drop(columns=['entity'])
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,country,year,speech,speech_token,political polarization score (central estimate),polarization label
16,AFG,2008,Since the last time we \ngathered here in this...,"[since, last, time, great, hall, year, great, ...",-0.014,Neutral
17,AFG,2009,"First, I would like to \ncongratulate His Exce...","[first, would, like, congratulate, excellency,...",-0.014,Neutral
18,AFG,2010,I join previous \nspeakers in congratulating y...,"[join, previous, sir, election, president, gen...",-0.014,Neutral
19,AFG,2011,I am honoured to be \nhere to read the stateme...,"[read, statement, president, excellency, retur...",-0.014,Neutral
20,AFG,2012,"﻿As we speak today, the world\nis being shaken...","[speak, today, world, shaken, depravity, insul...",-0.014,Neutral
...,...,...,...,...,...,...
22674,ZWE,2020,"Your Excellency, Ambassador Volkan Bozkir, Pre...","[excellency, ambassador, president, session, u...",2.499,Polarized
22675,ZWE,2021,"Your Excellency Abdulla Shahid, President of t...","[excellency, president, session, general, asse...",2.066,Polarized
22676,ZWE,2022,It is my singular honour to deliver this state...,"[singular, deliver, statement, assembly, allow...",1.551,Polarized
22677,ZWE,2023,I wish to congratulate Mr. Dennis Francis on h...,"[wish, congratulate, election, president, gene...",1.984,Polarized


## TF-IDF
The following code calculates the TF-IDF score for each word in every speech, this is then stored in the Dataframe as a list of pairs, containing (word, tf-idf score), sorted descendingly, so you get the higher TF-IDF scores first

https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [32]:
df_tf_idf = merged_df.copy()

df_tf_idf['speech_token'] = df_tf_idf['speech_token'].str.join(' ')

tfidf_vector = TfidfVectorizer()
speeches = df_tf_idf['speech_token']

tf_idf_matrix  = tfidf_vector.fit_transform(speeches)

In [33]:
print(tf_idf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1266635 stored elements and shape (2802, 15581)>
  Coords	Values
  (0, 12434)	0.03989089265430518
  (0, 7753)	0.09191268839713386
  (0, 13821)	0.03229353297424898
  (0, 5997)	0.059467195739140166
  (0, 6114)	0.0388121467370732
  (0, 15535)	0.052166142284736414
  (0, 5989)	0.04020515057982752
  (0, 5893)	0.0859414606539855
  (0, 6461)	0.015512633609364748
  (0, 121)	0.07095328379300604
  (0, 13037)	0.05618339681318859
  (0, 6451)	0.05313908805478291
  (0, 818)	0.07544678866583744
  (0, 15464)	0.08300388692301199
  (0, 6047)	0.09158624974780905
  (0, 5884)	0.029770658816654014
  (0, 14423)	0.03270752445871978
  (0, 2254)	0.01707842060852194
  (0, 2068)	0.016076773209113465
  (0, 11292)	0.02711690194352751
  (0, 12099)	0.0616683981782035
  (0, 8520)	0.09083799344966703
  (0, 12329)	0.12916354002124258
  (0, 5456)	0.05027524356593249
  (0, 8227)	0.05119771275913079
  :	:
  (2801, 2778)	0.05409247272048699
  (2801, 921)	0.0389761

In [34]:
feature_names = tfidf_vector.get_feature_names_out()

def matrix_to_tfidf_pairs(row):
    row_array = row.toarray().flatten()  
    word_tf_idf_pairs = [(word, score) for word, score in zip(feature_names, row_array) if score > 0]
    pairs_sorted = sorted(word_tf_idf_pairs, key=lambda x: x[1], reverse=True)
    return pairs_sorted

df_tf_idf['speech_score'] = [matrix_to_tfidf_pairs(tf_idf_matrix[i]) for i in range(tf_idf_matrix.shape[0])]

In [35]:
df_tf_idf[['country', 'year', 'speech_score']].head()

Unnamed: 0,country,year,speech_score
16,AFG,2008,"[(terrorism, 0.19712251036538642), (developmen..."
17,AFG,2009,"[(intellectual, 0.19534902855967673), (interna..."
18,AFG,2010,"[(jirga, 0.34036720227934886), (people, 0.1781..."
19,AFG,2011,"[(transition, 0.21339185483483689), (internati..."
20,AFG,2012,"[(peace, 0.20130430468300325), (security, 0.20..."


In [None]:
# --- TF-IDF with unigrams + bigrams (n-grams) ---
from sklearn.feature_extraction.text import TfidfVectorizer

# use the same joined text you created above
texts = df_tf_idf['speech']  # already ' '.join(tokens)

tfidf_ngram = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",  # keep words and numbers
    ngram_range=(1, 2),            # unigrams + bigrams
    min_df=2,
    max_df=0.9,
)

tf_idf_matrix_ngram = tfidf_ngram.fit_transform(texts)
feature_names_ngram = tfidf_ngram.get_feature_names_out()

print("Docs:", tf_idf_matrix_ngram.shape[0],
      "Vocab size (with bigrams):", tf_idf_matrix_ngram.shape[1])

def matrix_to_tfidf_pairs_ng(row):
    arr = row.toarray().flatten()
    pairs = [(w, s) for w, s in zip(feature_names_ngram, arr) if s > 0]
    return sorted(pairs, key=lambda x: x[1], reverse=True)

# store bigram-aware scores in a NEW column, while keeping original intact
df_tf_idf['speech_score_bigrams'] = [
    matrix_to_tfidf_pairs_ng(tf_idf_matrix_ngram[i])
    for i in range(tf_idf_matrix_ngram.shape[0])
]

# quick peek into the dataframe with speech score for both unigrams AND bigrams
df_tf_idf[['country', 'year', 'speech_score_bigrams']].tail()
df_tf_idf[['country', 'year', 'speech_score_bigrams']].head()


Docs: 2802 Vocab size (with bigrams): 314704


## Linear Regression - Lasso

https://scikit-learn.org/stable/modules/linear_model.html

In [18]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [19]:
df_lin = merged_df.copy()
df_lin = df_lin.rename(columns={"political polarization score (central estimate)": "polarization score"})

y = np.array(df_lin["polarization score"]).reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))
y_scaled = scaler.fit_transform(y)

df_lin["polarization scaled"] = y_scaled


X = tf_idf_matrix
y = df_lin['polarization scaled']
# y = df_lin['polarization score']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_lin = linear_model.Ridge(alpha=0.1)
model_lin.fit(X_train, y_train)

y_pred = model_lin.predict(X_test)

MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", MSE)
print("Mean Absolute Error:", MAE)
print("R^2 Score:", r2)

Mean Squared Error: 0.09115565348494908
Mean Absolute Error: 0.2446033237264886
R^2 Score: 0.5249242772266041


## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [21]:
X = tf_idf_matrix
y = merged_df['polarization label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.6185383244206774
              precision    recall  f1-score   support

     Neutral       0.55      0.94      0.70       260
   Polarized       0.76      0.21      0.33       152
      Stable       0.90      0.48      0.62       149

    accuracy                           0.62       561
   macro avg       0.74      0.54      0.55       561
weighted avg       0.70      0.62      0.58       561

