In [1]:
from pathlib import Path
import pandas as pd
from glob import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Fetch the NLTK resources used below (the captured output shows that packages which
# are already installed are only reported as up-to-date).
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
# NLTK releases that use the '*_eng' tagger name load the tokenizer models from
# 'punkt_tab' rather than 'punkt'; without it word_tokenize raises LookupError on a
# machine where the resource was not fetched earlier.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RickA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\RickA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RickA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing the data
This code loops over the sub-folders of `TXT` and reads every `.txt` file whose name does not start with `._`; each file contains the speech of one country in one year. The speeches are then collected in a DataFrame that stores the country code, the year and the speech text.

In [3]:
# Root folder whose sub-folders contain the speech .txt files.
path = Path("../TXT")

# One record per speech file. Directory listings are yielded in arbitrary,
# platform-dependent order, so both the folders and the files inside them are sorted
# to keep the row order of the resulting frame reproducible.
name_text = []
for folder in sorted(path.iterdir()):
    if folder.is_dir():
        # Files whose name starts with "._" are skipped
        # (presumably macOS metadata stubs rather than speeches — TODO confirm).
        files = sorted(f for f in folder.glob("*.txt") if not f.name.startswith("._"))

        for file in files:
            name = file.name
            text = file.read_text(encoding="utf-8")

            name_text.append({
                "country": name[:3],    # first three characters of the file name
                "year": name[-8:-4],    # the four characters right before ".txt" (kept as a string)
                "speech": text
            })

df = pd.DataFrame(name_text)

df.head(5)

Unnamed: 0,country,year,speech
0,ARG,1946,At the resumption of the first session of the ...
1,AUS,1946,The General Assembly of the United Nations is ...
2,BEL,1946,The\tprincipal organs of the United Nations ha...
3,BLR,1946,As more than a year has elapsed since the Unit...
4,BOL,1946,Coming to this platform where so many distingu...


In [4]:
# Show the last five rows of the loaded speech frame.
df.tail(5)

Unnamed: 0,country,year,speech
10948,WSM,2024,"Excellencies, \nI extend my congratulations t..."
10949,YEM,2024,"Your Majesties, Excellencies, and Highnesses, ..."
10950,ZAF,2024,President of the 79th Session of the UN Genera...
10951,ZMB,2024,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O..."
10952,ZWE,2024,"Your Excellency, Mr. Philemon Yang, President ..."


## Punctuation - Stopwords - Tokenizing
The following code removes all punctuation from the texts, tokenizes each speech (returning a list in which every word is a separate string), and then drops stopwords and non-alphabetical tokens.

https://www.geeksforgeeks.org/nlp/removing-stop-words-nltk-python/ 

https://www.geeksforgeeks.org/python/python-remove-punctuation-from-string/

We also remove country-related words (country names, adjectival forms and demonyms). These are read from the CSV file, which is based on the following list:

https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations

In [5]:
countries_df = pd.read_csv("List_of_adjectival_and_demonymic_forms_for_countries_and_nations_1.csv")
# Flatten every cell of the table into one flat Python list.
countries_flat = countries_df.values.ravel().tolist()
countries = []

def split_small_capital(text):
    """Split `text` into words and append them to the module-level `countries` list.

    A space is inserted wherever a lowercase letter is immediately followed by an
    uppercase letter, and the result is then split on whitespace, so multi-word
    entries contribute each of their words separately.

    Non-string values (pandas reads empty CSV cells as float NaN) are skipped
    instead of raising a TypeError inside `re.sub`.

    Returns None; the words are accumulated in `countries` as a side effect.
    """
    if not isinstance(text, str):
        return
    split = re.sub(r'([a-z])([A-Z])', r'\1 \2', text).split()
    countries.extend(split)

for country_adj in countries_flat:
    split_small_capital(country_adj)

# Lower-case the collected words so they can be compared with lower-cased tokens.
countries = [country.lower() for country in countries]

In [6]:
stop_words = set(stopwords.words('english'))
# Set view of the country word list. Membership is tested once per token of every
# speech, and testing against the list itself scans it linearly each time.
country_words = frozenset(countries)

df_tokenize = df.copy()

def punc_stop_token(speech : str):
    """Clean one speech and return it as a list of tokens.

    Steps, in order:
      1. delete every character that is neither a word character nor whitespace
         (the character is removed, not replaced by a space, so the two sides of a
         hyphen or apostrophe are joined into one word);
      2. lower-case the text and tokenize it with `word_tokenize`;
      3. keep only purely alphabetic tokens that are neither English stop words
         nor entries of the country word list.

    Parameters
    ----------
    speech : str
        Raw speech text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    speech_no_punctuation = re.sub(r'[^\w\s]', '', speech)
    tokens = word_tokenize(speech_no_punctuation.lower())
    new_speech = [
        word for word in tokens
        if word.isalpha() and word not in stop_words and word not in country_words
    ]
    return new_speech

speeches = df_tokenize['speech']
new_speeches = speeches.apply(punc_stop_token)
df_tokenize['speech'] = new_speeches
df_tokenize.head(5)

Unnamed: 0,country,year,speech
0,ARG,1946,"[resumption, first, session, general, assembly..."
1,AUS,1946,"[general, assembly, nations, meeting, time, ho..."
2,BEL,1946,"[principal, organs, nations, functioning, near..."
3,BLR,1946,"[year, elapsed, since, nations, charter, signe..."
4,BOL,1946,"[coming, platform, many, distinguished, eloque..."


## Polarization score

In [7]:
# Bin edges for the polarization score; with right=False every bin is [lower, upper).
bounds = [-np.inf, -3, -2, -1, 0, 1, 2, 3, np.inf]

# One label per bin, from the lowest scores to the highest.
labels = [
    "Very stable",
    "Stable",
    "Relatively Stable",
    "Slightly Stable",
    "Slightly Polarized",
    "Relatively Polarized",
    "Polarized",
    "Very Polarized"
]

score_column = "political polarization score (central estimate)"

# Load the scores, lower-case the column names, attach the categorical label and
# rename the code column to `country` (the key name the speech frame uses).
polarization_df = (
    pd.read_csv("political-polarization-score.csv")
    .rename(columns=str.lower)
    .assign(**{
        "polarization label": lambda frame: pd.cut(
            frame[score_column],
            bins=bounds,
            labels=labels,
            include_lowest=True,
            right=False,
        )
    })
    .rename(columns={"code": "country"})
)

polarization_df

Unnamed: 0,entity,country,year,political polarization score (central estimate),polarization label
0,Afghanistan,AFG,1992,2.775,Polarized
1,Afghanistan,AFG,1993,2.775,Polarized
2,Afghanistan,AFG,1994,2.775,Polarized
3,Afghanistan,AFG,1995,2.775,Polarized
4,Afghanistan,AFG,1996,2.775,Polarized
...,...,...,...,...,...
22674,Zimbabwe,ZWE,2020,2.499,Polarized
22675,Zimbabwe,ZWE,2021,2.066,Polarized
22676,Zimbabwe,ZWE,2022,1.551,Relatively Polarized
22677,Zimbabwe,ZWE,2023,1.984,Relatively Polarized


In [8]:
# Normalise the join keys on both frames so they compare equal in the merge below:
# country codes as stripped strings, years as nullable integers (unparseable -> <NA>).
# The loop variable is deliberately not called `df`: that name holds the raw speech
# frame, and rebinding it here would leave `df` pointing at the polarization table,
# so re-running the tokenising cell (`df_tokenize = df.copy()`) would use the wrong data.
for frame in (df_tokenize, polarization_df):
    frame['country'] = frame['country'].astype(str).str.strip()
    frame['year'] = pd.to_numeric(frame['year'], errors='coerce').astype('Int64')

# Rows whose year could not be parsed; not used further below, kept for inspection.
bad_tf = df_tokenize[df_tokenize['year'].isna()]
bad_pol = polarization_df[polarization_df['year'].isna()]

# Right join keeps every polarization row; rows without a matching speech get NaN
# in `speech` and are removed by the dropna() below (as is any row with another
# missing value).
merged_df = df_tokenize.merge(
    polarization_df,
    how='right',
    on=['country', 'year']
)
merged_df = merged_df.drop(columns=['entity'])
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,country,year,speech,political polarization score (central estimate),polarization label
0,AFG,1992,"[shall, read, following, statement, behalf, de...",2.775,Polarized
1,AFG,1993,"[permit, first, congratulate, ambassador, insa...",2.775,Polarized
2,AFG,1994,"[gives, pleasure, convey, behalf, islamic, sta...",2.775,Polarized
3,AFG,1995,"[eve, fiftieth, anniversary, nations, represen...",2.775,Polarized
4,AFG,1996,"[outset, allow, sir, express, delegations, con...",2.775,Polarized
...,...,...,...,...,...
22674,ZWE,2020,"[excellency, ambassador, volkan, bozkir, presi...",2.499,Polarized
22675,ZWE,2021,"[excellency, abdulla, shahid, president, sessi...",2.066,Polarized
22676,ZWE,2022,"[singular, honour, deliver, statement, assembl...",1.551,Relatively Polarized
22677,ZWE,2023,"[wish, congratulate, mr, dennis, francis, elec...",1.984,Relatively Polarized


## TF-IDF
The following code calculates the TF-IDF score for each word in every speech. The scores are stored in the DataFrame as a list of (word, TF-IDF score) pairs sorted in descending order, so the highest TF-IDF scores come first.

https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [9]:
# New frame in which each token list is joined back into one space-separated string,
# which is the form passed to the vectorizer below.
df_tf_idf = merged_df.assign(speech=merged_df['speech'].str.join(' '))

speeches = df_tf_idf['speech']

# Learn the vocabulary and IDF weights from all speeches and build the sparse
# speech-by-term TF-IDF matrix in one step.
tfidf_vector = TfidfVectorizer()
tf_idf_matrix = tfidf_vector.fit_transform(speeches)

In [10]:
# Print the sparse TF-IDF matrix (shape plus stored coordinates/values) as a sanity check.
print(tf_idf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7864559 stored elements and shape (9894, 79999)>
  Coords	Values
  (0, 63920)	0.009015963036986485
  (0, 57526)	0.017984991327201022
  (0, 26602)	0.00895966365946245
  (0, 67089)	0.009999600899307853
  (0, 7068)	0.016532495016897616
  (0, 18351)	0.02050410075091664
  (0, 55100)	0.011676124715739289
  (0, 43902)	0.01927325672916616
  (0, 26824)	0.006632779014064316
  (0, 2495)	0.008117366842256859
  (0, 24570)	0.021271411527594847
  (0, 77247)	0.01518489226646538
  (0, 14190)	0.008997527608877445
  (0, 21919)	0.0248843047640319
  (0, 54864)	0.005240860961197499
  (0, 27109)	0.06771922189895671
  (0, 63641)	0.05153348584434548
  (0, 28444)	0.05188160305352253
  (0, 5096)	0.07928571045681823
  (0, 27841)	0.020634246934220276
  (0, 13990)	0.010855891705873123
  (0, 78668)	0.010412368890121966
  (0, 79396)	0.01657304308903224
  (0, 27722)	0.012925234546252105
  (0, 34110)	0.0076403457410477394
  :	:
  (9893, 19925)	0.045556092949

In [11]:
# Vocabulary terms of the fitted vectorizer; matrix_to_tfidf_pairs pairs them
# positionally with the columns of a TF-IDF row.
feature_names = tfidf_vector.get_feature_names_out()

def matrix_to_tfidf_pairs(row):
    """Return the (word, score) pairs of one TF-IDF row, highest score first.

    Only the entries actually stored in the sparse row are visited, instead of
    densifying the row and looping over every vocabulary column; the result is
    the same because columns without a stored value have score 0 and are
    filtered out anyway.

    Parameters
    ----------
    row : scipy sparse matrix with a single row
        One row of the TF-IDF matrix; column j corresponds to `feature_names[j]`
        (module-level array of vocabulary terms).

    Returns
    -------
    list of (word, score) tuples
        Pairs with score > 0, sorted by descending score; equal scores keep
        ascending column order.
    """
    # Work on a private CSR copy so merging duplicate entries never touches the caller's matrix.
    csr_row = row.tocsr(copy=True)
    csr_row.sum_duplicates()
    scored_columns = [
        (int(col), score)
        for col, score in zip(csr_row.indices, csr_row.data)
        if score > 0
    ]
    # Descending score; column index as tie-breaker so ties come out in vocabulary order.
    scored_columns.sort(key=lambda pair: (-pair[1], pair[0]))
    pairs_sorted = [(feature_names[col], score) for col, score in scored_columns]
    return pairs_sorted

# One sorted (word, score) list per speech, built row by row in matrix order.
n_speeches = tf_idf_matrix.shape[0]
df_tf_idf['speech_score'] = [
    matrix_to_tfidf_pairs(tf_idf_matrix[row_idx])
    for row_idx in range(n_speeches)
]

In [12]:
# Preview the key columns together with the sorted (word, score) lists.
df_tf_idf[['country', 'year', 'speech_score']].head()

Unnamed: 0,country,year,speech_score
0,AFG,1992,"[(islamic, 0.5221677639505987), (jirgah, 0.233..."
1,AFG,1993,"[(islamic, 0.20785520388469753), (mines, 0.158..."
2,AFG,1994,"[(islamic, 0.4086762737986923), (herat, 0.1800..."
3,AFG,1995,"[(taliban, 0.31195169642726), (kabul, 0.287801..."
4,AFG,1996,"[(taliban, 0.7821225804972347), (kabul, 0.2563..."


## Linear Regression - Ridge

https://scikit-learn.org/stable/modules/linear_model.html

In [33]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Regression frame: same rows as merged_df, with a shorter name for the score column.
df_lin = merged_df.rename(
    columns={"political polarization score (central estimate)": "polarization score"}
)

# Features are the TF-IDF rows; the target is the raw polarization score.
# (An earlier variant rescaled the target to [-1, 1] with MinMaxScaler; it is not used.)
# NOTE(review): tf_idf_matrix was fitted on all speeches before this split, so the
# IDF weights have already seen the test rows — confirm this is acceptable.
X = tf_idf_matrix
y = df_lin['polarization score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_lin = linear_model.Ridge(alpha=0.1).fit(X_train, y_train)
y_pred = model_lin.predict(X_test)

# Hold-out metrics.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for caption, value in (
    ("Mean Squared Error:", MSE),
    ("Mean Absolute Error:", MAE),
    ("R^2 Score:", r2),
):
    print(caption, value)

Mean Squared Error: 1.136760680692018
Mean Absolute Error: 0.8357692406696215
R^2 Score: 0.434559127565478


## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Features are the TF-IDF rows; the target is the categorical polarization label.
X = tf_idf_matrix
y = merged_df['polarization label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `multi_class='multinomial'` is no longer passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial model is
# already what gets fitted for a multi-class target.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Some labels are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.0 (the value shown before) without emitting
# an undefined-metric warning for each averaged row.
print(classification_report(y_test, y_pred, zero_division=0))



Accuracy: 0.46993431025770593
                      precision    recall  f1-score   support

           Polarized       0.87      0.17      0.28       120
Relatively Polarized       0.55      0.30      0.39       298
   Relatively Stable       0.51      0.54      0.52       367
  Slightly Polarized       0.41      0.67      0.51       492
     Slightly Stable       0.47      0.55      0.50       461
              Stable       0.64      0.20      0.30       197
      Very Polarized       0.00      0.00      0.00        17
         Very stable       0.00      0.00      0.00        27

            accuracy                           0.47      1979
           macro avg       0.43      0.30      0.31      1979
        weighted avg       0.51      0.47      0.45      1979



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Debug inspection: print the sparse training feature matrix from the split above.
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6271824 stored elements and shape (7915, 79999)>
  Coords	Values
  (0, 7068)	0.017506862675892066
  (0, 14190)	0.0190556210795972
  (0, 63641)	0.01091413799266075
  (0, 28444)	0.009988967997816288
  (0, 5096)	0.009877474589281557
  (0, 27841)	0.014566905040794803
  (0, 78668)	0.033078112911048604
  (0, 34110)	0.016181282201887498
  (0, 59962)	0.03383480713401744
  (0, 62672)	0.012484284066831745
  (0, 53148)	0.013181082989762382
  (0, 35528)	0.07762643339756936
  (0, 13356)	0.02209489360883173
  (0, 75935)	0.019851415621355353
  (0, 21644)	0.013317288105085312
  (0, 2812)	0.0481936058329493
  (0, 51422)	0.030708793945282524
  (0, 59444)	0.02350179953696341
  (0, 49847)	0.05442073682369456
  (0, 79246)	0.0231641888867396
  (0, 77715)	0.015335792430812824
  (0, 78724)	0.10638095684095122
  (0, 14953)	0.011027163245881093
  (0, 64781)	0.012412494126723586
  (0, 49776)	0.011323231485807186
  :	:
  (7914, 66190)	0.055158475847537

In [16]:
# Debug inspection: print the training labels from the split above.
print(y_train)

13882     Relatively Stable
2031      Relatively Stable
21460                Stable
8159              Polarized
13972     Relatively Stable
                ...        
12488    Slightly Polarized
11363    Slightly Polarized
11737                Stable
2219        Slightly Stable
16150     Relatively Stable
Name: polarization label, Length: 7915, dtype: category
Categories (8, object): ['Very stable' < 'Stable' < 'Relatively Stable' < 'Slightly Stable' < 'Slightly Polarized' < 'Relatively Polarized' < 'Polarized' < 'Very Polarized']
