In [1]:
from pathlib import Path
import pandas as pd
from glob import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christosgeorghiou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/christosgeorghiou/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christosgeorghiou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing the data
This code loops from the folders of TXT and takes every .txt file that does not start with ._ which contain the speeches per country per year. These are then added to a dataframe, storing the year, country code and the speech

In [3]:
path = Path("../TXT")

name_text = []
for folder in path.iterdir():
    if folder.is_dir():
        files = [f for f in folder.glob("*.txt") if not f.name.startswith("._")]

        for file in files:
            name = file.name
            text = file.read_text(encoding="utf-8")
            
            name_text.append({
                        "country": name[:3],
                        "year": name[-8:-4],
                        "speech": text
                    })
    
df = pd.DataFrame(name_text)

df.head(5)

Unnamed: 0,country,year,speech
0,BEL,1950,Never before have men throughout the world mor...
1,BLR,1950,It is now five years since the United Nations ...
2,FRA,1950,"At the conclusion of this general discussion, ..."
3,PAK,1950,May I be permitted to congratulate the General...
4,TUR,1950,The era inaugurated by the creation of the Uni...


In [4]:
df.tail(5)

Unnamed: 0,country,year,speech
10948,LIE,2014,This has been an \nenormously difficult year f...
10949,AZE,2014,"At the outset, \nI would like to congratulate ..."
10950,GRC,2014,This sixty-ninth session of the General Assemb...
10951,ISL,2014,Next year we will \ncelebrate the seventieth a...
10952,HUN,2014,"“If you seek peace, prepare for war.” Those ar..."


## Punctuation - Stopwords - Tokenizing
The following code removes all punctuation from the texts, it also tokenizes the string (returns a list of each word separately as a string) and removes stopwords from it and non alphabetical tokens

https://www.geeksforgeeks.org/nlp/removing-stop-words-nltk-python/ 

https://www.geeksforgeeks.org/python/python-remove-punctuation-from-string/

We also remove words that are related to countries as can be found in the CSV file

https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations

In [5]:
countries_df = pd.read_csv("List_of_adjectival_and_demonymic_forms_for_countries_and_nations_1.csv")
countries_flat = countries_df.values.ravel().tolist()
countries = []

def split_small_capital(text):
    split = re.sub(r'([a-z])([A-Z])', r'\1 \2', text).split()
    countries.extend(split)

for country_adj in countries_flat: split_small_capital(country_adj)

countries = [country.lower()for country in countries]

In [6]:
stop_words = set(stopwords.words('english'))

df_tokenize = df.copy()

def punc_stop_token(speech : str):
    speech_no_punctuation = re.sub(r'[^\w\s]', '', speech)
    tokens = word_tokenize(speech_no_punctuation.lower())
    new_speech = [word for word in tokens if (word.isalpha()) and (word not in stop_words) and (word not in countries)]
    return new_speech

speeches = df_tokenize['speech']
new_speeches = speeches.apply(punc_stop_token)
df_tokenize['speech'] = new_speeches
df_tokenize.head(5)

Unnamed: 0,country,year,speech
0,BEL,1950,"[never, men, throughout, world, fervently, una..."
1,BLR,1950,"[five, years, since, nations, charter, laid, f..."
2,FRA,1950,"[conclusion, general, discussion, much, eloque..."
3,PAK,1950,"[may, permitted, congratulate, general, assemb..."
4,TUR,1950,"[era, inaugurated, creation, nations, one, cri..."


## Polarization score

In [24]:
polarization_df = pd.read_csv("political-polarization-score.csv")
polarization_df.head()

polarization_df.columns = polarization_df.columns.str.lower()

bounds = [ -3, -1, 1, 3]

labels = [
    "Stable",
    "Neutral",
    "Polarized"
]

polarization_df["polarization label"] = pd.cut(
    polarization_df["political polarization score (central estimate)"],
    bins=bounds,
    labels=labels,
    include_lowest=True,
    right=False
)
polarization_df = polarization_df.rename(columns={"code": "country"})


polarization_df

Unnamed: 0,entity,country,year,political polarization score (central estimate),polarization label
0,Afghanistan,AFG,1992,2.775,Polarized
1,Afghanistan,AFG,1993,2.775,Polarized
2,Afghanistan,AFG,1994,2.775,Polarized
3,Afghanistan,AFG,1995,2.775,Polarized
4,Afghanistan,AFG,1996,2.775,Polarized
...,...,...,...,...,...
22674,Zimbabwe,ZWE,2020,2.499,Polarized
22675,Zimbabwe,ZWE,2021,2.066,Polarized
22676,Zimbabwe,ZWE,2022,1.551,Polarized
22677,Zimbabwe,ZWE,2023,1.984,Polarized


In [25]:
for df in (df_tokenize, polarization_df):
    df['country'] = df['country'].astype(str).str.strip()
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

bad_tf = df_tokenize[df_tokenize['year'].isna()]
bad_pol = polarization_df[polarization_df['year'].isna()]

merged_df = df_tokenize.merge(
    polarization_df,
    how='right',
    on=['country', 'year']
)
merged_df = merged_df.drop(columns=['entity'])
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,country,year,speech,political polarization score (central estimate),polarization label
0,AFG,1992,"[shall, read, following, statement, behalf, de...",2.775,Polarized
1,AFG,1993,"[permit, first, congratulate, ambassador, insa...",2.775,Polarized
2,AFG,1994,"[gives, pleasure, convey, behalf, islamic, sta...",2.775,Polarized
3,AFG,1995,"[eve, fiftieth, anniversary, nations, represen...",2.775,Polarized
4,AFG,1996,"[outset, allow, sir, express, delegations, con...",2.775,Polarized
...,...,...,...,...,...
22674,ZWE,2020,"[excellency, ambassador, volkan, bozkir, presi...",2.499,Polarized
22675,ZWE,2021,"[excellency, abdulla, shahid, president, sessi...",2.066,Polarized
22676,ZWE,2022,"[singular, honour, deliver, statement, assembl...",1.551,Polarized
22677,ZWE,2023,"[wish, congratulate, mr, dennis, francis, elec...",1.984,Polarized


## TF-IDF
The following code calculates the TF-IDF score for each word in every speech, this is then stored in the Dataframe as a list of pairs, containing (word, tf-idf score), sorted descendingly, so you get the higher TF-IDF scores first

https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [26]:
df_tf_idf = merged_df.copy()

df_tf_idf['speech'] = df_tf_idf['speech'].str.join(' ')

tfidf_vector = TfidfVectorizer()
speeches = df_tf_idf['speech']

tf_idf_matrix  = tfidf_vector.fit_transform(speeches)

In [27]:
print(tf_idf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7715262 stored elements and shape (9696, 79301)>
  Coords	Values
  (0, 63356)	0.008992710203389036
  (0, 57025)	0.01796484652837454
  (0, 26395)	0.008950440498711592
  (0, 66497)	0.009979434839641954
  (0, 7014)	0.01653837802389735
  (0, 18203)	0.020463824279574792
  (0, 54620)	0.011740077981895043
  (0, 43541)	0.019340801872737134
  (0, 26617)	0.0066284863058919605
  (0, 2473)	0.008107053436387002
  (0, 24377)	0.02123422100085243
  (0, 76573)	0.015146792479547927
  (0, 14076)	0.008976501730049989
  (0, 21741)	0.02485072429747429
  (0, 54386)	0.005237630655317801
  (0, 26902)	0.06765893408092226
  (0, 63081)	0.051500308689315544
  (0, 28225)	0.05189452076004106
  (0, 5054)	0.07932002913483883
  (0, 27631)	0.020654539902360804
  (0, 13878)	0.010832607189594818
  (0, 77983)	0.010420825635106651
  (0, 78705)	0.01656346099742887
  (0, 27514)	0.01291394584459757
  (0, 33852)	0.007639756677311853
  :	:
  (9695, 19769)	0.0457350717

In [28]:
feature_names = tfidf_vector.get_feature_names_out()

def matrix_to_tfidf_pairs(row):
    row_array = row.toarray().flatten()  
    word_tf_idf_pairs = [(word, score) for word, score in zip(feature_names, row_array) if score > 0]
    pairs_sorted = sorted(word_tf_idf_pairs, key=lambda x: x[1], reverse=True)
    return pairs_sorted

df_tf_idf['speech_score'] = [matrix_to_tfidf_pairs(tf_idf_matrix[i]) for i in range(tf_idf_matrix.shape[0])]

In [29]:
df_tf_idf[['country', 'year', 'speech_score']].head()

Unnamed: 0,country,year,speech_score
0,AFG,1992,"[(islamic, 0.5224163901117383), (jirgah, 0.233..."
1,AFG,1993,"[(islamic, 0.20781826624680713), (mines, 0.158..."
2,AFG,1994,"[(islamic, 0.40860933366389796), (herat, 0.179..."
3,AFG,1995,"[(taliban, 0.31177407033282845), (kabul, 0.288..."
4,AFG,1996,"[(taliban, 0.7818927247795634), (kabul, 0.2568..."


## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [31]:
X = tf_idf_matrix
y = merged_df['polarization label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.6829896907216495
              precision    recall  f1-score   support

     Neutral       0.65      0.90      0.76      1003
   Polarized       0.77      0.32      0.46       404
      Stable       0.76      0.54      0.63       533

    accuracy                           0.68      1940
   macro avg       0.73      0.59      0.62      1940
weighted avg       0.71      0.68      0.66      1940



In [49]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6271824 stored elements and shape (7915, 79999)>
  Coords	Values
  (0, 7068)	0.017506862675892066
  (0, 14190)	0.0190556210795972
  (0, 63641)	0.01091413799266075
  (0, 28444)	0.009988967997816288
  (0, 5096)	0.009877474589281557
  (0, 27841)	0.014566905040794803
  (0, 78668)	0.033078112911048604
  (0, 34110)	0.016181282201887498
  (0, 59962)	0.03383480713401744
  (0, 62672)	0.012484284066831745
  (0, 53148)	0.013181082989762382
  (0, 35528)	0.07762643339756936
  (0, 13356)	0.02209489360883173
  (0, 75935)	0.019851415621355353
  (0, 21644)	0.013317288105085312
  (0, 2812)	0.0481936058329493
  (0, 51422)	0.030708793945282524
  (0, 59444)	0.02350179953696341
  (0, 49847)	0.05442073682369456
  (0, 79246)	0.0231641888867396
  (0, 77715)	0.015335792430812824
  (0, 78724)	0.10638095684095122
  (0, 14953)	0.011027163245881093
  (0, 64781)	0.012412494126723586
  (0, 49776)	0.011323231485807186
  :	:
  (7914, 66190)	0.055158475847537

In [50]:
print(y_train)

13882     Relatively Stable
2031      Relatively Stable
21460                Stable
8159              Polarized
13972     Relatively Stable
                ...        
12488    Slightly Polarized
11363    Slightly Polarized
11737                Stable
2219        Slightly Stable
16150     Relatively Stable
Name: polarization label, Length: 7915, dtype: category
Categories (8, object): ['Very stable' < 'Stable' < 'Relatively Stable' < 'Slightly Stable' < 'Slightly Polarized' < 'Relatively Polarized' < 'Polarized' < 'Very Polarized']
