In [1]:
from pathlib import Path
import pandas as pd
from glob import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources used further down. nltk.download() reports
# "already up-to-date" and returns quickly when a package is already installed.
nltk.download('vader_lexicon')                   # lexicon behind SentimentIntensityAnalyzer
nltk.download('stopwords')                       # English stop-word list
nltk.download('averaged_perceptron_tagger_eng')  # model used by pos_tag
# NLTK releases that use the "*_eng" tagger name above load the tokenizer
# tables from 'punkt_tab' rather than the legacy pickled 'punkt' model, so
# word_tokenize raises LookupError on a fresh machine without this package.
nltk.download('punkt_tab')
nltk.download('punkt')                           # legacy tokenizer model, kept for older NLTK versions

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\20210896\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20210896\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\20210896\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20210896\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing the data
This code loops over the sub-folders of the `TXT` directory and reads every `.txt` file whose name does not start with `._`; each file contains the speech of one country for one year. The speeches are then collected into a DataFrame that stores the country code, the year, and the speech text.

In [2]:
# NOTE(review): machine-specific absolute path; the notebook only runs where
# this exact directory exists. Consider moving it to a configurable constant.
path = Path("C:\\Users\\20210896\\OneDrive - TU Eindhoven\\Documents\\1Masters\\FDS\\Assigment 1\\UNGDC_1946-2024\\TXT")

name_text = []
# Path.iterdir() and Path.glob() yield entries in arbitrary, OS-dependent
# order. Sorting both levels makes the row order (and therefore the integer
# index of `df`) reproducible across machines and re-runs.
for folder in sorted(path.iterdir()):
    if not folder.is_dir():
        continue

    # Ignore "._*" files (presumably macOS metadata sidecars, not real speeches).
    files = sorted(f for f in folder.glob("*.txt") if not f.name.startswith("._"))

    for file in files:
        name = file.name
        text = file.read_text(encoding="utf-8")

        name_text.append({
            "country": name[:3],    # first three characters of the file name
            "year": name[-8:-4],    # the four characters right before ".txt"
            "speech": text
        })

# Passing the columns explicitly keeps the expected schema even when no file
# was found, so later cells fail with a clear "empty frame" instead of a KeyError.
df = pd.DataFrame(name_text, columns=["country", "year", "speech"])

df.head(5)

Unnamed: 0,country,year,speech
0,ARG,1946,At the resumption of the first session of the ...
1,AUS,1946,The General Assembly of the United Nations is ...
2,BEL,1946,The\tprincipal organs of the United Nations ha...
3,BLR,1946,As more than a year has elapsed since the Unit...
4,BOL,1946,Coming to this platform where so many distingu...


In [3]:
# Look at the last rows as well, to check the other end of the loaded data.
df.tail(5)

Unnamed: 0,country,year,speech
10948,WSM,2024,"Excellencies, \nI extend my congratulations t..."
10949,YEM,2024,"Your Majesties, Excellencies, and Highnesses, ..."
10950,ZAF,2024,President of the 79th Session of the UN Genera...
10951,ZMB,2024,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O..."
10952,ZWE,2024,"Your Excellency, Mr. Philemon Yang, President ..."


## Punctuation - Stopwords - Tokenizing
The following code removes all punctuation from the texts, lower-cases and tokenizes each speech (returning a list in which every word is a separate string), and then removes stopwords and non-alphabetical tokens.

https://www.geeksforgeeks.org/nlp/removing-stop-words-nltk-python/ 

https://www.geeksforgeeks.org/python/python-remove-punctuation-from-string/

We also remove country-related words (country names, adjectives, and demonyms). These are listed in the CSV file, which is based on:

https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations

In [4]:
# Read the table of country names / adjectives / demonyms and flatten every
# cell, row by row, into one plain Python list.
countries_df = pd.read_csv("List_of_adjectival_and_demonymic_forms_for_countries_and_nations_1.csv")
countries_flat = countries_df.to_numpy().ravel().tolist()

# Accumulator that split_small_capital() appends the individual words to.
countries = []

def split_small_capital(text):
    """Split one CSV cell into words and append them to the global `countries`.

    A space is inserted wherever a lowercase letter is immediately followed by
    an uppercase letter (e.g. "DutchNetherlandic" -> "Dutch Netherlandic"),
    after which the text is split on whitespace.

    Parameters
    ----------
    text : object
        A cell value from the countries table. Non-string values (for example
        the float NaN that pandas uses for empty cells) are ignored instead of
        crashing `re.sub` with a TypeError.

    Returns
    -------
    None
        The words are added to the module-level list `countries`.
    """
    if not isinstance(text, str):
        return
    pieces = re.sub(r'([a-z])([A-Z])', r'\1 \2', text).split()
    countries.extend(pieces)

# Run every flattened CSV cell through the splitter, which collects the
# individual words in `countries`.
for country_adj in countries_flat:
    split_small_capital(country_adj)

# Lower-case every collected word.
countries = list(map(str.lower, countries))

In [5]:
# English stop words from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words('english'))

# Work on a copy so the raw speeches in `df` are left untouched.
df_tokenize = df.copy()

def punc_stop_token(speech : str):
    """Turn a raw speech into a list of cleaned, lower-case word tokens.

    Steps: strip every character that is neither a word character nor
    whitespace, lower-case the text, tokenize it with `word_tokenize`, then
    keep only purely alphabetic tokens that are neither in the global
    `stop_words` nor in the global `countries`.

    Parameters
    ----------
    speech : str
        The raw speech text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Merge both exclusion collections into a single set once per speech, so
    # each token costs one hash lookup instead of a linear scan through the
    # `countries` list.
    excluded = stop_words.union(countries)

    speech_no_punctuation = re.sub(r'[^\w\s]', '', speech)
    tokens = word_tokenize(speech_no_punctuation.lower())
    return [word for word in tokens if word.isalpha() and word not in excluded]

# Tokenize every speech and replace the text column with the token lists.
speeches = df_tokenize['speech']
new_speeches = speeches.apply(punc_stop_token)
df_tokenize = df_tokenize.assign(speech=new_speeches)
df_tokenize.head()

Unnamed: 0,country,year,speech
0,ARG,1946,"[resumption, first, session, general, assembly..."
1,AUS,1946,"[general, assembly, nations, meeting, time, ho..."
2,BEL,1946,"[principal, organs, nations, functioning, near..."
3,BLR,1946,"[year, elapsed, since, nations, charter, signe..."
4,BOL,1946,"[coming, platform, many, distinguished, eloque..."


## TF-IDF
The following code calculates the TF-IDF score for each word in every speech. The result is stored in the DataFrame as a list of (word, TF-IDF score) pairs, sorted in descending order so that the highest-scoring words come first.

https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [6]:
# Join each token list back into one space-separated string, on a fresh
# frame so `df_tokenize` keeps its token lists.
df_tf_idf = df_tokenize.assign(speech=df_tokenize['speech'].str.join(' '))

speeches = df_tf_idf['speech']
tfidf_vector = TfidfVectorizer()

# Fit the vectorizer on all speeches and transform them into the TF-IDF
# matrix (one row per speech).
tf_idf_matrix = tfidf_vector.fit_transform(speeches)

In [7]:
# Vocabulary terms of the fitted vectorizer; paired position-by-position with
# the scores of each matrix row below.
feature_names = tfidf_vector.get_feature_names_out()

def matrix_to_tfidf_pairs(row):
    """Convert one sparse TF-IDF row into (word, score) pairs, best first.

    Only the stored non-zero entries of the row are visited, instead of
    densifying the row and walking the whole vocabulary for every speech.

    Parameters
    ----------
    row : scipy sparse matrix
        A single-row slice of the TF-IDF matrix (e.g. `tf_idf_matrix[i]`).
        Its columns must line up with the global `feature_names`.

    Returns
    -------
    list of (str, float)
        Every word with a score > 0, sorted by score in descending order.
        Words with equal scores keep their vocabulary (column) order.
    """
    row_csr = row.tocsr()
    # indptr[1] marks the end of the first row's entries, so only that row is
    # read even if a multi-row matrix is passed in.
    end = row_csr.indptr[1]
    columns = row_csr.indices[:end]
    scores = row_csr.data[:end]

    # Sort on (-score, column): descending score, ties in ascending column
    # order, whatever order the sparse entries happen to be stored in.
    entries = sorted(
        ((col, score) for col, score in zip(columns, scores) if score > 0),
        key=lambda entry: (-entry[1], entry[0]),
    )
    return [(feature_names[col], score) for col, score in entries]

# One sorted (word, score) list per speech, built row by row from the matrix.
df_tf_idf['speech_score'] = [
    matrix_to_tfidf_pairs(tf_idf_matrix[row_idx])
    for row_idx in range(tf_idf_matrix.shape[0])
]

In [8]:
# Show the (word, score) pairs of the speech with index label 0.
df_tf_idf.loc[0, 'speech_score']

[('arbitration', np.float64(0.27761745595409615)),
 ('francisco', np.float64(0.11976783241985094)),
 ('privilege', np.float64(0.11751665191650619)),
 ('misiones', np.float64(0.11458023474321136)),
 ('verdi', np.float64(0.11458023474321136)),
 ('downfall', np.float64(0.10566267467727397)),
 ('truth', np.float64(0.10173973889696954)),
 ('composer', np.float64(0.10147915688001788)),
 ('signed', np.float64(0.09939367113859544)),
 ('us', np.float64(0.09653875504550573)),
 ('chaco', np.float64(0.09538750258076913)),
 ('verdicts', np.float64(0.0905522845453208)),
 ('musical', np.float64(0.08905969967406435)),
 ('charter', np.float64(0.08838403749604716)),
 ('arbitrator', np.float64(0.08653981443530452)),
 ('war', np.float64(0.08560428755622343)),
 ('civilization', np.float64(0.08505964213538748)),
 ('must', np.float64(0.08253376631323568)),
 ('frontier', np.float64(0.07858728931363226)),
 ('hat', np.float64(0.07549990698906293)),
 ('victory', np.float64(0.07266151805675403)),
 ('international

In [9]:
# Preview the identifying columns next to the TF-IDF pairs.
df_tf_idf.loc[:, ['country', 'year', 'speech_score']].head(5)

Unnamed: 0,country,year,speech_score
0,ARG,1946,"[(arbitration, 0.27761745595409615), (francisc..."
1,AUS,1946,"[(atomic, 0.24403156852522698), (veto, 0.23323..."
2,BEL,1946,"[(council, 0.2572327035881609), (court, 0.1883..."
3,BLR,1946,"[(franco, 0.5924561211308828), (fascist, 0.205..."
4,BOL,1946,"[(dumbarton, 0.23079860103998934), (oaks, 0.23..."


## NLTK Sentiment Analysis
The following code uses NLTK's VADER `SentimentIntensityAnalyzer` to compute polarity scores for every raw (unprocessed) speech. The result of `polarity_scores` (a dictionary containing, among others, the `neg`, `neu` and `pos` scores) is stored per speech in a new `sentiment` column of `df`.

https://www.nltk.org/howto/sentiment.html

In [10]:
# Create the VADER analyzer once and reuse it for every speech.
sia = SentimentIntensityAnalyzer()

# Score the raw speech text; each cell of the new column holds the dictionary
# returned by polarity_scores().
# NOTE(review): VADER is presumably tuned for short texts; confirm that scoring
# a whole speech in one call gives meaningful values.
df['sentiment'] = df['speech'].apply(sia.polarity_scores)
df.loc[:, ['country', 'year', 'sentiment']].head(5)

Unnamed: 0,country,year,sentiment
0,ARG,1946,"{'neg': 0.074, 'neu': 0.739, 'pos': 0.187, 'co..."
1,AUS,1946,"{'neg': 0.048, 'neu': 0.774, 'pos': 0.178, 'co..."
2,BEL,1946,"{'neg': 0.061, 'neu': 0.778, 'pos': 0.16, 'com..."
3,BLR,1946,"{'neg': 0.091, 'neu': 0.729, 'pos': 0.18, 'com..."
4,BOL,1946,"{'neg': 0.058, 'neu': 0.787, 'pos': 0.156, 'co..."
