In [1]:
from pathlib import Path
import pandas as pd
from glob import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Fetch the NLTK resources this notebook relies on; each call is a no-op when
# the package is already present in the local nltk_data directory.
nltk.download('stopwords')                       # word list behind stopwords.words('english')
nltk.download('averaged_perceptron_tagger_eng')  # tagger model for nltk.pos_tag (imported above)
nltk.download('punkt')                           # Punkt tokenizer data used by older NLTK releases
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# instead of 'punkt'; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ileshyadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/ileshyadav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ileshyadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing the data
This code iterates over the sub-folders of `TXT` and reads every `.txt` file whose name does not start with `._`; each file contains the speech of one country for one year. The speeches are then collected in a DataFrame that stores the country code, the year and the speech text.

In [None]:
path = Path("../TXT")

# Build one record per speech file. Folders and files are visited in sorted
# order because iterdir()/glob() return entries in filesystem-dependent order,
# which would make the row order of `df` differ between machines.
name_text = []
for folder in sorted(path.iterdir()):
    if not folder.is_dir():
        continue
    for file in sorted(folder.glob("*.txt")):
        # Files starting with "._" are skipped (presumably macOS metadata
        # side-car files rather than speeches — confirm).
        if file.name.startswith("._"):
            continue
        name = file.name
        name_text.append({
            "country": name[:3],     # first three characters of the file name
            "year": name[-8:-4],     # the four characters right before ".txt"
            "speech": file.read_text(encoding="utf-8"),
        })

# Passing the columns explicitly keeps the expected schema even when no file
# was found, so later cells fail on "no rows" rather than on a missing column.
df = pd.DataFrame(name_text, columns=["country", "year", "speech"])

df.head(5)

In [None]:
# Show the last five loaded speeches as a quick check of the end of the frame.
df.tail(5)

## Punctuation - Stopwords - Tokenizing
The following code removes all punctuation from the texts and tokenizes each speech (returning a list in which every word is a separate string). It then removes stopwords and non-alphabetical tokens.

https://www.geeksforgeeks.org/nlp/removing-stop-words-nltk-python/ 

https://www.geeksforgeeks.org/python/python-remove-punctuation-from-string/

We also remove country-related words (country names, adjectives and demonyms), which are listed in the CSV file. Source of the list:

https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations

In [None]:
# Load the table of country names / adjectives / demonyms and flatten every
# cell into a single list of raw values.
countries_df = pd.read_csv("List_of_adjectival_and_demonymic_forms_for_countries_and_nations_1.csv")
countries_flat = countries_df.values.ravel().tolist()
countries = []

def split_small_capital(text):
    """Split one CSV cell into words and append them to the global ``countries``.

    A space is inserted wherever a lower-case letter is directly followed by an
    upper-case letter (so two terms glued together in one cell are separated),
    then the text is split on whitespace.

    Cells that are not strings are ignored: pandas represents empty CSV cells
    as NaN floats, and passing those to ``re.sub`` would raise ``TypeError``.
    """
    if not isinstance(text, str):
        return
    split = re.sub(r'([a-z])([A-Z])', r'\1 \2', text).split()
    countries.extend(split)

for country_adj in countries_flat:
    split_small_capital(country_adj)

# Lower-case so the terms can be compared with the lower-cased speech tokens.
# NOTE(review): terms are not stripped of punctuation here, whereas the speeches
# are; any CSV term containing punctuation will never match — confirm the CSV.
countries = [country.lower() for country in countries]

In [None]:
stop_words = set(stopwords.words('english'))

# One combined set gives O(1) membership tests. `countries` is a list, so
# testing every token against it directly scans the whole list each time.
excluded_words = stop_words.union(countries)

df_tokenize = df.copy()

def punc_stop_token(speech : str):
    """Turn a raw speech into a list of cleaned, lower-case word tokens.

    Steps: delete every character that is neither a word character nor
    whitespace, lower-case and tokenize the text, then keep only purely
    alphabetic tokens that are neither English stop words nor country terms.

    Parameters
    ----------
    speech : str
        Raw speech text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    speech_no_punctuation = re.sub(r'[^\w\s]', '', speech)
    tokens = word_tokenize(speech_no_punctuation.lower())
    new_speech = [word for word in tokens if word.isalpha() and word not in excluded_words]
    return new_speech

speeches = df_tokenize['speech']
new_speeches = speeches.apply(punc_stop_token)
df_tokenize['speech_token'] = new_speeches
df_tokenize.head(5)

In [None]:
# --- Bigram frequency check (after tokenization) ---
from collections import Counter
from nltk import ngrams

# Collect every pair of adjacent tokens across all speeches.
all_bigrams = []
for tokens in df_tokenize['speech_token']:
    # Only token lists are processed; anything else (e.g. a missing value) is skipped.
    if isinstance(tokens, list):
        all_bigrams.extend(ngrams(tokens, 2))

bigram_counts = Counter(all_bigrams)

print("Top 20 most common bigrams:")
for (w1, w2), cnt in bigram_counts.most_common(20):
    # Pad the whole bigram (not just its second word) so the counts line up
    # in one column regardless of the length of the first word.
    bigram = f"{w1} {w2}"
    print(f"{bigram:<30} {cnt}")

## Polarization score

In [None]:
# Load the polarization scores and normalise all column names to lower case.
polarization_df = pd.read_csv("political-polarization-score.csv")
polarization_df.columns = polarization_df.columns.str.lower()

# Half-open score bands [-3, -1), [-1, 1), [1, 3) and the label for each band.
# NOTE(review): with right=False a score of exactly 3, and any score outside
# [-3, 3), receives no label (NaN); such rows are later removed by dropna() in
# the merge cell — confirm this is intended for the score range of the data.
bounds = [-3, -1, 1, 3]
labels = ["Stable", "Neutral", "Polarized"]

score_column = "political polarization score (central estimate)"
polarization_df["polarization label"] = pd.cut(
    polarization_df[score_column],
    bins=bounds,
    labels=labels,
    include_lowest=True,
    right=False,
)

# Rename "code" to "country" so it matches the key column of the speech frame.
polarization_df = polarization_df.rename(columns={"code": "country"})

polarization_df

In [None]:
# Normalise the join keys in both frames so they compare equal: country as a
# stripped string, year as a nullable integer (unparseable years become <NA>).
# The loop variable is deliberately not called `df`: that name holds the raw
# speeches frame, and rebinding it here would leave `df` pointing at
# polarization_df, breaking any re-run of the tokenization cell.
for frame in (df_tokenize, polarization_df):
    frame['country'] = frame['country'].astype(str).str.strip()
    frame['year'] = pd.to_numeric(frame['year'], errors='coerce').astype('Int64')

# Rows whose year could not be parsed, kept for manual inspection.
bad_tf = df_tokenize[df_tokenize['year'].isna()]
bad_pol = polarization_df[polarization_df['year'].isna()]

# how='right' keeps every polarization row; the dropna() below then discards
# all rows with any missing value (e.g. country-years without a speech).
merged_df = df_tokenize.merge(
    polarization_df,
    how='right',
    on=['country', 'year']
)
merged_df = merged_df.drop(columns=['entity'])
merged_df = merged_df.dropna()
merged_df

## TF-IDF
The following code calculates the TF-IDF score for each word in every speech. The scores are stored in the DataFrame as a list of (word, TF-IDF score) pairs, sorted in descending order so that the highest TF-IDF scores come first.

https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [None]:
# New frame in which each token list is joined back into one space-separated
# string, the input format TfidfVectorizer expects; merged_df keeps its lists.
df_tf_idf = merged_df.assign(
    speech_token=merged_df['speech_token'].str.join(' ')
)

speeches = df_tf_idf['speech_token']

# Fit the vocabulary / IDF weights on all merged speeches and build the
# document-term matrix (one row per speech, in the row order of df_tf_idf).
tfidf_vector = TfidfVectorizer()
tf_idf_matrix = tfidf_vector.fit_transform(speeches)

In [None]:
# Inspect the TF-IDF matrix returned by fit_transform (one row per speech).
print(tf_idf_matrix)

In [None]:
# Vocabulary terms, indexed by column of tf_idf_matrix.
feature_names = tfidf_vector.get_feature_names_out()

def matrix_to_tfidf_pairs(row):
    """Return the (word, score) pairs of one speech, highest score first.

    Parameters
    ----------
    row : scipy sparse matrix
        A single row of the TF-IDF matrix (shape 1 x vocabulary size).

    Returns
    -------
    list of (str, float)
        Every term with a score above zero, sorted by descending score; terms
        with equal scores stay in vocabulary (column) order.

    Only the entries actually stored in the sparse row are visited. Converting
    the row to a dense array and zipping it against the whole vocabulary costs
    time proportional to the vocabulary size for every single speech.
    """
    row = row.tocsr()
    stored = [(idx, score) for idx, score in zip(row.indices, row.data) if score > 0]
    # Descending score; the column index breaks ties exactly as a stable
    # descending sort over the vocabulary-ordered dense row would.
    stored.sort(key=lambda pair: (-pair[1], pair[0]))
    return [(feature_names[idx], score) for idx, score in stored]

df_tf_idf['speech_score'] = [matrix_to_tfidf_pairs(tf_idf_matrix[i]) for i in range(tf_idf_matrix.shape[0])]

In [None]:
# Preview the sorted (word, TF-IDF score) pairs for the first few speeches.
df_tf_idf[['country', 'year', 'speech_score']].head()

In [None]:
# --- TF-IDF with unigrams + bigrams (n-grams) ---
# (TfidfVectorizer is already imported in the first cell of the notebook.)

# Use the cleaned, re-joined tokens created above. The raw 'speech' column
# still contains punctuation, stop words and country terms, so it is not the
# "joined text" this cell is meant to work on.
texts = df_tf_idf['speech_token']  # already ' '.join(tokens)

tfidf_ngram = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",  # keep words and numbers
    ngram_range=(1, 2),            # unigrams + bigrams
    min_df=2,                      # ignore terms found in fewer than 2 speeches
    max_df=0.9,                    # ignore terms found in more than 90% of speeches
)

tf_idf_matrix_ngram = tfidf_ngram.fit_transform(texts)
feature_names_ngram = tfidf_ngram.get_feature_names_out()

print("Docs:", tf_idf_matrix_ngram.shape[0],
      "Vocab size (with bigrams):", tf_idf_matrix_ngram.shape[1])

def matrix_to_tfidf_pairs_ng(row):
    """Return the (n-gram, score) pairs of one speech, highest score first.

    Parameters
    ----------
    row : scipy sparse matrix
        A single row of ``tf_idf_matrix_ngram`` (shape 1 x vocabulary size).

    Returns
    -------
    list of (str, float)
        Every unigram/bigram with a score above zero, sorted by descending
        score; equal scores stay in vocabulary (column) order.

    Only the stored entries of the sparse row are visited, instead of
    densifying the row and scanning the whole n-gram vocabulary per speech.
    """
    row = row.tocsr()
    stored = [(idx, s) for idx, s in zip(row.indices, row.data) if s > 0]
    # Descending score; the column index keeps ties in vocabulary order.
    stored.sort(key=lambda pair: (-pair[1], pair[0]))
    return [(feature_names_ngram[idx], s) for idx, s in stored]

# store bigram-aware scores in a NEW column, while keeping original intact
df_tf_idf['speech_score_bigrams'] = [
    matrix_to_tfidf_pairs_ng(tf_idf_matrix_ngram[i])
    for i in range(tf_idf_matrix_ngram.shape[0])
]

# quick peek into the dataframe with speech score for both unigrams AND bigrams
df_tf_idf[['country', 'year', 'speech_score_bigrams']].head()

## Linear Regression - Ridge

https://scikit-learn.org/stable/modules/linear_model.html

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Regression frame: a renamed copy of merged_df with a shorter target column name.
df_lin = merged_df.rename(
    columns={"political polarization score (central estimate)": "polarization score"}
)

# Features: TF-IDF rows, built from a copy of merged_df and therefore in the
# same row order as df_lin. Target: the continuous polarization score.
# NOTE(review): the IDF weights were fitted on all speeches before this split,
# and the split is random over country-years — confirm both are acceptable.
X = tf_idf_matrix
y = df_lin['polarization score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ridge (L2-regularised) linear regression on the sparse TF-IDF features.
model_lin = linear_model.Ridge(alpha=0.1)
model_lin.fit(X_train, y_train)
y_pred = model_lin.predict(X_test)

# Hold-out error metrics.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_name, metric_value in (
    ("Mean Squared Error:", MSE),
    ("Mean Absolute Error:", MAE),
    ("R^2 Score:", r2),
):
    print(metric_name, metric_value)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Features: TF-IDF rows; target: the categorical polarization label
# (rows of tf_idf_matrix follow the row order of merged_df).
X = tf_idf_matrix
y = merged_df['polarization label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# With the lbfgs solver scikit-learn already fits a multinomial (softmax) model
# for targets with more than two classes, so the result is unchanged. The
# explicit multi_class='multinomial' argument is deprecated since scikit-learn
# 1.5 and scheduled for removal, so it is no longer passed.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1 on the hold-out set.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Debug output: training feature matrix from the most recent train_test_split.
print(X_train)

In [None]:
# Debug output: training targets from the most recent train_test_split.
print(y_train)