TOPIC MODELING

In [1]:
# Libraries

import pandas as pd
import numpy as np

In [2]:
# Importing data

data = pd.read_csv('speeches.csv')
data.head()

Unnamed: 0,year,header,information,speech,name,link
0,2023,Remarks by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Your Excellency Mr. Chairman, Excellencies, la...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
1,2023,Address by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Honourable Prime Minister, Excellencies, ladie...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
2,2023,Address by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Your Highness, Excellencies, Distinguished gue...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
3,2022,OPEC Statement to the UN Climate Change Confer...,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Madame President, distinguished delegates, Th...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/70...
4,2022,Keynote address by OPEC Secretary General,"Delivered by HE Mohammad Sanusi Barkindo, OPEC...","Excellencies, ladies and gentlemen,\n\n It is ...",HE Mohammad Sanusi Barkindo,https://www.opec.org/opec_web/en/press_room/69...


In [3]:
# Need to clean each speech: dropping first and last sentence

In [4]:
# data['speech'] = data['speech'].str.contains('\n\n')

In [5]:
# EMPEZAMOS CON TOPIC MODELING

Corpus

In [6]:
# Corpus

corpus = []

for index, row in data.iterrows():
    corpus.append({
        'year': str(row['year']),
        'id': f"{str(row['year'])}\n_{index + 1}",
        'document': row['speech']
    })
    
corpus_df = pd.DataFrame(corpus)

In [7]:
# Printing
corpus_df.head()

Unnamed: 0,year,id,document
0,2023,2023\n_1,"Your Excellency Mr. Chairman, Excellencies, la..."
1,2023,2023\n_2,"Honourable Prime Minister, Excellencies, ladie..."
2,2023,2023\n_3,"Your Highness, Excellencies, Distinguished gue..."
3,2022,2022\n_4,"Madame President, distinguished delegates, Th..."
4,2022,2022\n_5,"Excellencies, ladies and gentlemen,\n\n It is ..."


Lemmatization and Stopwords

In [8]:
# Libraries for Lemmatization and Stopwords

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re

# Downloading the stopwords dataset
# nltk.download('stopwords')
# nltk.download('punkt')

# Download the spaCy English model
# spacy.cli.download("en_core_web_sm")

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [9]:
# Creating a function to remove stopwords, symbols and lemmatize

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    lemmatized_words = [token.lemma_ for token in nlp(" ".join(filtered_words))]
    return lemmatized_words

# Apply the function to the 'document' column
corpus_df['document'] = corpus_df['document'].apply(preprocess_text)

Topic Modeling: Latent Dirichlet Allocation (LDA) using gensim library.

In [10]:
# Libraries for Topic Modeling

from gensim import corpora, models

nlp = spacy.load("en_core_web_sm")

In [11]:
# Adding extra words to the bag of words
bag_words = []
extra_words = ['OPEC']
for word in extra_words:
    if word not in bag_words.token2id:
        bag_words.add_documents([[word]])




# Topic Modeling

# Creating a bag of words and applying it to each document
bag_words = corpora.Dictionary(corpus_df['document'])
corpus = [bag_words.doc2bow(doc) for doc in corpus_df['document']]

# LDA model
lda_model = models.LdaModel(corpus, num_topics = 5, id2word = bag_words, passes = 10)

# Printing the topics
for idx, topic in lda_model.print_topics():
    print (f'Topic {idx}: {topic}')
# Assigning topics to documents
corpus_df['topic'] = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]
# Printing
# print(corpus_df[['document', 'topic']])

# Get the top terms for each topic
top_terms_per_topic = lda_model.show_topics(num_topics=5, num_words=5, formatted=False)

# Print the top 5 words and their probabilities for each topic
for topic_id, topic_words in top_terms_per_topic:
    print(f'Topic {topic_id}:')
    for word, prob in topic_words:
        print(f'  {word}: {prob:.4f}')
    print()

AttributeError: 'list' object has no attribute 'token2id'

In [None]:
from gensim import corpora, models

# Assuming 'corpus' and 'dictionary' are already created

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=bag_words, passes=15)

# Get the top terms for each topic
top_terms_per_topic = lda_model.show_topics(num_topics=5, num_words=5, formatted=False)

# Print the top 5 words and their probabilities for each topic
for topic_id, topic_words in top_terms_per_topic:
    print(f'Topic {topic_id}:')
    for word, prob in topic_words:
        print(f'  {word}: {prob:.4f}')
    print()
