Topic Modeling

In [1]:
# Libraries

import pandas as pd
import numpy as np

In [2]:
# Importing data

data = pd.read_csv('speeches.csv')
data.head()

Unnamed: 0,year,header,information,speech,name,link
0,2023,Remarks by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Your Excellency Mr. Chairman, Excellencies, la...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
1,2023,Address by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Honourable Prime Minister, Excellencies, ladie...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
2,2023,Address by OPEC Secretary General,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Your Highness, Excellencies, Distinguished gue...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/71...
3,2022,OPEC Statement to the UN Climate Change Confer...,"Delivered by HE Haitham Al Ghais, OPEC Secreta...","Madame President, distinguished delegates, Th...",HE Haitham Al Ghais,https://www.opec.org/opec_web/en/press_room/70...
4,2022,Keynote address by OPEC Secretary General,"Delivered by HE Mohammad Sanusi Barkindo, OPEC...","Excellencies, ladies and gentlemen,\n\n It is ...",HE Mohammad Sanusi Barkindo,https://www.opec.org/opec_web/en/press_room/69...


Corpus

In [6]:
# Corpus

corpus = []

for index, row in data.iterrows():
    corpus.append({
        'year': str(row['year']),
        'id': f"{str(row['year'])}\n_{index + 1}",
        'document': row['speech']
    })
    
corpus_df = pd.DataFrame(corpus)

In [7]:
# Printing
corpus_df.head()

Unnamed: 0,year,id,document
0,2023,2023\n_1,"Your Excellency Mr. Chairman, Excellencies, la..."
1,2023,2023\n_2,"Honourable Prime Minister, Excellencies, ladie..."
2,2023,2023\n_3,"Your Highness, Excellencies, Distinguished gue..."
3,2022,2022\n_4,"Madame President, distinguished delegates, Th..."
4,2022,2022\n_5,"Excellencies, ladies and gentlemen,\n\n It is ..."


Lemmatization and Stopwords

In [8]:
# Libraries for Lemmatization and Stopwords

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re

# Downloading the stopwords dataset
# nltk.download('stopwords')
# nltk.download('punkt')

# Downloading the spaCy English model
# spacy.cli.download("en_core_web_sm")

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [9]:
# Creating a function to remove stopwords, symbols and lemmatize

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    lemmatized_words = [token.lemma_ for token in nlp(" ".join(filtered_words))]
    return lemmatized_words

# Apply the function to the 'document' column
corpus_df['document'] = corpus_df['document'].apply(preprocess_text)

Topic Modeling: Latent Dirichlet Allocation (LDA) using gensim library

In [10]:
# Libraries for Topic Modeling

from gensim import corpora, models

nlp = spacy.load("en_core_web_sm")

In [21]:
# Topic Modeling

# Creating a bag of words and applying it to each document
bag_words = corpora.Dictionary(corpus_df['document'])
corpus = [bag_words.doc2bow(doc) for doc in corpus_df['document']]

# LDA model
lda_model = models.LdaModel(corpus, num_topics = 5, id2word = bag_words, passes = 10)

# Printing the topics
for idx, topic in lda_model.print_topics():
    print (f'Topic {idx}: {topic}')
# Assigning topics to documents
corpus_df['topic'] = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]
# Printing
# print(corpus_df[['document', 'topic']])

# Get the top terms for each topic
top_terms_per_topic = lda_model.show_topics(num_topics= 4, num_words= 5, formatted= False)

# Print the top 5 words and their probabilities for each topic
for topic_id, topic_words in top_terms_per_topic:
    print(f'Topic {topic_id}:')
    for word, prob in topic_words:
        print(f'  {word}: {prob:.4f}')
    print()

Topic 0: 0.024*"oil" + 0.014*"market" + 0.012*"OPEC" + 0.012*"demand" + 0.011*"price" + 0.009*"year" + 0.009*"growth" + 0.008*"world" + 0.008*"supply" + 0.006*"country"
Topic 1: 0.019*"energy" + 0.014*"OPEC" + 0.012*"oil" + 0.010*"industry" + 0.008*"need" + 0.007*"global" + 0.006*"also" + 0.006*"world" + 0.005*"future" + 0.005*"year"
Topic 2: 0.017*"OPEC" + 0.014*"oil" + 0.012*"market" + 0.007*"global" + 0.007*"year" + 0.007*"industry" + 0.006*"country" + 0.006*"also" + 0.005*"would" + 0.005*"Declaration"
Topic 3: 0.018*"oil" + 0.014*"energy" + 0.011*"OPEC" + 0.010*"demand" + 0.008*"country" + 0.008*"need" + 0.008*"market" + 0.007*"price" + 0.007*"world" + 0.007*"supply"
Topic 4: 0.010*"OPEC" + 0.006*"oil" + 0.006*"country" + 0.005*"market" + 0.004*"Countries" + 0.004*"work" + 0.004*"we" + 0.004*"would" + 0.003*"develop" + 0.003*"Organization"
Topic 4:
  OPEC: 0.0102
  oil: 0.0062
  country: 0.0057
  market: 0.0050
  Countries: 0.0039

Topic 1:
  energy: 0.0190
  OPEC: 0.0142
  oil: 0.

Plotting with pyLDAvis: Python library for interactive topic model to visualize the results of the Latent Dirichlet Allocation (LDA)

In [12]:
# pip install pyLDAvis

Note: you may need to restart the kernel to use updated packages.


In [15]:
# Libraries for Plotting
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import matplotlib.pyplot as plt

In [22]:
# Plotting
vis_data = gensimvis.prepare(lda_model, corpus, bag_words, n_jobs=-1)
pyLDAvis.display(vis_data)
plt.show()

In [23]:
pyLDAvis.display(vis_data)