In [None]:
import os

import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources the preprocessing cell relies on.
# `word_tokenize` needs the Punkt sentence-tokenizer data: NLTK releases up to
# 3.8.1 look it up as 'punkt', while 3.8.2 and later look for 'punkt_tab', so
# both are requested to keep the notebook runnable on either.
# 'stopwords' backs `stopwords.words('english')`.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load the Bloomberg headline sample (path is relative to the working directory)
# and show the first rows as a sanity check.
df = pd.read_csv('data/News Articles/Bloomberg/BloombergNews100.csv')
df.head()

: 

In [2]:
# Drop rows whose 'Headline' is missing so that every remaining value can be
# lower-cased and tokenized below; `df` is rebound to the filtered frame.
df = df.dropna(subset=['Headline'])

# Cache for the English stop-word set. It is filled on first use so that
# preprocessing many headlines reads the NLTK word list only once instead of
# once per headline.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Define a function to preprocess the text
def preprocess_text(text, stop_words=None):
    """Turn one piece of text into a list of cleaned, lowercase tokens.

    The text is lower-cased and tokenized with `word_tokenize`; tokens that
    are not purely alphabetic (numbers, punctuation, mixed tokens such as
    'q3') are discarded, and so are stop words.

    Parameters
    ----------
    text : str
        Raw text, e.g. a single headline.
    stop_words : collection of str, optional
        Lowercase words to remove. When omitted, NLTK's English stop-word
        list is used (loaded once and reused across calls).

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())  # tokenize the lower-cased text
        if token.isalpha() and token not in stop_words
    ]

# Clean and tokenize every headline, storing the token lists in a new column
headlines = df['Headline']
df['processed_content'] = headlines.apply(preprocess_text)

# Show the first few token lists as a quick check
df['processed_content'].head()

0    [ivory, coast, keeps, cocoa, export, tax, docu...
1    [usda, boxed, beef, cutout, closing, prices, o...
2          [september, small, business, jobs, summary]
3    [greece, gsee, says, meet, talks, troika, athens]
4           [companies, get, tax, breaks, hundt, says]
Name: processed_content, dtype: object

In [3]:
# Build the token -> integer id dictionary, then encode each headline's tokens
# as a bag-of-words vector for LDA
token_lists = df['processed_content']
dictionary = corpora.Dictionary(token_lists)
corpus = list(map(dictionary.doc2bow, token_lists))

# Peek at the first two documents: each is a list of (term_id, term_frequency) tuples
print(corpus[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]


In [4]:
# Model settings, gathered here so they are easy to find and tune.
NUM_TOPICS = 8
NUM_PASSES = 10
RANDOM_SEED = 42  # seeds LDA's random initialisation so reruns on the same data match
VIS_OUTPUT_PATH = 'data/LDA html charts/LDA bloomberg.html'

# Train the LDA model
lda_model = LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)

# Print the topics with top words
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# Calculate Coherence Score ('c_v' measure) from the tokenized headlines
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_content'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# Visualize the topics using pyLDAvis and save the chart as HTML. The output
# folder is created first so the save does not fail when it is missing.
vis = gensimvis.prepare(lda_model, corpus, dictionary)
os.makedirs(os.path.dirname(VIS_OUTPUT_PATH), exist_ok=True)
pyLDAvis.save_html(vis, VIS_OUTPUT_PATH)

Topic 0: 0.027*"says" + 0.012*"eu" + 0.012*"world" + 0.012*"gas" + 0.010*"obama" + 0.010*"companies" + 0.009*"power" + 0.008*"deal" + 0.007*"budget" + 0.007*"plan"
Topic 1: 0.061*"says" + 0.033*"bank" + 0.016*"february" + 0.014*"may" + 0.013*"credit" + 0.012*"rate" + 0.012*"central" + 0.011*"japan" + 0.010*"debt" + 0.008*"fed"
Topic 2: 0.027*"says" + 0.016*"euro" + 0.014*"crude" + 0.011*"april" + 0.007*"egypt" + 0.007*"probe" + 0.007*"said" + 0.007*"versus" + 0.006*"move" + 0.006*"little"
Topic 3: 0.032*"profit" + 0.029*"rises" + 0.023*"growth" + 0.015*"estimates" + 0.014*"china" + 0.013*"forecast" + 0.012*"stocks" + 0.012*"first" + 0.011*"months" + 0.010*"years"
Topic 4: 0.028*"prices" + 0.024*"million" + 0.021*"says" + 0.017*"india" + 0.013*"new" + 0.011*"markets" + 0.009*"china" + 0.009*"may" + 0.009*"higher" + 0.009*"copper"
Topic 5: 0.035*"says" + 0.022*"billion" + 0.015*"million" + 0.014*"may" + 0.013*"said" + 0.012*"equity" + 0.011*"south" + 0.010*"reports" + 0.010*"plans" + 0.0