In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
import ipywidgets
# Fetch the NLTK data packages this notebook relies on: 'punkt' for
# word_tokenize and 'stopwords' for stopwords.words. Packages that are already
# installed are simply reported as up-to-date.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)


# Read the Bloomberg news export into a DataFrame. As the preview below shows,
# the columns are 'Headline', 'Journalists', 'Date', 'Link' and 'Article'
# (plus two leftover 'Unnamed' index columns).
news_csv_path = 'data/News Tagging/BloombergNews.csv'
df = pd.read_csv(news_csv_path)

# Display the first five rows as the cell's output.
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Headline,Journalists,Date,Link,Article
0,0,"Ivory Coast Keeps Cocoa Export Tax Below 22%, ...",['Baudelaire Mieu'],2011-10-06 15:14:20,http://www.bloomberg.com/news/2011-10-06/ivory...,"Export taxes on cocoa beans from Ivory Coast ,..."
1,1,USDA Boxed Beef Cutout Closing Prices for Octo...,['Michael Carone'],2011-10-06 20:22:42,http://www.bloomberg.com/news/2011-10-06/usda-...,October 6 (Bloomberg) -- This table details bo...
2,2,U.S. September Small Business Jobs Summary,['Alex Tanzi'],2011-10-06 19:00:00,http://www.bloomberg.com/news/2011-10-06/u-s-s...,U.S. small business plans to hire declined in ...
3,3,Greece’s GSEE Says Won’t Meet For Talks With T...,['Natalie Weeks'],2011-10-06 14:45:34,http://www.bloomberg.com/news/2011-10-06/greec...,"Greece ’s biggest private sector union group, ..."
4,4,Clean-Tech Companies Should Get 10-Year Tax Br...,['Ari Levy'],2011-10-06 18:34:41,http://www.bloomberg.com/news/2011-10-06/clean...,"Reed Hundt, head of the Coalition for Green Ca..."


In [11]:
# Discard rows whose 'Headline' is missing so that every remaining row has
# headline text to preprocess.
df = df.dropna(axis=0, how='any', subset=['Headline'])

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is loaded
# once instead of once per processed row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Turn raw text into a list of lowercase, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize. Any non-string value (e.g. ``None`` or a NaN
        left over from a missing cell) produces an empty token list instead
        of raising.
    stop_words : collection of str, optional
        Words to drop after tokenization. When omitted, the NLTK English
        stop-word list is used (loaded once and cached).

    Returns
    -------
    list of str
        Tokens in their original order, lowercased, with purely alphabetic
        tokens kept and stop words removed.
    """
    # Missing values reach here as None/float NaN; there is nothing to tokenize.
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = _english_stop_words()
    # Lowercase before tokenizing so the stop-word comparison is case-insensitive.
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

# Tokenize every entry of the 'Headline' column and store the resulting token
# lists in a new 'processed_content' column.
df['processed_content'] = df['Headline'].map(preprocess_text)

# Display the first few token lists as the cell's output.
df['processed_content'].head()

0    [ivory, coast, keeps, cocoa, export, tax, docu...
1    [usda, boxed, beef, cutout, closing, prices, o...
2          [september, small, business, jobs, summary]
3    [greece, gsee, says, meet, talks, troika, athens]
4           [companies, get, tax, breaks, hundt, says]
Name: processed_content, dtype: object

In [12]:
# Build the gensim inputs for LDA: a dictionary built from the token lists and
# a bag-of-words corpus with one doc2bow vector per headline.
token_lists = df['processed_content']
dictionary = corpora.Dictionary(token_lists)
corpus = list(map(dictionary.doc2bow, token_lists))

# Peek at the first two documents; each is a list of (term_id, term_frequency) tuples.
print(corpus[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]


In [13]:
# Train the LDA model.
# LDA training is stochastic: without a fixed random_state the topics (and any
# coherence score derived from them) change on every run. Pinning the seed
# makes a Restart & Run All reproduce the same model for the same corpus.
NUM_TOPICS = 10        # number of latent topics to fit
NUM_PASSES = 10        # full passes over the corpus during training
LDA_RANDOM_STATE = 42  # seed for gensim's random number generator

lda_model = LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=LDA_RANDOM_STATE,
)

# Print every topic (-1 selects all of them) with its top weighted words.
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.049*"bank" + 0.041*"million" + 0.030*"says" + 0.017*"central" + 0.014*"billion" + 0.011*"fund" + 0.011*"wins" + 0.009*"seeks" + 0.009*"may" + 0.009*"raise"
Topic 1: 0.037*"june" + 0.029*"says" + 0.025*"debt" + 0.017*"first" + 0.013*"concern" + 0.013*"europe" + 0.013*"two" + 0.013*"euro" + 0.012*"earnings" + 0.012*"crisis"
Topic 2: 0.039*"says" + 0.016*"said" + 0.016*"obama" + 0.014*"billion" + 0.014*"buy" + 0.012*"hong" + 0.012*"kong" + 0.012*"million" + 0.011*"chief" + 0.010*"end"
Topic 3: 0.036*"india" + 0.035*"new" + 0.024*"credit" + 0.023*"market" + 0.017*"money" + 0.017*"bond" + 0.016*"bank" + 0.015*"markets" + 0.015*"billion" + 0.014*"york"
Topic 4: 0.026*"south" + 0.022*"prices" + 0.018*"gain" + 0.017*"fed" + 0.017*"world" + 0.016*"africa" + 0.012*"copper" + 0.012*"quarter" + 0.010*"african" + 0.010*"september"
Topic 5: 0.060*"says" + 0.022*"may" + 0.019*"reports" + 0.016*"sell" + 0.012*"group" + 0.011*"unit" + 0.010*"stake" + 0.009*"next" + 0.009*"top" + 0.009*"busin

In [8]:
from gensim.models.coherencemodel import CoherenceModel

# Score the fitted topics with the 'c_v' coherence measure, computed from the
# tokenized headlines and the dictionary the model was trained with.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_content'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Report the single aggregate score.
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.2867346163486801


In [6]:
# Prepare the pyLDAvis visualization of the fitted topics and write it to an
# HTML file under data/.
vis = gensimvis.prepare(lda_model, corpus, dictionary)
lda_html_path = 'data/LDA bloomberg.html'
pyLDAvis.save_html(vis, lda_html_path)