In [20]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
import ipywidgets
# Fetch the NLTK data packages requested by this notebook: the 'punkt'
# tokenizer models and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)


# Load the labelled headlines. The CSV is read with header=None, so its columns
# are addressed by position (0, 1, ...) rather than by name.
df = pd.read_csv(
    'data/Training Data/Labeled-headlines.csv',
    encoding='ISO-8859-1',
    on_bad_lines='skip',  # malformed rows are skipped instead of raising
    header=None,
)

# Keep only rows whose first column equals 'negative', then drop any row that
# still contains a missing value.
is_negative = df.iloc[:, 0] == 'negative'
df = df[is_negative].dropna()

# Show the first few remaining rows
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1
2,negative,The international electronic industry company ...
415,negative,A tinyurl link takes users to a scamming site ...
421,negative,"Compared with the FTSE 100 index , which rose ..."
423,negative,"Compared with the FTSE 100 index , which rose ..."
500,negative,One of the challenges in the oil production in...


In [22]:
# Define a function to preprocess the text

# Cache of stop-word sets keyed by language, so the NLTK word list is loaded
# once instead of once per document.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for `language`, loading it on first use only."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Tokenize a piece of text and keep only its meaningful words.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (numbers, punctuation, mixed tokens) and tokens
    found in the English stop-word list are discarded.

    Args:
        text: The string to clean.

    Returns:
        A list of lower-cased alphabetic tokens, in their original order.
    """
    stop_words = _stop_words_for('english')
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

# Run preprocess_text over every entry of column 1 (the headline text) and
# store the resulting token lists in a new 'processed_content' column.
df['processed_content'] = df[1].map(preprocess_text)

# Preview the preprocessed text
df['processed_content'].head()

2      [international, electronic, industry, company,...
415    [tinyurl, link, takes, users, scamming, site, ...
421    [compared, ftse, index, rose, points, day, rel...
423    [compared, ftse, index, rose, points, day, rel...
500    [one, challenges, oil, production, north, sea,...
Name: processed_content, dtype: object

In [23]:
# Build the inputs for LDA: a gensim Dictionary over the token lists, and a
# bag-of-words representation of every document produced by that dictionary.
token_lists = df['processed_content']
dictionary = corpora.Dictionary(token_lists)
corpus = list(map(dictionary.doc2bow, token_lists))

# Preview the first two documents; each is a list of (term_id, term_frequency) tuples
print(corpus[:2])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)], [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2)]]


In [27]:
# Train the LDA model
LDA_NUM_TOPICS = 10
LDA_PASSES = 10
# LDA training is randomised; fixing the seed makes the topics (and any score
# computed from them) reproducible across kernel restarts.
LDA_RANDOM_SEED = 42

lda_model = LdaModel(
    corpus=corpus,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_SEED,
)

# Print the topics with top words (-1 asks for every topic)
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.021*"omx" + 0.021*"helsinki" + 0.017*"company" + 0.017*"pct" + 0.015*"lower" + 0.011*"eur" + 0.010*"finnish" + 0.010*"decreased" + 0.009*"index" + 0.009*"sales"
Topic 1: 0.065*"mln" + 0.041*"euro" + 0.024*"net" + 0.022*"sales" + 0.019*"pct" + 0.016*"profit" + 0.015*"period" + 0.013*"oyj" + 0.012*"finnish" + 0.008*"quarter"
Topic 2: 0.146*"eur" + 0.108*"mn" + 0.052*"profit" + 0.042*"operating" + 0.030*"period" + 0.028*"compared" + 0.025*"net" + 0.025*"sales" + 0.023*"quarter" + 0.023*"decreased"
Topic 3: 0.014*"negotiations" + 0.013*"company" + 0.013*"personnel" + 0.013*"employees" + 0.012*"staff" + 0.010*"workers" + 0.010*"said" + 0.010*"result" + 0.009*"number" + 0.008*"revenue"
Topic 4: 0.024*"company" + 0.011*"finland" + 0.011*"sales" + 0.011*"production" + 0.010*"finnish" + 0.010*"employees" + 0.009*"jobs" + 0.009*"news" + 0.008*"adp" + 0.008*"situation"
Topic 6: 0.035*"million" + 0.029*"said" + 0.022*"finnish" + 0.021*"net" + 0.021*"company" + 0.021*"oyj" + 0.019*"today

In [28]:
from gensim.models.coherencemodel import CoherenceModel

# Score the trained model with the 'c_v' coherence measure, using the
# preprocessed token lists and the dictionary the model was built from.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_content'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.422728850357601


In [29]:
# Visualize the topics using pyLDAvis
# enable_notebook() is called first so the visualisation can be shown in the notebook.
pyLDAvis.enable_notebook()
# Build the visualisation data from the trained model, the bag-of-words corpus
# and the dictionary; the result is kept in `vis` and shown in a later cell.
vis = gensimvis.prepare(lda_model, corpus, dictionary)

In [30]:
# NOTE(review): IPython is not referenced anywhere in this cell — confirm whether
# a later cell relies on this import before removing it.
import IPython
# Show the prepared visualisation.
# NOTE(review): local=True presumably serves the JS assets from the notebook
# server instead of a CDN — confirm this works in the target Jupyter front-end.
pyLDAvis.display(vis, local=True)