In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
import ipywidgets
nltk.download('punkt')
nltk.download('stopwords')


# Load the dataset
df = pd.read_csv('data/Training Data/Labeled-headlines.csv', encoding='ISO-8859-1', on_bad_lines='skip', header=None)  # Ensure your CSV file has columns like 'title' and 'content'
df = df[df.iloc[:, 0] == 'negative'].dropna()
# Print first few rows of the dataset
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1
2,negative,The international electronic industry company ...
415,negative,A tinyurl link takes users to a scamming site ...
421,negative,"Compared with the FTSE 100 index , which rose ..."
423,negative,"Compared with the FTSE 100 index , which rose ..."
500,negative,One of the challenges in the oil production in...


In [2]:
# Define a function to preprocess the text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic characters
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

# Apply the function to preprocess the 'data' column
df['processed_content'] = df[1].apply(preprocess_text)

# Preview the preprocessed text
df['processed_content'].head()

2      [international, electronic, industry, company,...
415    [tinyurl, link, takes, users, scamming, site, ...
421    [compared, ftse, index, rose, points, day, rel...
423    [compared, ftse, index, rose, points, day, rel...
500    [one, challenges, oil, production, north, sea,...
Name: processed_content, dtype: object

In [3]:
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df['processed_content'])
corpus = [dictionary.doc2bow(text) for text in df['processed_content']]

# Preview the corpus
print(corpus[:2])  # List of tuples (term_id, term_frequency)

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)], [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2)]]


In [4]:
from collections import Counter

# Step 1: Calculate word frequencies
# Sum word counts across all documents
word_freq = Counter()
for doc in corpus:
    for word_id, count in doc:
        word = dictionary[word_id]
        word_freq[word] += count

# Get the top N most frequent words (e.g., top 20)
top_n = 100
most_common_words = word_freq.most_common(top_n)

# Print the most frequent words
print(f"Top {top_n} most frequent words:")
for word, freq in most_common_words:
    print(f"Word: {word}, Frequency: {freq}")

Top 100 most frequent words:
Word: eur, Frequency: 325
Word: mn, Frequency: 224
Word: profit, Frequency: 156
Word: net, Frequency: 104
Word: company, Frequency: 100
Word: finnish, Frequency: 99
Word: sales, Frequency: 98
Word: operating, Frequency: 97
Word: period, Frequency: 88
Word: quarter, Frequency: 79
Word: million, Frequency: 78
Word: said, Frequency: 77
Word: year, Frequency: 74
Word: loss, Frequency: 69
Word: compared, Frequency: 68
Word: decreased, Frequency: 68
Word: mln, Frequency: 68
Word: first, Frequency: 57
Word: oyj, Frequency: 51
Word: fell, Frequency: 46
Word: corresponding, Frequency: 46
Word: finland, Frequency: 43
Word: group, Frequency: 37
Word: euro, Frequency: 36
Word: result, Frequency: 35
Word: half, Frequency: 34
Word: today, Frequency: 33
Word: pct, Frequency: 32
Word: helsinki, Frequency: 32
Word: share, Frequency: 32
Word: earlier, Frequency: 30
Word: lower, Frequency: 29
Word: hel, Frequency: 28
Word: third, Frequency: 27
Word: per, Frequency: 26
Word: t

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams

# Generate bigrams and trigrams
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Collect all bigrams and trigrams
bigrams = []
trigrams = []
for tokens in df['processed_content']:
    bigrams.extend(generate_ngrams(tokens, 2))
    trigrams.extend(generate_ngrams(tokens, 3))

# Count frequencies
bigram_freq = Counter(bigrams)
trigram_freq = Counter(trigrams)

# Convert tuples to readable strings
bigram_freq = {(' '.join(bigram)): freq for bigram, freq in bigram_freq.items()}
trigram_freq = {(' '.join(trigram)): freq for trigram, freq in trigram_freq.items()}

# Get top N bigrams and trigrams
top_n = 100
top_bigrams = sorted(bigram_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
top_trigrams = sorted(trigram_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Print results
print(f"Top {top_n} Bigrams:")
for bigram, freq in top_bigrams:
    print(f"Bigram: {bigram}, Frequency: {freq}")

print(f"\nTop {top_n} Trigrams:")
for trigram, freq in top_trigrams:
    print(f"Trigram: {trigram}, Frequency: {freq}")

Top 100 Bigrams:
Bigram: eur mn, Frequency: 219
Bigram: operating profit, Frequency: 58
Bigram: mn eur, Frequency: 56
Bigram: net sales, Frequency: 48
Bigram: eur million, Frequency: 44
Bigram: corresponding period, Frequency: 44
Bigram: net profit, Frequency: 34
Bigram: profit eur, Frequency: 32
Bigram: decreased eur, Frequency: 30
Bigram: euro mln, Frequency: 30
Bigram: mn compared, Frequency: 28
Bigram: mln euro, Frequency: 27
Bigram: first half, Frequency: 26
Bigram: totalled eur, Frequency: 26
Bigram: said today, Frequency: 24
Bigram: third quarter, Frequency: 24
Bigram: oyj hel, Frequency: 23
Bigram: operating loss, Frequency: 23
Bigram: year earlier, Frequency: 23
Bigram: mn corresponding, Frequency: 23
Bigram: first quarter, Frequency: 22
Bigram: omx helsinki, Frequency: 19
Bigram: compared profit, Frequency: 18
Bigram: loss eur, Frequency: 17
Bigram: second quarter, Frequency: 17
Bigram: mn first, Frequency: 17
Bigram: compared eur, Frequency: 17
Bigram: fell eur, Frequency: 1

In [6]:
# Train the LDA model
lda_model = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary, passes=10)

# Print the topics with top words
lda_model.show_topics()

[(7,
  '0.050*"eur" + 0.034*"decreased" + 0.026*"sales" + 0.016*"operations" + 0.013*"per" + 0.012*"share" + 0.011*"earnings" + 0.011*"eps" + 0.011*"continuing" + 0.011*"mn"'),
 (11,
  '0.013*"said" + 0.012*"march" + 0.009*"hit" + 0.009*"scanfil" + 0.009*"temporary" + 0.008*"sales" + 0.007*"finnish" + 0.007*"manufacturer" + 0.006*"result" + 0.006*"company"'),
 (10,
  '0.017*"company" + 0.014*"production" + 0.014*"finnish" + 0.013*"printing" + 0.012*"said" + 0.011*"jobs" + 0.010*"negative" + 0.010*"ltd" + 0.010*"would" + 0.008*"today"'),
 (13,
  '0.017*"company" + 0.013*"year" + 0.012*"employment" + 0.011*"end" + 0.010*"prices" + 0.010*"went" + 0.010*"earlier" + 0.010*"reduction" + 0.010*"value" + 0.010*"fair"'),
 (12,
  '0.054*"mln" + 0.044*"net" + 0.036*"period" + 0.031*"profit" + 0.028*"euro" + 0.028*"sales" + 0.026*"year" + 0.023*"quarter" + 0.018*"operating" + 0.017*"company"'),
 (6,
  '0.148*"eur" + 0.127*"mn" + 0.053*"profit" + 0.039*"operating" + 0.023*"compared" + 0.021*"period

In [7]:
from gensim.models.coherencemodel import CoherenceModel

# Calculate Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['processed_content'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.45512923131961186


In [8]:
# Visualize the topics using pyLDAvis
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'data/LDA negative sentiment.html')