In [21]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
import ipywidgets
nltk.download('punkt')
nltk.download('stopwords')


# Read the labeled headlines. The file is parsed with header=None, so pandas
# assigns integer column labels: per the preview below, column 0 holds the
# sentiment label and column 1 the headline text.
df = pd.read_csv(
    'data/Training Data/Labeled-headlines.csv',
    encoding='ISO-8859-1',
    on_bad_lines='skip',  # malformed rows are dropped rather than raising
    header=None,
)

# Keep only the rows labeled 'positive', then discard rows with missing values
is_positive = df.iloc[:, 0] == 'positive'
df = df.loc[is_positive].dropna()

# Preview the filtered frame
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6,positive,"For the last quarter of 2010 , Componenta 's n..."
7,positive,"In the third quarter of 2010 , net sales incre..."


In [25]:
# Text preprocessing: lowercase, tokenize, keep alphabetic non-stopword tokens
_ENGLISH_STOP_WORDS = None  # filled lazily on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Tokenize a piece of text into lowercase, alphabetic, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw text to process.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stopword list, which is
        loaded a single time and reused across calls instead of being rebuilt
        for every row when this function is used with ``Series.apply``.

    Returns
    -------
    list of str
        Tokens in their original order, after lowercasing, dropping any token
        that is not purely alphabetic (numbers, punctuation, mixed tokens),
        and dropping stopwords.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Tokenize every headline (column 1) and store the token lists in a new column
headline_text = df[1]
df['processed_content'] = headline_text.apply(preprocess_text)

# Show the first few token lists
df['processed_content'].head()

3    [new, production, plant, company, would, incre...
4    [according, company, updated, strategy, years,...
5    [financing, aspocomp, growth, aspocomp, aggres...
6    [last, quarter, componenta, net, sales, double...
7    [third, quarter, net, sales, increased, eur, m...
Name: processed_content, dtype: object

In [26]:
# Map each distinct token to an integer id, then encode every document as a
# bag of words: a list of (term_id, term_frequency) tuples
tokenized_docs = df['processed_content']
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Peek at the first two encoded documents
print(corpus[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2)], [(1, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [13]:
from collections import Counter

# Accumulate corpus-wide counts per word by translating each bag-of-words
# term id back to its token and adding that document's count
word_freq = Counter()
for bow in corpus:
    word_freq.update({dictionary[term_id]: term_count for term_id, term_count in bow})

# Keep the top_n most frequent words
top_n = 100
most_common_words = word_freq.most_common(top_n)

# Report them, most frequent first
print(f"Top {top_n} most frequent words:")
for token, total in most_common_words:
    print(f"Word: {token}, Frequency: {total}")

Top 100 most frequent words:
Word: eur, Frequency: 449
Word: mn, Frequency: 241
Word: company, Frequency: 240
Word: said, Frequency: 230
Word: finnish, Frequency: 198
Word: net, Frequency: 196
Word: sales, Frequency: 192
Word: profit, Frequency: 191
Word: million, Frequency: 170
Word: period, Frequency: 139
Word: year, Frequency: 139
Word: mln, Frequency: 128
Word: operating, Frequency: 122
Word: quarter, Frequency: 107
Word: oyj, Frequency: 97
Word: group, Frequency: 96
Word: rose, Frequency: 94
Word: increased, Frequency: 89
Word: new, Frequency: 82
Word: increase, Frequency: 76
Word: finland, Frequency: 75
Word: first, Frequency: 73
Word: loss, Frequency: 72
Word: compared, Frequency: 71
Word: services, Frequency: 69
Word: euro, Frequency: 67
Word: business, Frequency: 66
Word: share, Frequency: 66
Word: percent, Frequency: 66
Word: corresponding, Frequency: 65
Word: market, Frequency: 64
Word: also, Frequency: 63
Word: today, Frequency: 60
Word: agreement, Frequency: 59
Word: signe

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams

# N-gram helper used for both the bigram and trigram counts
def generate_ngrams(tokens, n):
    """Return the n-grams produced by ``nltk.util.ngrams`` for ``tokens``, as a list."""
    return [gram for gram in ngrams(tokens, n)]

# Gather the bigrams and trigrams of every preprocessed headline
bigrams = [
    gram
    for tokens in df['processed_content']
    for gram in generate_ngrams(tokens, 2)
]
trigrams = [
    gram
    for tokens in df['processed_content']
    for gram in generate_ngrams(tokens, 3)
]

# Tally each n-gram, keyed by its space-joined text instead of the token tuple
bigram_freq = {' '.join(gram): count for gram, count in Counter(bigrams).items()}
trigram_freq = {' '.join(gram): count for gram, count in Counter(trigrams).items()}

# Rank by descending frequency and keep the first top_n entries
top_n = 100
top_bigrams = sorted(bigram_freq.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
top_trigrams = sorted(trigram_freq.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

# Report the rankings
print(f"Top {top_n} Bigrams:")
for phrase, count in top_bigrams:
    print(f"Bigram: {phrase}, Frequency: {count}")

print(f"\nTop {top_n} Trigrams:")
for phrase, count in top_trigrams:
    print(f"Trigram: {phrase}, Frequency: {count}")

Top 100 Bigrams:
Bigram: eur mn, Frequency: 231
Bigram: net sales, Frequency: 102
Bigram: operating profit, Frequency: 101
Bigram: eur million, Frequency: 90
Bigram: mn eur, Frequency: 79
Bigram: corresponding period, Frequency: 63
Bigram: net profit, Frequency: 52
Bigram: rose eur, Frequency: 47
Bigram: euro mln, Frequency: 45
Bigram: mln euro, Frequency: 44
Bigram: said today, Frequency: 44
Bigram: profit rose, Frequency: 43
Bigram: oyj hel, Frequency: 36
Bigram: mn corresponding, Frequency: 33
Bigram: per share, Frequency: 32
Bigram: hel said, Frequency: 30
Bigram: net loss, Frequency: 30
Bigram: mln eur, Frequency: 29
Bigram: year earlier, Frequency: 28
Bigram: last year, Frequency: 26
Bigram: first quarter, Frequency: 26
Bigram: nine months, Frequency: 26
Bigram: second quarter, Frequency: 25
Bigram: third quarter, Frequency: 24
Bigram: increased eur, Frequency: 24
Bigram: first nine, Frequency: 23
Bigram: million usd, Frequency: 23
Bigram: period increased, Frequency: 20
Bigram: 

In [29]:
# Train the LDA model.
# random_state pins gensim's random number generator so that the learned
# topics (and any coherence score computed from them) are reproducible across
# kernel restarts; without it every run yields different topics.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=15,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Show the top words of every topic. show_topics() returns only 10 topics by
# default, which would leave out 5 of the 15 trained here; -1 requests all.
lda_model.show_topics(num_topics=-1)

Coherence Score: 0.486020841409667


In [19]:
from gensim.models.coherencemodel import CoherenceModel

# Score the trained model with the c_v coherence measure, computed from the
# tokenized headlines and the shared dictionary
tokenized_headlines = df['processed_content']
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_headlines,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.3813695599602054


In [20]:
# Prepare the pyLDAvis view of the model and export it as an HTML file
html_path = 'data/LDA positive sentiment.html'
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, html_path)