In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
nltk.download('punkt')
nltk.download('stopwords')

# Load the labelled news file. It is read with header=None, so columns are
# addressed by position: column 0 is compared against the sentiment label and
# column 1 is the text consumed by the preprocessing cell.
df = pd.read_csv(
    '../data/News Articles/Labelled/Kraggle_labelled.csv',
    encoding='ISO-8859-1',
    on_bad_lines='skip',
    header=None,
)
# Keep only the rows labelled 'neutral', then discard rows with missing values.
neutral_mask = df.iloc[:, 0].eq('neutral')
df = df.loc[neutral_mask].dropna()
# Preview the first rows of the filtered frame
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
58,neutral,At the request of Finnish media company Alma M...
59,neutral,"In Sweden , Gallerix accumulated SEK denominat..."
60,neutral,The company supports its global customers in d...


In [2]:
# Define a function to preprocess the text
def preprocess_text(text):
    """Turn a raw string into a list of lowercase, alphabetic, non-stopword tokens.

    Args:
        text: The string to process. It is lowercased before tokenization.

    Returns:
        list of str: Tokens in their original order, keeping only those for
        which ``str.isalpha()`` is true and that are not in NLTK's English
        stopword list.
    """
    # The stopword set never changes between calls, so build it once and cache
    # it on the function object instead of rebuilding it for every row.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    # Single pass: drop non-alphabetic tokens (punctuation, numbers) and stopwords.
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Tokenize the text held in column 1 and store the token lists in a new column
df['processed_content'] = df[1].map(preprocess_text)

# Show the first few token lists
df['processed_content'].head()

0     [according, gran, company, plans, move, produc...
1     [technopolis, plans, develop, stages, area, le...
58    [request, finnish, media, company, alma, media...
59    [sweden, gallerix, accumulated, sek, denominat...
60    [company, supports, global, customers, develop...
Name: processed_content, dtype: object

In [3]:
# Build the token <-> id mapping from the tokenized documents, then encode
# every document as a bag-of-words for LDA
dictionary = corpora.Dictionary(df['processed_content'])
corpus = list(map(dictionary.doc2bow, df['processed_content']))

# Show the first two encoded documents as (term_id, term_frequency) pairs
print(corpus[:2])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(6, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [4]:
from collections import Counter

# Step 1: Accumulate corpus-wide term counts, keyed by the word itself
# (looked up through the dictionary) rather than by its integer id
word_freq = Counter()
for bow in corpus:
    word_freq.update({dictionary[token_id]: occurrences for token_id, occurrences in bow})

# Number of top-ranked words to report
top_n = 100
most_common_words = word_freq.most_common(top_n)

# Report the ranking, one word per line
print(f"Top {top_n} most frequent words:")
for term, total in most_common_words:
    print(f"Word: {term}, Frequency: {total}")

Top 100 most frequent words:
Word: company, Frequency: 508
Word: eur, Frequency: 241
Word: said, Frequency: 238
Word: finland, Frequency: 219
Word: finnish, Frequency: 215
Word: million, Frequency: 192
Word: business, Frequency: 190
Word: group, Frequency: 187
Word: new, Frequency: 179
Word: sales, Frequency: 163
Word: shares, Frequency: 163
Word: services, Frequency: 151
Word: also, Frequency: 145
Word: share, Frequency: 139
Word: market, Frequency: 131
Word: net, Frequency: 112
Word: total, Frequency: 100
Word: helsinki, Frequency: 95
Word: oyj, Frequency: 93
Word: mln, Frequency: 93
Word: nokia, Frequency: 90
Word: financial, Frequency: 87
Word: production, Frequency: 86
Word: mobile, Frequency: 86
Word: capital, Frequency: 84
Word: products, Frequency: 82
Word: corporation, Frequency: 82
Word: companies, Frequency: 81
Word: plant, Frequency: 80
Word: operating, Frequency: 80
Word: investment, Frequency: 79
Word: operations, Frequency: 79
Word: according, Frequency: 78
Word: value, 

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams

# Generate bigrams and trigrams
def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive items from ``tokens`` as a list of tuples."""
    return [*ngrams(tokens, n)]

# Flatten the per-document n-grams into one list per n-gram size
bigrams = [
    gram
    for doc_tokens in df['processed_content']
    for gram in generate_ngrams(doc_tokens, 2)
]
trigrams = [
    gram
    for doc_tokens in df['processed_content']
    for gram in generate_ngrams(doc_tokens, 3)
]

# Tally occurrences and re-key each n-gram tuple as a space-separated string
bigram_freq = {' '.join(gram): count for gram, count in Counter(bigrams).items()}
trigram_freq = {' '.join(gram): count for gram, count in Counter(trigrams).items()}

# Rank by descending count; the sort is stable, so ties keep first-seen order
top_n = 100
top_bigrams = sorted(bigram_freq.items(), key=lambda item: -item[1])[:top_n]
top_trigrams = sorted(trigram_freq.items(), key=lambda item: -item[1])[:top_n]

# Report both rankings
print(f"Top {top_n} Bigrams:")
for phrase, count in top_bigrams:
    print(f"Bigram: {phrase}, Frequency: {count}")

print(f"\nTop {top_n} Trigrams:")
for phrase, count in top_trigrams:
    print(f"Trigram: {phrase}, Frequency: {count}")

Top 100 Bigrams:
Bigram: net sales, Frequency: 96
Bigram: eur million, Frequency: 81
Bigram: company said, Frequency: 41
Bigram: eur mn, Frequency: 41
Bigram: share capital, Frequency: 37
Bigram: mln euro, Frequency: 36
Bigram: euro mln, Frequency: 35
Bigram: operating profit, Frequency: 35
Bigram: stock exchange, Frequency: 33
Bigram: board directors, Frequency: 32
Bigram: omx helsinki, Frequency: 31
Bigram: alma media, Frequency: 29
Bigram: per share, Frequency: 26
Bigram: oyj hel, Frequency: 26
Bigram: voting rights, Frequency: 26
Bigram: real estate, Frequency: 24
Bigram: general meeting, Frequency: 21
Bigram: new shares, Frequency: 19
Bigram: hel said, Frequency: 19
Bigram: stora enso, Frequency: 19
Bigram: said today, Frequency: 18
Bigram: sales eur, Frequency: 17
Bigram: finnish company, Frequency: 17
Bigram: million euros, Frequency: 17
Bigram: helsinki finland, Frequency: 16
Bigram: oyj said, Frequency: 15
Bigram: nasdaq omx, Frequency: 15
Bigram: neste oil, Frequency: 15
Bigr

In [14]:
# Train the LDA model
# LDA training is stochastic. Fixing random_state makes the fitted topics, and
# everything derived from them (coherence score, pyLDAvis chart), repeatable
# across kernel restarts.
NUM_TOPICS = 15
LDA_RANDOM_STATE = 42
lda_model = LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=LDA_RANDOM_STATE,
)

# Show the top words of every topic. show_topics() defaults to num_topics=10,
# which would display only 10 of the 15 topics, so request all of them.
lda_model.show_topics(num_topics=NUM_TOPICS)

[(6,
  '0.023*"eur" + 0.020*"value" + 0.017*"market" + 0.012*"order" + 0.009*"company" + 0.009*"million" + 0.009*"board" + 0.007*"contracts" + 0.007*"combined" + 0.007*"orders"'),
 (10,
  '0.030*"finland" + 0.020*"services" + 0.017*"company" + 0.011*"well" + 0.010*"mobile" + 0.009*"service" + 0.009*"finnish" + 0.008*"new" + 0.008*"industry" + 0.008*"applications"'),
 (11,
  '0.037*"sales" + 0.034*"company" + 0.032*"eur" + 0.029*"million" + 0.025*"net" + 0.022*"mln" + 0.016*"profit" + 0.016*"group" + 0.013*"stock" + 0.013*"euro"'),
 (9,
  '0.025*"eur" + 0.020*"company" + 0.018*"total" + 0.017*"value" + 0.014*"said" + 0.014*"contract" + 0.012*"pct" + 0.011*"sq" + 0.011*"finland" + 0.010*"area"'),
 (7,
  '0.014*"group" + 0.009*"also" + 0.008*"terms" + 0.008*"phase" + 0.008*"countries" + 0.008*"business" + 0.007*"capacity" + 0.006*"markets" + 0.006*"staff" + 0.006*"distribution"'),
 (1,
  '0.013*"also" + 0.011*"media" + 0.009*"stora" + 0.009*"company" + 0.008*"enso" + 0.008*"first" + 0.007

In [10]:
from gensim.models.coherencemodel import CoherenceModel

# Score the fitted topics with the 'c_v' coherence measure, evaluated against
# the tokenized documents and the same dictionary used for training
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_content'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.26137848790745033


In [None]:
# Build the interactive pyLDAvis view of the fitted model and export it as a
# standalone HTML page.
# NOTE(review): the dataset is read from '../data/...' but this chart is written
# under 'data/...' (relative to the working directory) - confirm that is the
# intended location and that the folder exists.
lda_html_path = 'data/Results Charts Models/LDA html charts/LDA neutral sentiment.html'
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, lda_html_path)