In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
import pandas as pd

# Load the dataset
data_path = "/Users/vishalsehgal/Desktop/NLP/Assignment-1/Dataset/assignment-2-data.csv"
df = pd.read_csv(data_path)

# Filter articles related to Nvidia
df_nvidia = df[df['clean_content'].str.contains("nvidia", case=False, na=False)].copy()

# Reset index
df_nvidia.reset_index(drop=True, inplace=True)
docs = df_nvidia['clean_content'].tolist()


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
def preprocess_text_v1(text):
    """
    Original preprocessing function (Preprocessing v1).
    """
    # Check if the input is a valid string
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation except for '$' and '%'
    text = ''.join([char for char in text if char not in string.punctuation or char in ['$','%']])
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Load stopwords in English and Spanish
    stop_words_en = set(stopwords.words('english'))
    stop_words_es = set(stopwords.words('spanish'))
    stop_words = stop_words_en.union(stop_words_es)
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Remove single-character tokens
    words = [word for word in words if len(word) > 1]
    
    # Filter unwanted numbers, preserving financial symbols and units
    units = ['million', 'billion', 'trillion', 'thousand', 'hundred']
    processed_words = []
    i = 0
    while i < len(words):
        word = words[i]
        keep_word = True
        if word.isdigit():
            keep_word = False
            if i > 0 and words[i - 1] in ['$', 'usd']:
                keep_word = True
            elif i + 1 < len(words) and (words[i + 1] in ['%', 'percent'] or words[i + 1] in units):
                keep_word = True
            elif '$' in word or '%' in word:
                keep_word = True
        if keep_word:
            processed_words.append(word)
        i += 1
    
    # Join the words back into a string
    clean_text = ' '.join(processed_words)
    return clean_text


In [8]:
# Define custom stop words
custom_stop_words = [
    "quarter", "week", "today", "period", "session", "billion", "million", "percent", "index",
    "investment", "earnings", "revenue", "growth", "rate", "high", "volume", "stock", "target",
    "price", "analyst", "market", "fund", "asset", "allocation", "solution", "client", "report",
    "forecast", "record", "technology", "industry", "system", "platform", "device", "company",
    "digital", "sector", "day", "seven", "one", "said",
    # Additional stop words to address duplicate topics
    "traded", "gmt", "exchange", "lowest", "highest", "volume", "session", "start", "range",
    "investing", "com", "rose", "fell", "trade", "today", "yesterday", "week", "monday", "tuesday",
    "wednesday", "thursday", "friday", "nasdaq", "nyse"
]


In [9]:
def preprocess_text_v2(text):
    """
    Preprocessing v2: Preprocessing v1 with custom stop words added.
    """
    text = preprocess_text_v1(text)
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Add custom stop words
    custom_stop_words_lower = set([word.lower() for word in custom_stop_words])
    
    # Remove stop words
    words = [word for word in words if word not in custom_stop_words_lower]
    
    # Join the words back into a string
    clean_text = ' '.join(words)
    return clean_text


In [10]:
import re
import nltk
from nltk.corpus import words
from spellchecker import SpellChecker

def preprocess_text_v3(text):
    """
    Preprocessing v2: Preprocessing v2 + spellcheck and meaning words.
    """

    text = preprocess_text_v2(text)

    # Initialize spell checker and English vocabulary
    spell = SpellChecker()
    english_vocab = set(w.lower() for w in words.words())
    
    # Basic preprocessing (lowercasing and removing special characters)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters

    # Tokenize text into words
    tokens = text.split()
    corrected_tokens = []
    for token in tokens:
        # Check if the token is a known English word
        if token in english_vocab:
            corrected_tokens.append(token)
        else:
            # Attempt to correct the spelling
            corrected_token = spell.correction(token)
            # Add the corrected token if it's in the English vocabulary
            if corrected_token in english_vocab:
                corrected_tokens.append(corrected_token)
            else:
                # If correction is not in English vocabulary, discard the token
                continue
    # Reconstruct the text
    cleaned_text = ' '.join(corrected_tokens)
    return cleaned_text


In [11]:
# Import libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
import random
import torch

# For evaluation
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import words
from spellchecker import SpellChecker

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Define the grid of number of topics
num_topics_grid = [40, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
coherence_scores = []
diversity_scores = []

In [13]:
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)


In [14]:
# Configure UMAP model
def create_umap_model():
    return UMAP(
        random_state=42,
        n_neighbors=15,
        min_dist=0.1,
        metric='cosine',
        n_jobs=1
    )


In [15]:

# Configure HDBSCAN model
def create_hdbscan_model():
    return HDBSCAN(
        min_cluster_size=10,
        min_samples=10,
        prediction_data=True,
        core_dist_n_jobs=1
    )


In [16]:
def create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=10, nr_topics=None):
    return BERTopic(
        vectorizer_model=vectorizer_model,
        min_topic_size=min_topic_size,
        calculate_probabilities=True,
        embedding_model='all-MiniLM-L6-v2',
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        nr_topics= nr_topics,
    )


In [17]:
# Calculate topic coherence
def calculate_coherence(bert_topic, docs):
    texts = [doc.split() for doc in docs]
    dictionary = Dictionary(texts)
    topics = [[word for word, _ in bert_topic.get_topic(topic_id)] for topic_id in bert_topic.get_topics().keys()]
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

# Calculate topic diversity
def calculate_diversity(bert_topic, top_n=10):
    topics = bert_topic.get_topics()
    unique_words = set()
    total_words = 0
    for topic_id in topics:
        words = [word for word, _ in topics[topic_id][:top_n]]
        unique_words.update(words)
        total_words += len(words)
    return len(unique_words) / total_words

# Function to calculate interpretability score (using coherence as a proxy)
def calculate_interpretability(topic_model):
    return calculate_coherence(topic_model, docs)


# Display topics
def display_topics(bert_topic, iteration):
    print(f"\nTopics for Iteration {iteration}:")
    topic_info = bert_topic.get_topic_info()
    for topic_num in topic_info['Topic'].unique():
        if topic_num == -1:
            continue  # Skip outliers
        print(f"\nTopic {topic_num}:")
        for word, score in bert_topic.get_topic(topic_num):
            print(f"{word}: {score:.4f}")


In [15]:
# Iterate over the parameters
for num_topics in num_topics_grid:
    print(f"\nTraining BERTopic model with num_topics={num_topics}")
    set_seeds()  # Ensure reproducibility

    # Initialize BERTopic
    topic_model = BERTopic(
        nr_topics=num_topics,
        calculate_probabilities=True,
        verbose=False
    )

    # Fit the model
    topics, probabilities = topic_model.fit_transform(docs)

    # Get vocabulary size
    vocab_size = len(topic_model.vectorizer_model.vocabulary_)
    print("Vocabulary Size:", vocab_size)

    # Get topic-word matrix (pwgt)
    pwgt = topic_model.c_tf_idf_
    print("Topic-Word Matrix Shape:", pwgt.shape)

    # Get document-topic matrix (ptgd)
    ptgd = probabilities  # Matrix of probabilities for each document
    print("Document-Topic Matrix Shape:", ptgd.shape)

    # Show topics
    print(f"Topics for num_topics={num_topics}:")
    for topic_num in topic_model.get_topics().keys():
        if topic_num == -1:
            continue  # Skip outlier topic
        print(f"\nTopic {topic_num}:")
        for word, score in topic_model.get_topic(topic_num):
            print(f"{word}: {score:.4f}")

    # Calculate scores
    coherence_score = calculate_coherence(topic_model, docs)
    diversity_score = calculate_diversity(topic_model)
    interpretability_score = calculate_interpretability(topic_model)

    print("\nCoherence Score:", coherence_score)
    print("Diversity Score:", diversity_score)
    print("Interpretability Score:", interpretability_score)

    coherence_scores.append(coherence_score)
    diversity_scores.append(diversity_score)

# After iteration, you can analyze the scores
print("\nNumber of Topics Grid:", num_topics_grid)
print("Coherence Scores:", coherence_scores)
print("Diversity Scores:", diversity_scores)


Training BERTopic model with num_topics=40


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Vocabulary Size: 29442
Topic-Word Matrix Shape: (40, 29442)
Document-Topic Matrix Shape: (3043, 39)
Topics for num_topics=40:

Topic 0:
million: 0.0288
quarter: 0.0279
year: 0.0248
revenue: 0.0237
company: 0.0214
zacks: 0.0213
earnings: 0.0159
rank: 0.0155
cent: 0.0152
consensus: 0.0150

Topic 1:
traded: 0.3420
nvidia: 0.1537
seven: 0.1503
gmt: 0.1497
day: 0.1483
lowest: 0.1468
highest: 0.1362
exchange: 0.1279
volume: 0.1250
session: 0.1191

Topic 2:
amd: 0.0748
graphic: 0.0275
processor: 0.0250
radeon: 0.0243
gpu: 0.0228
year: 0.0198
ryzen: 0.0192
revenue: 0.0172
epyc: 0.0167
quarter: 0.0156

Topic 3:
nvidia: 0.0353
gaming: 0.0220
year: 0.0217
quarter: 0.0206
revenue: 0.0193
billion: 0.0181
nvda: 0.0162
company: 0.0159
fiscal: 0.0150
estimate: 0.0146

Topic 4:
resistance: 0.0253
stock: 0.0250
week: 0.0215
nasdaq: 0.0203
around: 0.0200
today: 0.0186
market: 0.0160
back: 0.0159
break: 0.0159
trader: 0.0159

Topic 5:
oil: 0.0219
nyse: 0.0189
nasdaq: 0.0159
dow: 0.0159
rate: 0.0143
percen

In [18]:
def iteration_1(df_nvidia):
    print("\n--- Iteration 1: Initial Model with topics and preprocessing text form assignment 1 ---")
    set_seeds()
    
    # Preprocessing using preprocess_text_v1
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v1)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(lowercase=True)
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=10,nr_topics=56)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=1)
    
    return bert_topic, docs, topics, probabilities

In [19]:
def iteration_2(df_nvidia):
    print("\n--- Iteration 2: Improved Preprocessing with Custom Stop Words ---")
    set_seeds()
    
    # Preprocessing using the updated function
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v2)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(lowercase=True)
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, nr_topics=56)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=2)
    
    return bert_topic, docs, topics, probabilities

In [20]:
def iteration_3(df_nvidia):
    print("\n--- Iteration 3: Adjusted Model Parameters ---")
    set_seeds()
    
    # Preprocessing
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v2)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True
    )
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=30, nr_topics=56)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=3)
    
    return bert_topic, docs, topics, probabilities


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

def iteration_4(df_nvidia):
    print("\n--- Iteration 4: Adjusted Model Parameters with TF-IDF ---")
    set_seeds()
    
    # Preprocessing
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v2)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    
    # Use TfidfVectorizer instead of CountVectorizer
    vectorizer_model = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True
    )
    
    bert_topic = create_bertopic_model(umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, min_topic_size=30, nr_topics=56
    )
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=4)
    
    return bert_topic, docs, topics, probabilities


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def iteration_5(df_nvidia):
    print("\n--- Iteration 5: TF-IDF without Typo/meaningless words ---")
    set_seeds()
    
    # Preprocessing
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v3)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    
    # Use TfidfVectorizer instead of CountVectorizer
    vectorizer_model = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True
    )
    
    bert_topic = create_bertopic_model(umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, min_topic_size=30, nr_topics=56
    )
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=5)
    
    return bert_topic, docs, topics, probabilities


In [None]:
if __name__ == "__main__":
    # Iteration 1
    bert_topic1, docs1, topics1, probabilities1 = iteration_1(df_nvidia)
    
    # Iteration 2
    bert_topic2, docs2, topics2, probabilities2 = iteration_2(df_nvidia)
    
    # Iteration 3
    bert_topic3, docs3, topics3, probabilities3 = iteration_3(df_nvidia)

    # Iteration 4
    bert_topic4, docs4, topics4, probabilities4 = iteration_4(df_nvidia)

    # Iteration 5
    bert_topic5, docs5, topics5, probabilities5 = iteration_5(df_nvidia)



--- Iteration 1: Initial Model with topics and preprocessing text form assignment 1 ---
Topic Coherence Score: 0.6070474716021121
Topic Diversity Score: 0.5056603773584906

Topics for Iteration 1:

Topic 0:
traded: 0.3180
seven: 0.1424
gmt: 0.1418
lowest: 0.1389
nvidia: 0.1333
day: 0.1326
highest: 0.1283
exchange: 0.1199
volume: 0.1153
session: 0.1111

Topic 1:
traded: 0.3180
seven: 0.1424
gmt: 0.1418
lowest: 0.1389
nvidia: 0.1333
day: 0.1326
highest: 0.1283
exchange: 0.1199
volume: 0.1153
session: 0.1111

Topic 2:
traded: 0.3173
seven: 0.1421
gmt: 0.1415
lowest: 0.1386
day: 0.1346
nvidia: 0.1330
highest: 0.1280
exchange: 0.1196
volume: 0.1151
session: 0.1109

Topic 3:
at40: 0.0463
t2108: 0.0446
day: 0.0437
50dma: 0.0417
trading: 0.0353
call: 0.0353
vix: 0.0280
respective: 0.0269
support: 0.0243
option: 0.0239

Topic 4:
traded: 0.3180
seven: 0.1424
gmt: 0.1418
lowest: 0.1389
nvidia: 0.1333
day: 0.1326
highest: 0.1283
exchange: 0.1199
volume: 0.1153
session: 0.1111

Topic 5:
traded: 0.

In [29]:
    # Iteration 5
    bert_topic5, docs5, topics5, probabilities5 = iteration_5(df_nvidia)


--- Iteration 5: Adjusted Model Parameters with TF-IDF and Spellchecking ---
Topic Coherence Score: 0.6570774210769069
Topic Diversity Score: 0.625

Topics for Iteration 5:

Topic 0:
share past: 0.7822
past: 0.7719
past share: 0.7488
share: 0.6200
: 0.0000
: 0.0000
: 0.0000
: 0.0000
: 0.0000
: 0.0000

Topic 1:
past share: 0.7751
share past: 0.7730
past: 0.7628
share: 0.6126
: 0.0000
: 0.0000
: 0.0000
: 0.0000
: 0.0000
: 0.0000

Topic 2:
close: 0.0929
performer: 0.0763
point: 0.0737
settle: 0.0553
corporation: 0.0528
worst: 0.0447
worst performer: 0.0444
late: 0.0436
lost settle: 0.0386
oil: 0.0381

Topic 3:
year: 0.0283
rank: 0.0132
estimate: 0.0125
investor: 0.0117
buy: 0.0114
strong: 0.0109
new: 0.0109
semiconductor: 0.0100
consensus: 0.0097
data: 0.0096


In [None]:
# For Iteration 3
bert_topic3.visualize_topics().show()
bert_topic3.visualize_heatmap().show()
