In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import pandas as pd

# Load the dataset
data_path = "/Users/vishalsehgal/Desktop/NLP/Assignment-1/Dataset/assignment-2-data.csv"
df = pd.read_csv(data_path)

# Filter articles related to Nvidia
df_nvidia = df[df['clean_content'].str.contains("nvidia", case=False, na=False)].copy()

# Reset index
df_nvidia.reset_index(drop=True, inplace=True)

# Preview the DataFrame
print(df_nvidia.head())


       id ticker                                              title category  \
0  221539    NIO  A Central Bank War Just Started And Its Good F...  opinion   
1  221547    NIO         6 Stocks To Watch  Nivida Could Be Falling  opinion   
2  221572    NIO  Stocks   Dow Drops Nearly 400 Points as Apple ...     news   
3  221597   UBER                     The Best Of CES 2020  Revised   opinion   
4  221614   UBER                               The Best Of CES 2020  opinion   

                                             content release_date  \
0  ECB Effects\nThe move in the euro was huge  fa...   2019-03-07   
1  6 Stocks To Watch  March 6 Trading Session\nSt...   2019-03-06   
2  Investing com   A rout in Apple and Facebook  ...   2018-11-19   
3  With 4 500 companies bringing their innovation...   2020-01-16   
4  With 4 500 companies bringing their innovation...   2020-01-10   

                    provider  \
0             Michael Kramer   
1             Michael Kramer   
2       

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def preprocess_text_v1(text):
    """
    Original preprocessing function (Preprocessing v1).
    """
    # Check if the input is a valid string
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation except for '$' and '%'
    text = ''.join([char for char in text if char not in string.punctuation or char in ['$','%']])
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Load stopwords in English and Spanish
    stop_words_en = set(stopwords.words('english'))
    stop_words_es = set(stopwords.words('spanish'))
    stop_words = stop_words_en.union(stop_words_es)
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Remove single-character tokens
    words = [word for word in words if len(word) > 1]
    
    # Filter unwanted numbers, preserving financial symbols and units
    units = ['million', 'billion', 'trillion', 'thousand', 'hundred']
    processed_words = []
    i = 0
    while i < len(words):
        word = words[i]
        keep_word = True
        if word.isdigit():
            keep_word = False
            if i > 0 and words[i - 1] in ['$', 'usd']:
                keep_word = True
            elif i + 1 < len(words) and (words[i + 1] in ['%', 'percent'] or words[i + 1] in units):
                keep_word = True
            elif '$' in word or '%' in word:
                keep_word = True
        if keep_word:
            processed_words.append(word)
        i += 1
    
    # Join the words back into a string
    clean_text = ' '.join(processed_words)
    return clean_text


In [None]:
# Define custom stop words
custom_stop_words = [
    "quarter", "week", "today", "period", "session", "billion", "million", "percent", "index",
    "investment", "earnings", "revenue", "growth", "rate", "high", "volume", "stock", "target",
    "price", "analyst", "market", "fund", "asset", "allocation", "solution", "client", "report",
    "forecast", "record", "technology", "industry", "system", "platform", "device", "company",
    "digital", "sector", "day", "seven", "one", "said",
    # Additional stop words to address duplicate topics
    "traded", "gmt", "exchange", "lowest", "highest", "volume", "session", "start", "range",
    "investing", "com", "rose", "fell", "trade", "today", "yesterday", "week", "monday", "tuesday",
    "wednesday", "thursday", "friday", "nasdaq", "nyse"
]


In [6]:
def preprocess_text_v2(text):
    """
    Preprocessing v2: Preprocessing v1 with custom stop words added.
    """
    # Start with preprocessing v1
    # Check if the input is a valid string
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation except for '$' and '%'
    text = ''.join([char for char in text if char not in string.punctuation or char in ['$','%']])
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Load stopwords in English and Spanish
    stop_words_en = set(stopwords.words('english'))
    stop_words_es = set(stopwords.words('spanish'))
    stop_words = stop_words_en.union(stop_words_es)
    
    # Add custom stop words
    custom_stop_words_lower = set([word.lower() for word in custom_stop_words])
    all_stop_words = stop_words.union(custom_stop_words_lower)
    
    # Remove stop words
    words = [word for word in words if word not in all_stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Remove single-character tokens
    words = [word for word in words if len(word) > 1]
    
    # Filter unwanted numbers, preserving financial symbols and units
    units = ['million', 'billion', 'trillion', 'thousand', 'hundred']
    processed_words = []
    i = 0
    while i < len(words):
        word = words[i]
        keep_word = True
        if word.isdigit():
            keep_word = False
            if i > 0 and words[i - 1] in ['$', 'usd']:
                keep_word = True
            elif i + 1 < len(words) and (words[i + 1] in ['%', 'percent'] or words[i + 1] in units):
                keep_word = True
            elif '$' in word or '%' in word:
                keep_word = True
        if keep_word:
            processed_words.append(word)
        i += 1
    
    # Join the words back into a string
    clean_text = ' '.join(processed_words)
    return clean_text


In [7]:
# Import libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
import random
import torch

# For evaluation
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import silhouette_score


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)


In [9]:
# Configure UMAP model
def create_umap_model():
    return UMAP(
        random_state=42,
        n_neighbors=15,
        min_dist=0.1,
        metric='cosine',
        n_jobs=1
    )


In [10]:

# Configure HDBSCAN model
def create_hdbscan_model():
    return HDBSCAN(
        min_cluster_size=10,
        min_samples=10,
        prediction_data=True,
        core_dist_n_jobs=1
    )


In [11]:
def create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=10):
    return BERTopic(
        vectorizer_model=vectorizer_model,
        min_topic_size=min_topic_size,
        calculate_probabilities=True,
        embedding_model='all-MiniLM-L6-v2',
        umap_model=umap_model,
        hdbscan_model=hdbscan_model
    )


In [12]:
# Calculate topic coherence
def calculate_coherence(bert_topic, docs):
    texts = [doc.split() for doc in docs]
    dictionary = Dictionary(texts)
    topics = [[word for word, _ in bert_topic.get_topic(topic_id)] for topic_id in bert_topic.get_topics().keys()]
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

# Calculate topic diversity
def calculate_diversity(bert_topic, top_n=10):
    topics = bert_topic.get_topics()
    unique_words = set()
    total_words = 0
    for topic_id in topics:
        words = [word for word, _ in topics[topic_id][:top_n]]
        unique_words.update(words)
        total_words += len(words)
    return len(unique_words) / total_words

# Display topics
def display_topics(bert_topic, iteration):
    print(f"\nTopics for Iteration {iteration}:")
    topic_info = bert_topic.get_topic_info()
    for topic_num in topic_info['Topic'].unique():
        if topic_num == -1:
            continue  # Skip outliers
        print(f"\nTopic {topic_num}:")
        for word, score in bert_topic.get_topic(topic_num):
            print(f"{word}: {score:.4f}")


In [13]:
def iteration_1(df_nvidia):
    print("\n--- Iteration 1: Initial Model ---")
    set_seeds()
    
    # Preprocessing using preprocess_text_v1
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v1)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(lowercase=True)
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=10)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=1)
    
    return bert_topic, docs, topics, probabilities

In [14]:
def iteration_2(df_nvidia):
    print("\n--- Iteration 2: Improved Preprocessing with Custom Stop Words ---")
    set_seeds()
    
    # Preprocessing using the updated function
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v2)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(lowercase=True)
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=10)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=2)
    
    return bert_topic, docs, topics, probabilities

In [15]:
def iteration_3(df_nvidia):
    print("\n--- Iteration 3: Adjusted Model Parameters ---")
    set_seeds()
    
    # Preprocessing
    df_nvidia['Cleaned_Content'] = df_nvidia['clean_content'].apply(preprocess_text_v2)
    docs = df_nvidia['Cleaned_Content'].tolist()
    
    # Initialize models
    umap_model = create_umap_model()
    hdbscan_model = create_hdbscan_model()
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=True
    )
    bert_topic = create_bertopic_model(umap_model, hdbscan_model, vectorizer_model, min_topic_size=30)
    
    # Fit the model
    topics, probabilities = bert_topic.fit_transform(docs)
    
    # Evaluate
    coherence = calculate_coherence(bert_topic, docs)
    diversity = calculate_diversity(bert_topic)
    print("Topic Coherence Score:", coherence)
    print("Topic Diversity Score:", diversity)
    
    # Display topics
    display_topics(bert_topic, iteration=3)
    
    return bert_topic, docs, topics, probabilities


In [16]:
if __name__ == "__main__":
    # Iteration 1
    bert_topic1, docs1, topics1, probabilities1 = iteration_1(df_nvidia)
    
    # Iteration 2
    bert_topic2, docs2, topics2, probabilities2 = iteration_2(df_nvidia)
    
    # Iteration 3
    bert_topic3, docs3, topics3, probabilities3 = iteration_3(df_nvidia)



--- Iteration 1: Initial Model ---


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Topic Coherence Score: 0.607047471602112
Topic Diversity Score: 0.5056603773584906

Topics for Iteration 1:

Topic 0:
nyse: 0.0151
trade: 0.0132
nasdaq: 0.0125
week: 0.0121
dow: 0.0119
point: 0.0114
index: 0.0113
market: 0.0112
oil: 0.0111
china: 0.0109

Topic 1:
nvidia: 0.0302
gaming: 0.0184
quarter: 0.0170
year: 0.0167
revenue: 0.0156
billion: 0.0147
company: 0.0139
fiscal: 0.0128
estimate: 0.0127
share: 0.0125

Topic 2:
million: 0.0325
quarter: 0.0282
year: 0.0242
revenue: 0.0236
zacks: 0.0214
company: 0.0186
cent: 0.0180
consensus: 0.0163
rank: 0.0159
earnings: 0.0152

Topic 3:
amd: 0.0778
graphic: 0.0288
radeon: 0.0267
gpu: 0.0239
processor: 0.0234
ryzen: 0.0196
card: 0.0164
year: 0.0159
micro: 0.0155
epyc: 0.0154

Topic 4:
resistance: 0.0458
around: 0.0330
towards: 0.0275
stock: 0.0275
break: 0.0264
roku: 0.0259
nasdaq: 0.0228
rise: 0.0220
higher: 0.0217
support: 0.0208

Topic 5:
vehicle: 0.0525
driving: 0.0522
car: 0.0472
self: 0.0456
autonomous: 0.0385
mobileye: 0.0199
technolo

In [17]:
# For Iteration 3
bert_topic3.visualize_topics().show()
bert_topic3.visualize_heatmap().show()
