In [39]:
from bertopic import BERTopic
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [40]:
# Fetch the NLTK data packages that the preprocessing cell relies on
# (stopword lists, Punkt tokenizer models, WordNet).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still echoed.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vishalsehgal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
# Location of the assignment dataset.
# NOTE(review): absolute, machine-specific path — the notebook only runs as-is on this machine.
DATASET_CSV = "/Users/vishalsehgal/Documents/University/NLP/NLP/Group Assignment/NLP/Assignment-1/Dataset/assignment-2-data.csv"
df = pd.read_csv(DATASET_CSV)

# Keep the text of every article that mentions "nvidia" (case-insensitive);
# rows with missing 'clean_content' are treated as non-matching.
mentions_nvidia = df['clean_content'].str.contains("nvidia", case=False, na=False)
nvidia_articles = df.loc[mentions_nvidia, 'clean_content'].tolist()

In [42]:
# Stopword vocabulary: NLTK's English and Spanish lists merged into a single set.
stop_words_en = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))
stop_words = stop_words_en | stop_words_es

# Clean text: strip punctuation, lowercase, drop single letters, tokenize,
# remove stopwords, and lemmatize.
def preprocess_text(doc):
    """Normalise one document into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Coerce ``doc`` to ``str`` and replace every non-word character with a space.
      2. Lowercase the text.
      3. Remove stand-alone single letters ``a``-``z`` (e.g. the ``s`` left behind
         by a possessive once its apostrophe has been replaced).
      4. Collapse runs of whitespace.
      5. Tokenize with ``nltk.word_tokenize``, drop tokens found in the
         module-level ``stop_words`` set, and lemmatize the rest.

    Args:
        doc: The raw document. Any object is accepted; it is passed through ``str()``.

    Returns:
        The surviving lemmatized tokens joined by single spaces
        (an empty string when nothing survives).
    """
    doc = re.sub(r'\W', ' ', str(doc))
    doc = doc.lower()
    # Lookarounds leave the surrounding whitespace unconsumed, so every letter in a
    # run such as "x y z" is matched, as are single letters at the very start or end.
    doc = re.sub(r'(?<!\S)[a-z](?!\S)', ' ', doc)
    doc = re.sub(r'\s+', ' ', doc)
    tokens = nltk.word_tokenize(doc)
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Run every NVIDIA article through preprocess_text. The name is rebound to the
# cleaned list, so re-running this cell cleans already-cleaned text again.
nvidia_articles = list(map(preprocess_text, nvidia_articles))


In [43]:
# First pass: BERTopic with its default configuration, fitted on the cleaned articles.
# NOTE(review): no random seed is set here, so results may vary between runs — confirm.
bert_topic_model = BERTopic()
fit_output = bert_topic_model.fit_transform(nvidia_articles)
topics, probabilities = fit_output

In [44]:
# Summary table of the fitted topics (topic id, document count, name, representation).
topic_info = bert_topic_model.get_topic_info()
# Bare final expression: Jupyter renders the DataFrame as a rich table instead of
# the wrapped plain-text dump that print() produces.
topic_info


    Topic  Count                                 Name  \
0      -1   1061         -1_company_stock_year_nasdaq   
1       0    140       0_million_quarter_revenue_year   
2       1    119       1_amd_graphic_radeon_processor   
3       2     70           2_vehicle_driving_car_self   
4       3     67  3_analyst_zacks_research_investment   
..    ...    ...                                  ...   
60     59     11           59_traded_seven_gmt_lowest   
61     60     11          60_blockchain_etf_fund_blcn   
62     61     11    61_cadence_design_verification_ip   
63     62     10          62_tsmc_samsung_chip_memory   
64     63     10    63_nasdaq_parent_midday_investing   

                                       Representation  \
0   [company, stock, year, nasdaq, market, earning...   
1   [million, quarter, revenue, year, zacks, cent,...   
2   [amd, graphic, radeon, processor, gpu, ryzen, ...   
3   [vehicle, driving, car, self, autonomous, tesl...   
4   [analyst, zacks, research,

In [45]:

# Print the (term, score) pairs of every topic listed in topic_info,
# skipping topic -1 (the catch-all / outlier id in BERTopic).
for topic_id in topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"Topic {topic_id}:")
    print(bert_topic_model.get_topic(topic_id))

Topic 0:
[('million', 0.030487371346953437), ('quarter', 0.026355577600872558), ('revenue', 0.022317622686920252), ('year', 0.02148848464892058), ('zacks', 0.019292111915568497), ('cent', 0.016809057047680487), ('company', 0.016751358320266144), ('gaap', 0.015468866923273887), ('consensus', 0.015370797204749249), ('rank', 0.014724712314913438)]
Topic 1:
[('amd', 0.0619426142003229), ('graphic', 0.0235209633811003), ('radeon', 0.021839438837556513), ('processor', 0.021706644420291566), ('gpu', 0.019743371053652516), ('ryzen', 0.017345339981370715), ('epyc', 0.015152270939372717), ('year', 0.014454022312165938), ('revenue', 0.013411223648664362), ('card', 0.012901616829985243)]
Topic 2:
[('vehicle', 0.04765898180043747), ('driving', 0.047202658963867425), ('car', 0.042474590377885174), ('self', 0.04082171099340391), ('autonomous', 0.035574415144220155), ('tesla', 0.01887988034298657), ('mobileye', 0.01811948251642927), ('technology', 0.016406649748489047), ('motor', 0.016173204463762306)

In [46]:
# Bar charts of the top-scoring terms for the leading topics.
barchart_fig = bert_topic_model.visualize_barchart()
barchart_fig

In [47]:
# Show how the topics group together as a hierarchy.
hierarchy_fig = bert_topic_model.visualize_hierarchy()
hierarchy_fig

In [48]:
# Heatmap of pairwise similarity between topics.
heatmap_fig = bert_topic_model.visualize_heatmap()
heatmap_fig

2nd iteration: custom vectorizer with bigrams and trigrams

In [50]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Second pass: let topic representations contain bigrams and trigrams.
# The vectorizer counts 1- to 3-grams and drops the combined stopword set.
ngram_span = (1, 3)
vectorizer_model = CountVectorizer(stop_words=list(stop_words), ngram_range=ngram_span)

# NOTE(review): BERTopic's docs suggest n_gram_range is not used once a custom
# CountVectorizer is supplied — confirm, then consider dropping it.
bert_topic_model = BERTopic(
    n_gram_range=ngram_span,
    min_topic_size=10,
    vectorizer_model=vectorizer_model,
)

# Refit on the same cleaned articles; this rebinds bert_topic_model, topics and probabilities.
topics, probabilities = bert_topic_model.fit_transform(nvidia_articles)

In [51]:
# Summary table for the second model; only the first five rows are shown.
topic_info = bert_topic_model.get_topic_info()
# Bare final expression: rendered as a rich table rather than print()'s wrapped text.
topic_info.head()


   Topic  Count                            Name  \
0     -1   1068    -1_company_stock_year_nasdaq   
1      0    133  0_million_quarter_year_revenue   
2      1    120    1_amd_graphic_processor_year   
3      2     69      2_driving_vehicle_car_self   
4      3     65            3_intel_ai_chip_data   

                                      Representation  \
0  [company, stock, year, nasdaq, market, zacks, ...   
1  [million, quarter, year, revenue, zacks, compa...   
2  [amd, graphic, processor, year, radeon, gpu, r...   
3  [driving, vehicle, car, self, self driving, au...   
4  [intel, ai, chip, data, technology, company, n...   

                                 Representative_Docs  
0  [immediate release chicago il may highlight im...  
1  [keeping earnings streak alive palo alto netwo...  
2  [advanced micro device nasdaq amd q4 earnings ...  
3  [intel corporation nasdaq intc recent deal win...  
4  [immediate releasechicago il december zacks co...  


In [52]:
# Print the (term, score) pairs for the topics listed in topic_info,
# skipping topic -1 (the catch-all / outlier id in BERTopic).
for topic_id in topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"Topic {topic_id}:")
    print(bert_topic_model.get_topic(topic_id))

Topic 0:
[('million', 0.017414244962858018), ('quarter', 0.015941987915629924), ('year', 0.01491665798252751), ('revenue', 0.012908274116902499), ('zacks', 0.011776945425046767), ('company', 0.010657534634929291), ('year year', 0.010185825351018907), ('cent', 0.008076421159806165), ('consensus', 0.007954317546458683), ('earnings', 0.007949960877550621)]
Topic 1:
[('amd', 0.030994771895163326), ('graphic', 0.010996013339652978), ('processor', 0.009887425573542706), ('year', 0.009562314494635413), ('radeon', 0.009235396980075437), ('gpu', 0.009003343490720786), ('revenue', 0.007755499742848212), ('ryzen', 0.007214286564178652), ('quarter', 0.007127683934089898), ('market', 0.0065182609304932386)]
Topic 2:
[('driving', 0.02278988187124435), ('vehicle', 0.022344574459432706), ('car', 0.02042566779880543), ('self', 0.018901865801109842), ('self driving', 0.018649517674262192), ('autonomous', 0.016026514380833197), ('tesla', 0.009723144631512417), ('technology', 0.009025820411531391), ('driv

3rd iteration: dimensionality reduction (UMAP) and clustering (HDBSCAN)

In [53]:
from umap import UMAP
from hdbscan import HDBSCAN

# Fixed seed so the UMAP projection, and therefore the resulting topics,
# can be reproduced from one run to the next.
RANDOM_SEED = 42

# UMAP: project to 5 components using cosine distance over 15 neighbours.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)

# HDBSCAN: clusters of at least 15 documents. prediction_data=True keeps the data
# needed to assign unseen documents later; it does not change the clustering itself.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Initialize BERTopic with the custom models. vectorizer_model is the n-gram
# CountVectorizer built in the 2nd-iteration cell, which must have been run first.
bert_topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
)

# Fit the model
topics, probabilities = bert_topic_model.fit_transform(nvidia_articles)

In [54]:
# Summary table of the topics found by the third model.
topic_info = bert_topic_model.get_topic_info()
# Bare final expression: Jupyter renders the DataFrame as a rich table instead of
# the wrapped plain-text dump that print() produces.
topic_info


    Topic  Count                                               Name  \
0      -1    886                       -1_stock_year_company_nasdaq   
1       0    322                     0_quarter_million_year_company   
2       1    120                       1_amd_graphic_year_processor   
3       2     77                            2_nyse_nasdaq_dow_stock   
4       3     75                          3_week_market_might_yield   
5       4     73                   4_resistance_stock_around_nasdaq   
6       5     69                         5_intel_ai_chip_technology   
7       6     68                         6_driving_vehicle_car_self   
8       7     65                  7_nvidia_price target_target_nvda   
9       8     64                            8_inc_close_nasdaq_nyse   
10      9     62                      9_nvidia_year_quarter_revenue   
11     10     57            10_zacks analyst_analyst_zacks_research   
12     11     57           11_semiconductor_earnings_stock_industry   
13    

In [55]:

# Print the (term, score) pairs of every topic listed in topic_info,
# skipping topic -1 (the catch-all / outlier id in BERTopic).
for topic_id in topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"Topic {topic_id}:")
    print(bert_topic_model.get_topic(topic_id))

Topic 0:
[('quarter', 0.013125433603620819), ('million', 0.01275080660074476), ('year', 0.012731290214838433), ('company', 0.011119930159479681), ('revenue', 0.01093741089694972), ('zacks', 0.010399606276772047), ('year year', 0.007649154253244714), ('earnings', 0.007547451541913227), ('zacks rank', 0.007048465290095045), ('rank', 0.0069365436997741595)]
Topic 1:
[('amd', 0.03325698303999769), ('graphic', 0.011655783952572003), ('year', 0.010524570951214236), ('processor', 0.010384512697871377), ('radeon', 0.009636132101793787), ('gpu', 0.00952085870053596), ('revenue', 0.008234803526090824), ('quarter', 0.00764414397304276), ('ryzen', 0.007510520718561247), ('market', 0.007147957601451946)]
Topic 2:
[('nyse', 0.012059335331746888), ('nasdaq', 0.010429548682368867), ('dow', 0.008402364312457446), ('stock', 0.007220359602035065), ('fell', 0.006653408968852405), ('index', 0.006509217633238066), ('point', 0.006451669670175064), ('rate', 0.006095890469574477), ('market', 0.0060929841812182