In [61]:
import pandas as pd
import nltk as nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from tqdm import tqdm

In [62]:
news_df = pd.read_csv('data_with_relevant_sentences.csv')
news_df.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text,Unnamed: 14,keywords,company_name,relevant_sentences,processed_relevant_sentences
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond...",,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","['On February 6, 2024, NVDA partnered together...","['On February 6, 2024, NVDA partnered together..."
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.",['I believe three underappreciated blue chip s...,['I believe three underappreciated blue chip s...
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.",['Redditor u/waterlimes sparked a conversation...,['Redditor u/waterlimes sparked a conversation...
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","[""Deep-pocketed investors have adopted a bulli...","[""Deep-pocketed investors have adopted a bulli..."
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","['This week, for example, it announced a partn...","['This week, for example, it announced a partn..."


In [63]:
# Removing relevant (for sentiment analysis) columns needed that are NA
news_df = news_df.dropna(subset=['ticker','relevant_sentences'])
news_df.isnull().sum()

title                               0
url                                 0
time_published                  11589
authors                          3418
summary                             0
banner_image                     2226
source                              0
category_within_source          14478
source_domain                       0
topics                              0
ticker                              0
ticker_sentiment_score              0
ticker_sentiment_label              0
news_text                           0
Unnamed: 14                     26437
keywords                            0
company_name                       25
relevant_sentences                  0
processed_relevant_sentences        0
dtype: int64

In [64]:
print(news_df['relevant_sentences'][0])
type(news_df['relevant_sentences'][0])

['On February 6, 2024, NVDA partnered together with Cisco (CSCO) to provide AI infrastructure solutions for the data center that are simple to deploy and operate, providing the tremendous processing capacity that organizations require to succeed in the AI era.', 'CSCO, with its industry-leading experience in Ethernet networking and broad partner ecosystem, and NVDA, the creator of the GPU that fuelled the AI boom, share a vision and commitment to assisting customers with the transition to AI with highly secure Ethernet-based infrastructure.']


str

In [65]:
import ast
# Relevant sentences is a string representation of a list, so we need to convert it back to a list
# Convert the string representations into actual lists
news_df['relevant_sentences'] = news_df['relevant_sentences'].apply(ast.literal_eval)

In [66]:
print(news_df['relevant_sentences'][0])
type(news_df['relevant_sentences'][0])

['On February 6, 2024, NVDA partnered together with Cisco (CSCO) to provide AI infrastructure solutions for the data center that are simple to deploy and operate, providing the tremendous processing capacity that organizations require to succeed in the AI era.', 'CSCO, with its industry-leading experience in Ethernet networking and broad partner ecosystem, and NVDA, the creator of the GPU that fuelled the AI boom, share a vision and commitment to assisting customers with the transition to AI with highly secure Ethernet-based infrastructure.']


list

In [67]:
# Removing rows where relevant_sentences is an empty list
print(f'Before: {news_df.info}')
# Identify rows where 'relevant_sentences' is an empty list
empty_lists = news_df['relevant_sentences'].apply(lambda x: not x)

# Drop these rows from the DataFrame
news_df = news_df[~empty_lists]
print(f'After: {news_df.info}')

Before: <bound method DataFrame.info of                                                    title  \
0      NVDA: Will These Semiconductor Stocks Deliver ...   
1                   3 Cheap Tech Stocks to Buy Right Now   
2      Nvidia's Valuation Sparks Reddit Debate: Echoe...   
3      Spotlight on Cisco Systems: Analyzing the Surg...   
4      If You Like Nvidia, Then You Will Love These 2...   
...                                                  ...   
28176  Can Market Sustain Upward Momentum In Tech-Hea...   
28177   Lam Research (LRCX) Stock Price, News & Analysis   
28178  Nasdaq, S&P Futures Take Off As Tesla's Solid ...   
28179  Lam Research (LRCX) Q4 2022 Earnings Call Tran...   
28183  Dutch lithography gear giant ASML expects stea...   

                                                     url  \
0      https://stocknews.com/news/nvda-tsm-avgo-csco-...   
1      https://www.fool.com/investing/2024/02/12/3-ch...   
2      https://www.benzinga.com/trading-ideas/long-id...   

In [68]:
# news_df['ticker_sentiment_label'].value_counts()

# Define the valid sentiment labels
valid_labels = ['Bullish', 'Bearish', 'Neutral', 'Somewhat-Bullish', 'Somewhat-Bearish']

# Keep only the rows where 'ticker_sentiment_label' is in valid_labels
news_df = news_df[news_df['ticker_sentiment_label'].isin(valid_labels)]

def simplify_sentiment(label):
    if 'Bullish' in label:
        return 'Bullish'
    elif 'Bearish' in label:
        return 'Bearish'
    else:
        return 'Neutral'

news_df['golden_sentiment'] = news_df['ticker_sentiment_label'].apply(simplify_sentiment)

news_df['golden_sentiment'].value_counts()

golden_sentiment
Neutral    12742
Bullish     9798
Bearish      752
Name: count, dtype: int64

In [69]:
# Initialize the VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# NB: Tried tailoring for finance-related words, but accuracy worsens when adding the below words.

# new_words = {
#     'profit': 2.0,
#     'profits': 2.0,
#     'profitable': 2.0,
#     'gain': 2.0,
#     'gains': 2.0,
#     'growth': 2.0,
#     'expand': 2.0,
#     'expansion': 2.0,
#     'bullish': 2.0,
#     'surge': 2.0,
#     'jump': 1.0,
#     'positive': 2.0,
#     'success': 2.0,
#     'healthy': 2.0,
#     'strong': 2.0,
#     'increase': 1.0,
#     'improve': 1.0,
#     'improvement': 1.0,
#     'up': 1.0,
#     'win': 2.0,
#     'winner': 2.0,
#     'beat': 1.0,
#     'dividend': 2.0,
#     'bonus': 2.0,
#     'reward': 2.0,
#     'innovation': 2.0,
#     'innovative': 2.0,
#     'leader': 2.0,
#     'leading': 2.0,
#     'top': 2.0,
#     'outperform': 2.0,
#     'best': 2.0,
#     'better': 1.0,
#     'good': 1.0,
#     'high': 1.0,
#     'excellent': 2.0,
#     'great': 2.0,
#     'buy': 1.0,
#     'sell': -1.0,
#     'loss': -2.0,
#     'losses': -2.0,
#     'lose': -2.0,
#     'loser': -2.0,
#     'lost': -2.0,
#     'worst': -2.0,
#     'risk': -1.0,
#     'risks': -1.0,
#     'risky': -2.0,
#     'debt': -2.0,
#     'debts': -2.0,
#     'owe': -1.0,
#     'owes': -1.0,
#     'owed': -1.0,
#     'default': -3.0,
#     'defaults': -3.0,
#     'bankrupt': -3.0,
#     'bankruptcy': -3.0,
#     'bearish': -2.0,
#     'negative': -2.0,
#     'fail': -2.0,
#     'fails': -2.0,
#     'failing': -2.0,
#     'failure': -2.0,
#     'weak': -2.0,
#     'weakness': -2.0,
#     'decline': -2.0,
#     'drop': -1.0,
#     'fall': -1.0,
#     'crash': -3.0,
#     'down': -1.0,
#     'decrease': -1.0,
#     'deteriorate': -2.0,
#     'deterioration': -2.0,
#     'worsen': -2.0,
#     'worse': -2.0,
#     'low': -1.0,
#     'penalty': -2.0,
#     'penalties': -2.0,
# }

# sia.lexicon.update(new_words)

# Function to calculate VADER sentiment scores
def calculate_sentiment_scores(sentences):
    scores = [sia.polarity_scores(sentence) for sentence in sentences]
    avg_scores = {key: sum(score[key] for score in scores)/len(scores) for key in scores[0]}
    
    # Determine the sentiment label based on the average compound score
    if avg_scores['compound'] > 0.05:
        sentiment = 'Bullish'
    elif avg_scores['compound'] < -0.05:
        sentiment = 'Bearish'
    else:
        sentiment = 'Neutral'
    
    return avg_scores['compound'], sentiment

# Apply the function to the 'relevant_sentences' column
tqdm.pandas(desc="Calculating sentiment scores")
news_df[['vader_sentiment_scores', 'vader_sentiment']] = news_df['relevant_sentences'].progress_apply(calculate_sentiment_scores).apply(pd.Series)

Calculating sentiment scores: 100%|██████████| 23292/23292 [00:19<00:00, 1208.67it/s]


In [70]:
news_df.head()
news_df['vader_sentiment'].value_counts()

vader_sentiment
Bullish    17639
Neutral     3906
Bearish     1747
Name: count, dtype: int64

In [71]:
from sklearn.metrics import classification_report

# Calculate the classification metrics
report = classification_report(news_df['golden_sentiment'], news_df['vader_sentiment'], labels=['Bullish', 'Neutral', 'Bearish'])

print(report)

              precision    recall  f1-score   support

     Bullish       0.51      0.92      0.66      9798
     Neutral       0.79      0.24      0.37     12742
     Bearish       0.24      0.55      0.33       752

    accuracy                           0.54     23292
   macro avg       0.51      0.57      0.45     23292
weighted avg       0.65      0.54      0.49     23292

