In [28]:
import pandas as pd
import nltk as nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from tqdm import tqdm

In [29]:
news_df = pd.read_csv('data_with_relevant_sentences.csv')
news_df.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text,Unnamed: 14,keywords,company_name,relevant_sentences,processed_relevant_sentences
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond...",,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","['On February 6, 2024, NVDA partnered together...","['On February 6, 2024, NVDA partnered together..."
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.",['I believe three underappreciated blue chip s...,['I believe three underappreciated blue chip s...
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.",['Redditor u/waterlimes sparked a conversation...,['Redditor u/waterlimes sparked a conversation...
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","[""Deep-pocketed investors have adopted a bulli...","[""Deep-pocketed investors have adopted a bulli..."
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...,,"['csco', 'cisco', 'systems']","Cisco Systems, Inc.","['This week, for example, it announced a partn...","['This week, for example, it announced a partn..."


In [30]:
# Removing relevant (for sentiment analysis) columns that are NA
news_df = news_df.dropna(subset=['ticker','relevant_sentences'])
news_df.isnull().sum()

title                               0
url                                 0
time_published                  11589
authors                          3418
summary                             0
banner_image                     2226
source                              0
category_within_source          14478
source_domain                       0
topics                              0
ticker                              0
ticker_sentiment_score              0
ticker_sentiment_label              0
news_text                           0
Unnamed: 14                     26437
keywords                            0
company_name                       25
relevant_sentences                  0
processed_relevant_sentences        0
dtype: int64

In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  # Download Wordnet for lemmatization

# Initialize a WordNetLemmatizer for lemmatization
lemmatizer = WordNetLemmatizer()

# Define stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the sentence
    tokens = word_tokenize(text)
    # Convert to lower case
    tokens = [word.lower() for word in tokens]
    # Remove punctuation and numbers
    tokens = [word for word in tokens if word.isalpha()]
    # Remove stop words
    tokens = [word for word in tokens if not word in stop_words]
    # Perform lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Apply the preprocessing to the 'relevant_sentences' column
news_df['relevant_sentences'] = news_df['relevant_sentences'].apply(preprocess_text)

news_df['relevant_sentences'].head()

[nltk_data] Downloading package punkt to /Users/adam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/adam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [february, nvda, partnered, together, cisco, c...
1    [believe, three, underappreciated, blue, chip,...
2    [sparked, conversation, drawing, parallel, nvi...
3    [investor, adopted, bullish, approach, towards...
4    [week, example, announced, partnership, cisco,...
Name: relevant_sentences, dtype: object

In [32]:
# Removing rows with no tokens
news_df = news_df[news_df['relevant_sentences'].apply(lambda x: len(x) > 0)]

In [33]:
print(news_df['ticker_sentiment_label'].value_counts())

# Define the valid sentiment labels
valid_labels = ['Bullish', 'Bearish', 'Neutral', 'Somewhat-Bullish', 'Somewhat-Bearish']

# Keep only the rows where 'ticker_sentiment_label' is in valid_labels
news_df = news_df[news_df['ticker_sentiment_label'].isin(valid_labels)]

def simplify_sentiment(label):
    if 'Bullish' in label:
        return 'Bullish'
    elif 'Bearish' in label:
        return 'Bearish'
    else:
        return 'Neutral'

news_df['golden_sentiment'] = news_df['ticker_sentiment_label'].apply(simplify_sentiment)

news_df['golden_sentiment'].value_counts()

ticker_sentiment_label
Neutral                                                                                                                          12727
Somewhat-Bullish                                                                                                                  6737
Bullish                                                                                                                           3056
Somewhat-Bearish                                                                                                                   606
Bearish                                                                                                                            146
 respectively (December 2022 - expense of $214 and a recovery of $145). These amounts are included in direct cost of revenues       12
 net other consisted of $2.2 million of net other                                                                                    1
 consisted of $1.8 million of ne

golden_sentiment
Neutral    12727
Bullish     9793
Bearish      752
Name: count, dtype: int64

In [34]:
from sklearn.utils import resample

# Separate the classes
df_bearish = news_df[news_df.golden_sentiment == 'Bearish']
df_bullish = news_df[news_df.golden_sentiment == 'Bullish']
df_neutral = news_df[news_df.golden_sentiment == 'Neutral']

# Find the number of samples in the middle class
n_samples = df_bullish.shape[0]  # Assuming 'Bullish' has a count in between 'Bearish' and 'Neutral'

# Upsample 'Bearish' class and downsample 'Neutral' class to match the number of samples in 'Bullish' class
df_bearish_upsampled = resample(df_bearish, replace=True, n_samples=n_samples, random_state=42)
df_neutral_downsampled = resample(df_neutral, replace=False, n_samples=n_samples, random_state=42)

# Combine the resampled classes
df_resampled = pd.concat([df_bearish_upsampled, df_bullish, df_neutral_downsampled])

# Check the new class counts
print(df_resampled.golden_sentiment.value_counts())

golden_sentiment
Bearish    9793
Bullish    9793
Neutral    9793
Name: count, dtype: int64


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Convert the list of tokens back into a string
df_resampled['processed_text'] = df_resampled['relevant_sentences'].apply(' '.join)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_resampled['processed_text'], df_resampled['golden_sentiment'], test_size=0.2, random_state=42)

# Initialize a CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
X_train_vect = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_vect = vectorizer.transform(X_test)

# Initialize a MultinomialNB
nb = MultinomialNB()

# Fit the model
nb.fit(X_train_vect, y_train)

# Predict the labels of the test set
y_pred = nb.predict(X_test_vect)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Bearish       0.79      0.91      0.85      1933
     Bullish       0.65      0.74      0.69      2005
     Neutral       0.71      0.49      0.58      1938

    accuracy                           0.72      5876
   macro avg       0.72      0.72      0.71      5876
weighted avg       0.71      0.72      0.71      5876

