In [1]:
import pandas as pd

# Load the dataset from the CSV file
news_df = pd.read_csv('news_cleaned_no_spaces.csv')

news_df.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond..."
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...


In [2]:
news_df.isnull().sum()

title                         0
url                           0
time_published            11589
authors                    3417
summary                       0
banner_image               2225
source                        0
category_within_source    14483
source_domain                 0
topics                        0
ticker                        0
ticker_sentiment_score        0
ticker_sentiment_label        0
news_text                     0
dtype: int64

In [3]:
print(news_df['ticker_sentiment_label'].value_counts())

# Define the valid sentiment labels
valid_labels = ['Bullish', 'Bearish', 'Neutral', 'Somewhat-Bullish', 'Somewhat-Bearish']

# Keep only the rows where 'ticker_sentiment_label' is in valid_labels
news_df = news_df[news_df['ticker_sentiment_label'].isin(valid_labels)]

def simplify_sentiment(label):
    if 'Bullish' in label:
        return 'Bullish'
    elif 'Bearish' in label:
        return 'Bearish'
    else:
        return 'Neutral'

news_df['golden_sentiment'] = news_df['ticker_sentiment_label'].apply(simplify_sentiment)

news_df['golden_sentiment'].value_counts()

ticker_sentiment_label
Neutral             14708
Somewhat-Bullish     7586
Bullish              3264
Somewhat-Bearish      702
Bearish               159
Name: count, dtype: int64


golden_sentiment
Neutral    14708
Bullish    10850
Bearish      861
Name: count, dtype: int64

In [4]:
from sklearn.utils import resample

# Separate the classes
df_minority = news_df[news_df.golden_sentiment == 'Bearish']
df_majority1 = news_df[news_df.golden_sentiment == 'Bullish']
df_majority2 = news_df[news_df.golden_sentiment == 'Neutral'] 

# Downsample the majority classes to match the number of samples in the minority class
df_majority1_downsampled = resample(df_majority1, replace=False, n_samples=861, random_state=42)
df_majority2_downsampled = resample(df_majority2, replace=False, n_samples=861, random_state=42)

# Combine the minority class and downsampled majority classes
df_resampled = pd.concat([df_minority, df_majority1_downsampled, df_majority2_downsampled])

# Check the new class counts
print(df_resampled.golden_sentiment.value_counts())

golden_sentiment
Bearish    861
Bullish    861
Neutral    861
Name: count, dtype: int64


In [5]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

# Load Aspect-Based Sentiment Analysis model
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = TFAutoModelForSequenceClassification \
  .from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

# Load a traditional Sentiment Analysis model
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline("sentiment-analysis", model=sentiment_model_path,
                          tokenizer=sentiment_model_path)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['deberta.embeddings.position_ids']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDebertaV2ForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForSequenceClassification for predictions without further training.


In [7]:
%%time

from tqdm import tqdm
import tensorflow as tf

# Initialize empty lists to store the results
negative = []
neutral = []
positive = []

# Iterate over the rows of the DataFrame
for _, row in tqdm(df_resampled.iterrows(), total=df_resampled.shape[0]):
    # Get the news text and the ticker
    news_text = row['news_text']
    ticker = row['ticker']

    # Perform ABSA
    inputs = absa_tokenizer(f"[CLS] {news_text} [SEP] {ticker} [SEP]", return_tensors="tf")
    outputs = absa_model(inputs)
    probs = tf.nn.softmax(outputs.logits, axis=1)
    probs = probs.numpy()[0]

    # Store the results
    negative.append(probs[0])
    neutral.append(probs[1])
    positive.append(probs[2])

# Append the results to the original DataFrame
df_resampled['predicted_Bearish'] = negative
df_resampled['predicted_Neutral'] = neutral
df_resampled['predicted_Bullish'] = positive

100%|██████████| 2583/2583 [2:46:27<00:00,  3.87s/it]   

CPU times: user 8h 22min 46s, sys: 4h 3min 56s, total: 12h 26min 42s
Wall time: 2h 46min 27s





In [8]:
# Define a mapping from column names to labels
label_mapping = {
    'predicted_Bearish': 'Bearish',
    'predicted_Neutral': 'Neutral',
    'predicted_Bullish': 'Bullish'
}

# Find the column name with the maximum value for each row
df_resampled['predicted_Sentiment'] = df_resampled[['predicted_Bearish', 'predicted_Neutral', 'predicted_Bullish']].idxmax(axis=1)

# Map the column names to labels
df_resampled['predicted_Sentiment'] = df_resampled['predicted_Sentiment'].map(label_mapping)

df_resampled['predicted_Sentiment'].head()

26     Neutral
62     Neutral
138    Neutral
145    Bearish
151    Neutral
Name: predicted_Sentiment, dtype: object

In [10]:
from sklearn.metrics import classification_report

# Generate the classification report
report = classification_report(df_resampled['golden_sentiment'], df_resampled['predicted_Sentiment'])

print(report)

              precision    recall  f1-score   support

     Bearish       0.66      0.29      0.40       861
     Bullish       0.44      0.66      0.53       861
     Neutral       0.30      0.32      0.31       861

    accuracy                           0.42      2583
   macro avg       0.47      0.42      0.41      2583
weighted avg       0.47      0.42      0.41      2583

