## Sentiment score generation using the FinBERT model
 - [FinBERT GitHub repository](https://github.com/ProsusAI/finBERT/tree/master)
 - [Hugging Face model card](https://huggingface.co/ProsusAI/finbert)

In [5]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show full cell text (no "..." truncation) when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [8]:
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)



In [12]:
# Short sample sentence used to smoke-test the model in the next cell.
test_string = "Stocks is bad"

In [13]:
# Smoke test: score one sentence and show which class each probability belongs to.
inputs = tokenizer(test_string, return_tensors='pt', padding=True)
logits = model(**inputs)[0]

# Convert logits to softmax probabilities.
probabilities = nn.functional.softmax(logits, dim=-1)

# The class order comes from the checkpoint's config rather than from convention.
# Print the index -> label mapping next to the scores so the column names used
# later (pos / neg / neutral at indices 0 / 1 / 2) can be checked against it.
print(model.config.id2label)
print(probabilities.detach().numpy().reshape((3,)))

[0.05239694 0.12453802 0.82306504]


In [14]:
def generate_sentiment_values(text: str) -> np.ndarray:
    """Score one piece of text with the module-level FinBERT ``model``.

    Args:
        text: The text to classify. Input longer than 512 tokens is truncated,
            because BERT-style models cannot embed positions beyond that limit
            and would otherwise fail on long articles.

    Returns:
        A NumPy array of shape ``(3,)`` holding the softmax probabilities in the
        model's class-index order. The rest of this notebook treats indices
        0 / 1 / 2 as positive / negative / neutral; that order is defined by
        ``model.config.id2label`` and should be checked there.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**inputs)[0]

    # Convert logits to softmax probabilities.
    probabilities = nn.functional.softmax(logits, dim=-1)

    return probabilities.detach().numpy().reshape((3,))

## FinBERT on New York Times news


1. Get sentiment for each article

In [37]:
# Load the raw NYT articles and keep only the columns used for scoring.
df = pd.read_csv('../data/nyt_headlines.csv')
# .copy() detaches the subset from the full frame so that the sentiment columns
# added later are written to an independent DataFrame (no SettingWithCopyWarning).
df = df[['timestamp','article_url','lead_paragraph','abstract','adjusted_date']].copy()

In [38]:
# Preview the loaded articles (the notebook display elides the middle rows).
df

Unnamed: 0,timestamp,article_url,lead_paragraph,abstract,adjusted_date
0,2019-09-30 19:20:11+00:00,https://www.nytimes.com/2019/09/30/business/bu...,Cannes has been a hub for meetings and events ...,While the largest gatherings still go to the b...,2019-10-01
1,2019-09-30 18:44:40+00:00,https://www.nytimes.com/2019/09/30/business/ec...,Congress didn’t unconstitutionally penalize De...,A federal judge rejected a suit by four states...,2019-10-01
2,2019-09-30 15:01:19+00:00,https://www.nytimes.com/2019/09/30/business/we...,WeWork shelved its plans for an initial public...,"The company, which built an office-space behem...",2019-09-30
3,2019-09-30 15:00:08+00:00,https://www.nytimes.com/2019/09/30/business/ja...,"TOKYO — Yasuo Sugiuchi can’t avoid death, but ...","The increase, to 10 percent from 8 percent on ...",2019-09-30
4,2019-09-30 14:34:31+00:00,https://www.nytimes.com/2019/09/30/business/gr...,"Seamless, the food delivery service started tw...",Restaurant owners say Grubhub’s business model...,2019-09-30
...,...,...,...,...,...
12046,2024-09-27 14:10:49+00:00,https://www.nytimes.com/2024/09/27/business/de...,"For the past two decades, European banks have ...",Investors are cheering a possible tie-up betwe...,2024-09-27
12047,2024-09-27 12:43:10+00:00,https://www.nytimes.com/2024/09/27/business/ec...,"Inflation cooled in August, the latest sign of...",Inflation is slowing so much that some economi...,2024-09-27
12048,2024-09-27 12:15:14+00:00,https://www.nytimes.com/2024/09/27/business/de...,"Over the past 48 hours, the biggest spectacle ...",The criminal charges against the embattled may...,2024-09-27
12049,2024-09-27 09:02:56+00:00,https://www.nytimes.com/2024/09/27/technology/...,Hours before former President Donald J. Trump ...,Almost a third of 171 posts last week from the...,2024-09-27


In [39]:
# Score every article twice: once on the raw text and once wrapped in an
# instruction preamble, then attach all six probability columns in one step.
preamble = "Evaluate the following news on S&P price."

# Column-name prefixes for output indices 0, 1, 2 of generate_sentiment_values.
sentiment_labels = ('pos', 'neg', 'neutral')

# Missing text is read as NaN; str(NaN) would send the literal word "nan" to the
# model, so replace missing values with an empty string before formatting.
lead_paragraphs = df['lead_paragraph'].fillna('').astype(str)
abstracts = df['abstract'].fillna('').astype(str)

plain_scores = []
preamble_scores = []
for lead_paragraph, abstract in zip(lead_paragraphs, abstracts):
    news_text = f"""{lead_paragraph}\n{abstract}"""
    plain_scores.append(generate_sentiment_values(news_text))

    prompted_text = f"""<instructions>{preamble}</instructions> <news>{lead_paragraph}\n{abstract}</news>"""
    preamble_scores.append(generate_sentiment_values(prompted_text))

# reshape(-1, 3) keeps the arrays two-dimensional even when df has no rows.
plain_scores = np.asarray(plain_scores, dtype=float).reshape(-1, 3)
preamble_scores = np.asarray(preamble_scores, dtype=float).reshape(-1, 3)

new_columns = {
    f'{label}_sentiment': plain_scores[:, k]
    for k, label in enumerate(sentiment_labels)
}
new_columns.update({
    f'{label}_sentiment_w_preamb': preamble_scores[:, k]
    for k, label in enumerate(sentiment_labels)
})

# assign() returns a new frame, so the columns are added without row-by-row
# writes and without mutating a possible view of the original CSV frame.
df = df.assign(**new_columns)

Save the scored articles to a temporary CSV first

In [40]:
# Preview the articles together with the six new sentiment columns.
df

Unnamed: 0,timestamp,article_url,lead_paragraph,abstract,adjusted_date,pos_sentiment,neg_sentiment,neutral_sentiment,pos_sentiment_w_preamb,neg_sentiment_w_preamb,neutral_sentiment_w_preamb
0,2019-09-30 19:20:11+00:00,https://www.nytimes.com/2019/09/30/business/bu...,Cannes has been a hub for meetings and events ...,While the largest gatherings still go to the b...,2019-10-01,0.054560,0.027506,0.917934,0.049412,0.025419,0.925169
1,2019-09-30 18:44:40+00:00,https://www.nytimes.com/2019/09/30/business/ec...,Congress didn’t unconstitutionally penalize De...,A federal judge rejected a suit by four states...,2019-10-01,0.062097,0.808197,0.129706,0.059578,0.785785,0.154637
2,2019-09-30 15:01:19+00:00,https://www.nytimes.com/2019/09/30/business/we...,WeWork shelved its plans for an initial public...,"The company, which built an office-space behem...",2019-09-30,0.035901,0.860062,0.104038,0.016129,0.929829,0.054042
3,2019-09-30 15:00:08+00:00,https://www.nytimes.com/2019/09/30/business/ja...,"TOKYO — Yasuo Sugiuchi can’t avoid death, but ...","The increase, to 10 percent from 8 percent on ...",2019-09-30,0.225155,0.715339,0.059506,0.712138,0.140160,0.147703
4,2019-09-30 14:34:31+00:00,https://www.nytimes.com/2019/09/30/business/gr...,"Seamless, the food delivery service started tw...",Restaurant owners say Grubhub’s business model...,2019-09-30,0.009128,0.963881,0.026991,0.009450,0.956868,0.033682
...,...,...,...,...,...,...,...,...,...,...,...
12046,2024-09-27 14:10:49+00:00,https://www.nytimes.com/2024/09/27/business/de...,"For the past two decades, European banks have ...",Investors are cheering a possible tie-up betwe...,2024-09-27,0.054860,0.737702,0.207437,0.124981,0.225523,0.649496
12047,2024-09-27 12:43:10+00:00,https://www.nytimes.com/2024/09/27/business/ec...,"Inflation cooled in August, the latest sign of...",Inflation is slowing so much that some economi...,2024-09-27,0.555174,0.399458,0.045368,0.662625,0.272857,0.064518
12048,2024-09-27 12:15:14+00:00,https://www.nytimes.com/2024/09/27/business/de...,"Over the past 48 hours, the biggest spectacle ...",The criminal charges against the embattled may...,2024-09-27,0.027138,0.804501,0.168361,0.032231,0.662571,0.305199
12049,2024-09-27 09:02:56+00:00,https://www.nytimes.com/2024/09/27/technology/...,Hours before former President Donald J. Trump ...,Almost a third of 171 posts last week from the...,2024-09-27,0.018546,0.689842,0.291612,0.019478,0.705193,0.275329


In [41]:
# Checkpoint the per-article scores; index=False leaves the row index out of the CSV.
scored_articles_path = '../data/nyt_snp_headlines_temp.csv'
df.to_csv(scored_articles_path, index=False)

2. Group by date

In [44]:
# NOTE(review): this path is not the file written by the preceding save cell
# ('../data/nyt_snp_headlines_temp.csv'); presumably the temp file is renamed or
# merged by hand before this step — confirm, since a fresh run depends on it.
scored_articles_csv = '../data/nyt_snp_headlines_with_sentiment.csv'
new_df = pd.read_csv(scored_articles_csv)

In [45]:
# Daily aggregation: average every sentiment score over all articles that share
# an adjusted_date.
polarities = ('pos', 'neg', 'neutral')

# Per-article column -> daily-mean column; plain scores first, then the
# preamble variant, which fixes the output column order.
column_rename = {f'{p}_sentiment': f'mean_{p}_sentiment' for p in polarities}
column_rename.update(
    {f'{p}_sentiment_w_preamb': f'mean_{p}_preamble_sentiment' for p in polarities}
)
agg_func = dict.fromkeys(column_rename, 'mean')

# NOTE(review): the recorded output of this cell ends with a group whose key is
# the literal text 'adjusted_date' — presumably a stray header row inside the
# input CSV; confirm and clean the input if so.
daily_means = new_df.groupby(by='adjusted_date').agg(agg_func)
grouped_by_date_df = daily_means.rename(columns=column_rename).reset_index()
grouped_by_date_df.tail()

Unnamed: 0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment,mean_pos_preamble_sentiment,mean_neg_preamble_sentiment,mean_neutral_preamble_sentiment
1493,2024-09-25,0.159386,0.285289,0.555325,0.112141,0.281216,0.606643
1494,2024-09-26,0.296828,0.350619,0.352554,0.26093,0.321001,0.418069
1495,2024-09-27,0.192744,0.503622,0.303634,0.209407,0.408375,0.382218
1496,2024-09-28,0.225603,0.226435,0.547962,0.20913,0.203006,0.587863
1497,adjusted_date,0.019455,0.067139,0.913406,0.017278,0.06523,0.917491


In [46]:
# Persist the daily means; index=False leaves the row index out of the CSV.
daily_sentiment_path = '../data/nyt_sentiment.csv'
grouped_by_date_df.to_csv(daily_sentiment_path, index=False)

## Merge the sentiment DataFrames into one

In [48]:
# Load both daily sentiment tables. Only the NYT file is written by this
# notebook; the Tesla file is presumably produced elsewhere — verify.
nyt_sentiment, tesla_sentiment = (
    pd.read_csv(f'../data/{source}_sentiment.csv') for source in ('nyt', 'tesla')
)


In [50]:
# Tag every NYT row with its source so it stays identifiable after the merge.
nyt_sentiment = nyt_sentiment.assign(News='Market News')

In [61]:
# Inspect the daily NYT sentiment table, including its News source label.
nyt_sentiment

Unnamed: 0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment,mean_pos_preamble_sentiment,mean_neg_preamble_sentiment,mean_neutral_preamble_sentiment,News
0,2019-09-30,0.056308,0.762604,0.181088,0.127192,0.641141,0.231667,Market News
1,2019-10-01,0.084752,0.525347,0.389900,0.069554,0.531837,0.398609,Market News
2,2019-10-02,0.064854,0.476130,0.459017,0.053761,0.438076,0.508164,Market News
3,2019-10-03,0.267430,0.271210,0.461360,0.250733,0.239351,0.509915,Market News
4,2019-10-04,0.132889,0.533843,0.333267,0.070609,0.499317,0.430075,Market News
...,...,...,...,...,...,...,...,...
1493,2024-09-25,0.159386,0.285289,0.555325,0.112141,0.281216,0.606643,Market News
1494,2024-09-26,0.296828,0.350619,0.352554,0.260930,0.321001,0.418069,Market News
1495,2024-09-27,0.192744,0.503622,0.303634,0.209407,0.408375,0.382218,Market News
1496,2024-09-28,0.225603,0.226435,0.547962,0.209130,0.203006,0.587863,Market News


In [62]:
# Remove rows whose adjusted_date is not a real YYYY-MM-DD date (the grouped
# output contains a row keyed by the literal text 'adjusted_date').
# Filtering on the value instead of slicing off the last row is idempotent:
# re-running this cell can never delete a valid day.
has_valid_date = pd.to_datetime(
    nyt_sentiment['adjusted_date'], format='%Y-%m-%d', errors='coerce'
).notna()
nyt_sentiment = nyt_sentiment[has_valid_date].reset_index(drop=True)

In [51]:
# Tag every Tesla row with its source so it stays identifiable after the merge.
tesla_sentiment = tesla_sentiment.assign(News='Tesla')

In [63]:
# Inspect the daily Tesla sentiment table, including its News source label.
tesla_sentiment

Unnamed: 0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment,mean_pos_preamble_sentiment,mean_neg_preamble_sentiment,mean_neutral_preamble_sentiment,News
0,1/1/24,0.932750,0.018450,0.048801,0.883637,0.019422,0.096940,Tesla
1,1/10/21,0.007312,0.974635,0.018053,0.007583,0.970156,0.022261,Tesla
2,1/11/23,0.260249,0.570400,0.169351,0.224915,0.534778,0.240307,Tesla
3,1/12/20,0.068412,0.030966,0.900621,0.037842,0.027356,0.934802,Tesla
4,1/12/23,0.008780,0.971812,0.019408,0.012819,0.961352,0.025829,Tesla
...,...,...,...,...,...,...,...,...
857,9/8/22,0.061572,0.221923,0.716505,0.070154,0.102857,0.826989,Tesla
858,9/9/20,0.064603,0.097978,0.837419,0.056757,0.021830,0.921414,Tesla
859,9/9/21,0.075175,0.014510,0.910315,0.087085,0.011177,0.901738,Tesla
860,9/9/22,0.763597,0.009088,0.227315,0.504756,0.011038,0.484206,Tesla


In [65]:
# Rewrite the Tesla dates as YYYY-MM-DD strings so they match the NYT table.
# The source strings are parsed as day/month/two-digit-year.
# NOTE(review): confirm the Tesla export really is day-first; values whose first
# two parts are both <= 12 would parse without error either way.
tesla_dates = pd.to_datetime(tesla_sentiment['adjusted_date'], format="%d/%m/%y")
tesla_sentiment['adjusted_date'] = tesla_dates.dt.strftime("%Y-%m-%d")


In [67]:
# Stack the Tesla rows on top of the NYT rows and renumber the index from zero.
sentiment_frames = [tesla_sentiment, nyt_sentiment]
union_df = pd.concat(sentiment_frames, ignore_index=True)