In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show full cell contents (no column-width truncation) when rendering DataFrames.
pd.options.display.max_colwidth = None

In [3]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)



In [4]:
# Minimal one-word input used to smoke-test the model below.
test_string = "Stocks"

In [5]:
# Tokenize the sample text into PyTorch tensors and run it through the classifier;
# the first element of the model output holds the raw class logits.
inputs = tokenizer(test_string, padding=True, return_tensors='pt')
logits = model(**inputs)[0]

# Normalise the logits over the class axis so they sum to 1.
probabilities = logits.softmax(dim=-1)

# NOTE(review): class order is assumed to be positive, negative, neutral --
# confirm against model.config.id2label.
flat_probabilities = probabilities.detach().numpy().reshape(3)
print(flat_probabilities)

[0.04868224 0.05188207 0.8994357 ]


In [6]:
def generate_sentiment_values(text: str) -> np.ndarray:
    """Return the softmax class probabilities the notebook-level model assigns to ``text``.

    Uses the ``tokenizer`` and ``model`` objects defined earlier in the notebook.

    Args:
        text: A single piece of text to classify. Text that tokenizes to more
            positions than the model supports is truncated rather than raising.

    Returns:
        A NumPy array of shape ``(3,)`` with one probability per output class.
        NOTE(review): the order is assumed to be positive, negative, neutral --
        confirm against ``model.config.id2label``.
    """
    # Cap the sequence at the model's position-embedding limit; without truncation
    # an over-long paragraph makes the forward pass fail with an indexing error.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**inputs)[0]

    # Convert logits to softmax probabilities over the class axis.
    probabilities = nn.functional.softmax(logits, dim=-1)

    # A single input yields a (1, 3) tensor; flatten it to a length-3 vector.
    return probabilities.detach().numpy().reshape((3,))

### FinBERT on NYTimes news

In [7]:
# Read the NYT headline export and preview its first rows.
nyt_headlines_path = '../data/nyt_headlines_2.csv'
df = pd.read_csv(nyt_headlines_path)
df.head(n=5)

Unnamed: 0,timestamp,article_url,lead_paragraph,abstract,adjusted_date
0,2023-08-18 16:14:19+00:00,https://www.nytimes.com/interactive/2023/08/18/business/irs-cash-influx-staff.html,"New funding has helped the I.R.S. increase staffing, but the agency faces an uncertain future.","New funding has helped the I.R.S. increase staffing, but the agency faces an uncertain future.",2023-08-19
1,2023-08-18 11:55:07+00:00,https://www.nytimes.com/2023/08/18/business/interest-rates-markets-tech-slump.html,"In the span of a month, the bottom has dropped out of the bull-market rally as investors have come to grips with the prospect of “higher for longer” interest rates worldwide. The sell-off in global stocks and bonds picked up steam on Thursday. And weary market watchers will be looking for more hints on the Fed’s view at next week’s Jackson Hole summit of central bankers and policymakers.","Apple, Nvidia, Tesla and Microsoft are among the giants that have tumbled by at least 10 percent in the past month after powering a market rally earlier this year.",2023-08-18
2,2023-08-18 09:00:02+00:00,https://www.nytimes.com/interactive/2023/08/18/business/shoptalk-peak-china.html,The concept that China has reached the pinnacle of its economic power is hotly debated.,The concept that China has reached the pinnacle of its economic power is hotly debated.,2023-08-18
3,2023-08-18 08:06:58+00:00,https://www.nytimes.com/2023/08/18/business/hong-kong-stocks-bear-market.html,"Stocks in Hong Kong entered a bear market on Friday, down 21 percent from their high near the start of the year, as investors around the world grew increasingly worried that the deteriorating condition of China’s real estate sector could spill over into the broader economy.","The Hang Seng Index has fallen more than 20 percent from its recent high, as investors grow more pessimistic about China’s post-pandemic recovery.",2023-08-18
4,2023-08-18 04:01:21+00:00,https://www.nytimes.com/2023/08/18/business/indonesia-nickel-china-us.html,"He is known as the Minister for Everything. From the government offices of Indonesia’s capital to dusty mines on remote islands, Luhut Binsar Pandjaitan commands authority as the nation’s essential power broker.",The fate of Indonesia’s unrivaled stocks of nickel — a critical mineral used to make batteries for electric vehicles — is caught in the conflict between the United States and China.,2023-08-18


In [8]:
# Score every article and store the three class probabilities as new columns on df.
sentiment_columns = ['pos_sentiment', 'neg_sentiment', 'neutral_sentiment']

# Replace missing values with '' before concatenating: str(NaN) would otherwise
# inject the literal word "nan" into the text handed to the model.
combined_text = (
    df['lead_paragraph'].fillna('').astype(str)
    + ' '
    + df['abstract'].fillna('').astype(str)
).str.strip()

# One row of scores per article; the reshape keeps a (0, 3) shape when df is empty.
sentiment_scores = np.array(
    [generate_sentiment_values(text) for text in combined_text],
    dtype='float64',
).reshape(-1, len(sentiment_columns))

# Column position i of the score matrix corresponds to sentiment_columns[i].
for position, column in enumerate(sentiment_columns):
    df[column] = sentiment_scores[:, position]

In [9]:
# Preview the first rows, now including the sentiment columns.
df.head(n=5)

Unnamed: 0,timestamp,article_url,lead_paragraph,abstract,adjusted_date,pos_sentiment,neg_sentiment,neutral_sentiment
0,2023-08-18 16:14:19+00:00,https://www.nytimes.com/interactive/2023/08/18/business/irs-cash-influx-staff.html,"New funding has helped the I.R.S. increase staffing, but the agency faces an uncertain future.","New funding has helped the I.R.S. increase staffing, but the agency faces an uncertain future.",2023-08-19,0.747801,0.153354,0.098845
1,2023-08-18 11:55:07+00:00,https://www.nytimes.com/2023/08/18/business/interest-rates-markets-tech-slump.html,"In the span of a month, the bottom has dropped out of the bull-market rally as investors have come to grips with the prospect of “higher for longer” interest rates worldwide. The sell-off in global stocks and bonds picked up steam on Thursday. And weary market watchers will be looking for more hints on the Fed’s view at next week’s Jackson Hole summit of central bankers and policymakers.","Apple, Nvidia, Tesla and Microsoft are among the giants that have tumbled by at least 10 percent in the past month after powering a market rally earlier this year.",2023-08-18,0.018421,0.950249,0.03133
2,2023-08-18 09:00:02+00:00,https://www.nytimes.com/interactive/2023/08/18/business/shoptalk-peak-china.html,The concept that China has reached the pinnacle of its economic power is hotly debated.,The concept that China has reached the pinnacle of its economic power is hotly debated.,2023-08-18,0.04368,0.099582,0.856738
3,2023-08-18 08:06:58+00:00,https://www.nytimes.com/2023/08/18/business/hong-kong-stocks-bear-market.html,"Stocks in Hong Kong entered a bear market on Friday, down 21 percent from their high near the start of the year, as investors around the world grew increasingly worried that the deteriorating condition of China’s real estate sector could spill over into the broader economy.","The Hang Seng Index has fallen more than 20 percent from its recent high, as investors grow more pessimistic about China’s post-pandemic recovery.",2023-08-18,0.007548,0.974413,0.018039
4,2023-08-18 04:01:21+00:00,https://www.nytimes.com/2023/08/18/business/indonesia-nickel-china-us.html,"He is known as the Minister for Everything. From the government offices of Indonesia’s capital to dusty mines on remote islands, Luhut Binsar Pandjaitan commands authority as the nation’s essential power broker.",The fate of Indonesia’s unrivaled stocks of nickel — a critical mineral used to make batteries for electric vehicles — is caught in the conflict between the United States and China.,2023-08-18,0.054345,0.137765,0.807891


In [10]:
# Persist the per-article sentiment scores; the row index is not written.
article_sentiment_path = '../data/nyt_sentiment_2.csv'
df.to_csv(article_sentiment_path, index=False)

In [12]:
# Average each sentiment probability over all articles sharing an adjusted_date.
score_columns = ['pos_sentiment', 'neg_sentiment', 'neutral_sentiment']

# Every score column is aggregated with its mean and renamed with a "mean_" prefix.
agg_func = {column: 'mean' for column in score_columns}
column_rename = {column: 'mean_' + column for column in score_columns}

daily_means = df.groupby(by='adjusted_date').agg(agg_func)
daily_means = daily_means.rename(columns=column_rename)

# Move adjusted_date from the index back into a regular column.
grouped_by_date_df = daily_means.reset_index()
grouped_by_date_df.head()

Unnamed: 0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment
0,2023-08-18,0.029191,0.620925,0.349884
1,2023-08-19,0.747801,0.153354,0.098845
2,2023-08-21,0.125991,0.345267,0.528741
3,2023-08-22,0.297633,0.321896,0.38047
4,2023-08-23,0.049287,0.607605,0.343108


In [14]:
# Persist the per-date mean sentiment scores; the row index is not written.
grouped_sentiment_path = '../data/nyt_sentiment_2_grouped.csv'
grouped_by_date_df.to_csv(grouped_sentiment_path, index=False)