In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
#from textblob import TextBlob

In [2]:
# Load the existing stock data
dataNVDA = pd.read_csv('../data/NVDA_historical_data.csv', index_col='Date', parse_dates=True)

In [3]:
# Load the exosting news data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv', parse_dates=['date'])


In [4]:
# Convert to datetime and adjust to UTC
news_data['date'] = pd.to_datetime(news_data['date'], utc=True, format='mixed')
dataNVDA.index = pd.to_datetime(dataNVDA.index, utc=True)


# Extract just the date part
news_data['date'] = news_data['date'].dt.date
# To extract the date index
dates = dataNVDA.index
dataNVDA = dataNVDA.reset_index()

dataNVDA['Date'] = dataNVDA['Date'].dt.date

# Calculate daily percentage changes
dataNVDA['Daily_Return'] = dataNVDA['Close'].pct_change() * 100  # Multiply by 100 to get percentage


# Merge on the date
merged_data = pd.merge(news_data, dataNVDA, left_on='date', right_on='Date', how='left')

# Handle non-trading days by filling missing stock data with the closest trading day
merged_data = merged_data.ffill()  # Forward fill


In [5]:
print("\nMerged Data:")
print(merged_data.head())
print("Date range in merged data:")
print(merged_data['date'].min(), merged_data['date'].max())


Merged Data:
   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

         date stock        Date     Open     High      Low    Close  \
0  2020-06-05     A  2020-06-05  8.79450  8.99700  8.70

In [6]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()


In [8]:
# Perform sentiment analysis on headlines
def get_sentiment_score(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']  # Compound score

# Apply sentiment analysis
merged_data['sentiment_score'] = merged_data['headline'].apply(get_sentiment_score)

# Display the merged DataFrame with sentiment scores
print("Merged Data with Sentiment Scores:")
print(merged_data)

Merged Data with Sentiment Scores:
         Unnamed: 0                                           headline  \
0                 0            Stocks That Hit 52-Week Highs On Friday   
1                 1         Stocks That Hit 52-Week Highs On Wednesday   
2                 2                      71 Biggest Movers From Friday   
3                 3       46 Stocks Moving In Friday's Mid-Day Session   
4                 4  B of A Securities Maintains Neutral on Agilent...   
...             ...                                                ...   
1407323     1413844             Top Narrow Based Indexes For August 29   
1407324     1413845  Recap: Wednesday's Top Percentage Gainers and ...   
1407325     1413846  UPDATE: Oppenheimer Color on China Zenix Auto ...   
1407326     1413847  Oppenheimer Initiates China Zenix At Outperfor...   
1407327     1413848  China Zenix Auto International Opens For Tradi...   

                                                       url          publishe

In [9]:
# Aggregate sentiment scores by date (compute the average)
daily_sentiments = merged_data.groupby('date')['sentiment_score'].mean().reset_index()

# Display the daily sentiment scores
print(daily_sentiments)
print(daily_sentiments['date'].min(), daily_sentiments['date'].max())

            date  sentiment_score
0     2009-02-14         0.226300
1     2009-04-27         0.000000
2     2009-04-29         0.000000
3     2009-05-22         0.000000
4     2009-05-27         0.751050
...          ...              ...
3950  2020-06-07         0.040156
3951  2020-06-08         0.250061
3952  2020-06-09         0.283393
3953  2020-06-10         0.044021
3954  2020-06-11         0.122841

[3955 rows x 2 columns]
2009-02-14 2020-06-11


In [10]:
# Ensure stock data contains 'Date' and 'Daily_Return' columns
dataNVDA = dataNVDA.rename(columns={'Date': 'date'})
dataNVDA = dataNVDA[['date', 'Daily_Return']]  # Ensure only relevant columns are included

In [11]:
# Merge aggregated sentiment scores with stock data
merged_data = pd.merge(daily_sentiments, dataNVDA, on='date', how='left')

In [12]:
# Compute correlation between sentiment scores and daily returns
correlation = merged_data[['sentiment_score', 'Daily_Return']].corr().iloc[0, 1]

In [13]:
# Display results
print("Merged Data with Aggregated Sentiments and Daily Returns:")
print(merged_data)
print(f"\nCorrelation between Sentiment Scores and Daily Returns: {correlation:.2f}")

Merged Data with Aggregated Sentiments and Daily Returns:
            date  sentiment_score  Daily_Return
0     2009-02-14         0.226300           NaN
1     2009-04-27         0.000000     -3.587052
2     2009-04-29         0.000000     -1.538458
3     2009-05-22         0.000000      1.551188
4     2009-05-27         0.751050      0.193234
...          ...              ...           ...
3950  2020-06-07         0.040156           NaN
3951  2020-06-08         0.250061     -1.289235
3952  2020-06-09         0.283393      2.737072
3953  2020-06-10         0.044021      3.545769
3954  2020-06-11         0.122841     -6.090687

[3955 rows x 3 columns]

Correlation between Sentiment Scores and Daily Returns: 0.11


In [14]:
# Reset the index to make 'Date' a column and rename it to 'date'
daily_returns = dataNVDA.reset_index()[['date', 'Daily_Return']]
daily_returns.head()

Unnamed: 0,date,Daily_Return
0,1999-01-22,
1,1999-01-25,10.476398
2,1999-01-26,-7.759362
3,1999-01-27,-0.311031
4,1999-01-28,-0.314397


In [15]:
#Merge sentiment scores with stock returns
merged_data = pd.merge(daily_sentiments, daily_returns, on='date', how='inner')

# Calculate the Pearson correlation coefficient
correlation = merged_data[['sentiment_score', 'Daily_Return']].corr().iloc[0, 1]
print("Pearson correlation coefficient between average daily sentiment scores and stock daily returns:", correlation)

Pearson correlation coefficient between average daily sentiment scores and stock daily returns: 0.1145975858743781
