In [102]:
import pandas as pd
from textblob import TextBlob
import yfinance as yf
import numpy as np
from scipy.stats import pearsonr

In [103]:
# Load news headlines data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
stock_data= pd.read_csv('../data/all_stock_data2.csv')

In [104]:
print(news_data)

         Unnamed: 0                                           headline  \
0                 0            Stocks That Hit 52-Week Highs On Friday   
1                 1         Stocks That Hit 52-Week Highs On Wednesday   
2                 2                      71 Biggest Movers From Friday   
3                 3       46 Stocks Moving In Friday's Mid-Day Session   
4                 4  B of A Securities Maintains Neutral on Agilent...   
...             ...                                                ...   
1407323     1413844             Top Narrow Based Indexes For August 29   
1407324     1413845  Recap: Wednesday's Top Percentage Gainers and ...   
1407325     1413846  UPDATE: Oppenheimer Color on China Zenix Auto ...   
1407326     1413847  Oppenheimer Initiates China Zenix At Outperfor...   
1407327     1413848  China Zenix Auto International Opens For Tradi...   

                                                       url          publisher  \
0        https://www.benzinga.

In [106]:
print(stock_data)

               Date       Open       High        Low      Close  Adj Close  \
0        2013-03-05  33.310001  33.310001  33.310001  33.310001  30.201914   
1        2013-03-06  33.290001  33.290001  33.220001  33.220001  30.120306   
2        2013-03-07  33.410000  33.410000  32.970001  33.130001  30.038700   
3        2013-03-08  33.009998  33.250000  33.009998  33.250000  30.147512   
4        2013-03-11  33.330002  33.360001  32.959999  33.060001  29.975235   
...             ...        ...        ...        ...        ...        ...   
7454194  2020-05-29  24.350000  24.740000  23.809999  24.370001  24.370001   
7454195  2020-06-01  24.389999  25.270000  23.719999  24.690001  24.690001   
7454196  2020-06-02  25.100000  26.520000  24.660000  26.250000  26.250000   
7454197  2020-06-03  26.760000  27.809999  26.049999  27.690001  27.690001   
7454198  2020-06-04  27.530001  29.780001  27.530001  29.200001  29.200001   

           Volume Symbol  
0           300.0   AADR  
1        

In [112]:
# Assuming 'news_data' is your DataFrame containing the 'date' column
news_data['date'] = pd.to_datetime(news_data['date'])  # Convert to datetime object
news_data['date'] = news_data['date'].dt.strftime('%Y-%m-%d')  # Format date as '%Y-%m-%d'


In [113]:
print(news_data['date'])

0          2020-06-05
1          2020-06-03
2          2020-05-26
3          2020-05-22
4          2020-05-22
              ...    
1407323    2011-08-29
1407324    2011-06-22
1407325    2011-06-21
1407326    2011-06-21
1407327    2011-05-12
Name: date, Length: 1407328, dtype: object


In [114]:
stock_data['Date'] = pd.to_datetime(stock_data['Date'], format='ISO8601')


In [115]:
print(stock_data['Date'])

0         2013-03-05
1         2013-03-06
2         2013-03-07
3         2013-03-08
4         2013-03-11
             ...    
7454194   2020-05-29
7454195   2020-06-01
7454196   2020-06-02
7454197   2020-06-03
7454198   2020-06-04
Name: Date, Length: 7454199, dtype: datetime64[ns]


In [116]:
# Apply sentiment analysis to news headlines
news_data['sentiment'] = news_data['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [117]:
print(news_data['sentiment'])

0          0.00
1          0.00
2          0.00
3          0.00
4          0.00
           ... 
1407323    0.15
1407324    0.15
1407325    0.00
1407326    0.00
1407327    0.00
Name: sentiment, Length: 1407328, dtype: float64


In [118]:
#Calculate Stock Movements
# Compute daily percentage change in closing prices to represent stock movements
stock_data['Daily_Returns'] = stock_data['Close'].pct_change() * 100

In [119]:
# Correlation Analysis
# Aggregate sentiment scores if multiple articles appear on the same day
daily_sentiment = news_data.groupby('date')['sentiment'].mean().reset_index()

In [120]:
# Ensure both datasets have the same dates
merged_data = pd.concat([daily_sentiment, stock_data], axis=1)



In [121]:
print("Merged dataset:")
print(merged_data)


Merged dataset:
               date  sentiment       Date       Open       High        Low  \
0        2009-02-14   0.000000 2013-03-05  33.310001  33.310001  33.310001   
1        2009-04-27   0.000000 2013-03-06  33.290001  33.290001  33.220001   
2        2009-04-29   0.000000 2013-03-07  33.410000  33.410000  32.970001   
3        2009-05-22   0.000000 2013-03-08  33.009998  33.250000  33.009998   
4        2009-05-27   0.234091 2013-03-11  33.330002  33.360001  32.959999   
...             ...        ...        ...        ...        ...        ...   
7454194         NaN        NaN 2020-05-29  24.350000  24.740000  23.809999   
7454195         NaN        NaN 2020-06-01  24.389999  25.270000  23.719999   
7454196         NaN        NaN 2020-06-02  25.100000  26.520000  24.660000   
7454197         NaN        NaN 2020-06-03  26.760000  27.809999  26.049999   
7454198         NaN        NaN 2020-06-04  27.530001  29.780001  27.530001   

             Close  Adj Close    Volume Symbol 

In [122]:
# Check for missing values
missing_sentiment = merged_data['sentiment'].isnull().sum()
missing_returns = merged_data['Daily_Returns'].isnull().sum()
print("Missing values in sentiment:", missing_sentiment)
print("Missing values in Daily_Returns:", missing_returns)

# Check for infinite values
infinite_sentiment = np.isinf(merged_data['sentiment']).sum()
infinite_returns = np.isinf(merged_data['Daily_Returns']).sum()
print("Infinite values in sentiment:", infinite_sentiment)
print("Infinite values in Daily_Returns:", infinite_returns)


Missing values in sentiment: 7450244
Missing values in Daily_Returns: 1
Infinite values in sentiment: 0
Infinite values in Daily_Returns: 0


In [123]:
print("Merged dataset:")
print(merged_data)

Merged dataset:
               date  sentiment       Date       Open       High        Low  \
0        2009-02-14   0.000000 2013-03-05  33.310001  33.310001  33.310001   
1        2009-04-27   0.000000 2013-03-06  33.290001  33.290001  33.220001   
2        2009-04-29   0.000000 2013-03-07  33.410000  33.410000  32.970001   
3        2009-05-22   0.000000 2013-03-08  33.009998  33.250000  33.009998   
4        2009-05-27   0.234091 2013-03-11  33.330002  33.360001  32.959999   
...             ...        ...        ...        ...        ...        ...   
7454194         NaN        NaN 2020-05-29  24.350000  24.740000  23.809999   
7454195         NaN        NaN 2020-06-01  24.389999  25.270000  23.719999   
7454196         NaN        NaN 2020-06-02  25.100000  26.520000  24.660000   
7454197         NaN        NaN 2020-06-03  26.760000  27.809999  26.049999   
7454198         NaN        NaN 2020-06-04  27.530001  29.780001  27.530001   

             Close  Adj Close    Volume Symbol 

In [124]:
print(len(merged_data['sentiment']))
print(len(merged_data['Daily_Returns']))

7454199
7454199


In [128]:
import numpy as np
from scipy.stats import pearsonr

# Check for NaN or infinite values in the arrays
nan_mask = np.isnan(merged_data['sentiment']) | np.isnan(merged_data['Daily_Returns'])
inf_mask = np.isinf(merged_data['sentiment']) | np.isinf(merged_data['Daily_Returns'])

# Combine NaN and infinite masks
invalid_mask = nan_mask | inf_mask

# Remove invalid values from both arrays
clean_sentiment = merged_data['sentiment'][~invalid_mask]
clean_returns = merged_data['Daily_Returns'][~invalid_mask]

# Calculate Pearson correlation coefficient between clean sentiment scores and stock returns
correlation_coefficient, p_value = pearsonr(clean_sentiment, clean_returns)
print("Pearson correlation coefficient:", correlation_coefficient)
print("p-value:", p_value)


Pearson correlation coefficient: 0.015239236844272882
p-value: 0.33805839591930076
