In [1]:
import pandas as pd
from textblob import TextBlob

In [14]:
stock_prices = pd.read_csv('/content/GOOG_historical_data.csv')
news = pd.read_csv('/content/raw_analyst_ratings.csv')

In [3]:
stock_prices = stock_prices.drop(columns=['Dividends','Stock Splits'])

In [4]:
stock_prices.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,2.490664,2.591785,2.390042,2.499133,2.496292,897427216
1,2004-08-20,2.51582,2.716817,2.503118,2.697639,2.694573,458857488
2,2004-08-23,2.758411,2.826406,2.71607,2.724787,2.72169,366857939
3,2004-08-24,2.770615,2.779581,2.579581,2.61196,2.608991,306396159
4,2004-08-25,2.614201,2.689918,2.587302,2.640104,2.637103,184645512


In [5]:
news.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


## **Date Alignment**

In [6]:
print("News dataset dates:")
print(news['date'].head())

print("\nStock prices dataset dates:")
print(stock_prices['Date'].head())

News dataset dates:
0    2020-06-05 10:30:54-04:00
1    2020-06-03 10:45:20-04:00
2    2020-05-26 04:30:07-04:00
3    2020-05-22 12:45:06-04:00
4    2020-05-22 11:38:59-04:00
Name: date, dtype: object

Stock prices dataset dates:
0    2004-08-19
1    2004-08-20
2    2004-08-23
3    2004-08-24
4    2004-08-25
Name: Date, dtype: object


To make sure that the dates in the news and stock price datasets are aligned, we must first normalize the formats. If the two datasets have different date formats (e.g., one includes timestamps and timezones while the other does not)
- Converting all date columns to a common format using pd.to_datetime.

- Extracting only the date component (removing time and timezone information)

In [19]:
#Removes timezone information from the datetime values
news['date'] = pd.to_datetime(news['date'].astype(str), errors='coerce').dt.tz_localize(None).dt.date

# Normalize the 'Date' column in the stock prices dataset
stock_prices['Date'] = pd.to_datetime(stock_prices['Date'], errors='coerce').dt.date

# Display results to confirm
news[['date']].head(), stock_prices[['Date']].head()

(         date
 0  2020-06-05
 1  2020-06-03
 2  2020-05-26
 3  2020-05-22
 4  2020-05-22,
          Date
 0  2004-08-19
 1  2004-08-20
 2  2004-08-23
 3  2004-08-24
 4  2004-08-25)

In [20]:
aligned_df = pd.merge(news, stock_prices, left_on='date', right_on='Date', how='inner')
aligned_df.head()


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,2020-06-05,70.658501,72.252502,70.300003,71.919502,71.837753,34698000,0.0,0.0
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,2020-06-03,71.915001,72.327599,71.488853,71.819,71.737366,25124000,0.0,0.0
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,2020-05-26,71.863503,72.050003,70.606499,70.850998,70.770462,41212000,0.0,0.0
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,2020-05-22,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,2020-05-22,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0


In [21]:
aligned_df = aligned_df.reset_index(drop=True)
aligned_df = aligned_df.drop(columns=['Unnamed: 0'], errors='ignore')
aligned_df.head()

Unnamed: 0,headline,url,publisher,date,stock,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,2020-06-05,70.658501,72.252502,70.300003,71.919502,71.837753,34698000,0.0,0.0
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,2020-06-03,71.915001,72.327599,71.488853,71.819,71.737366,25124000,0.0,0.0
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,2020-05-26,71.863503,72.050003,70.606499,70.850998,70.770462,41212000,0.0,0.0
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,2020-05-22,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,2020-05-22,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0


In [23]:
#drop one date column
aligned_df = aligned_df.drop(columns=['Date'])
aligned_df.head()

Unnamed: 0,headline,url,publisher,date,stock,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,70.658501,72.252502,70.300003,71.919502,71.837753,34698000,0.0,0.0
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,71.915001,72.327599,71.488853,71.819,71.737366,25124000,0.0,0.0
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,71.863503,72.050003,70.606499,70.850998,70.770462,41212000,0.0,0.0
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0


## **Sentiment Analysis**

In [24]:
aligned_df['sentiment'] = aligned_df['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [25]:
aligned_df.head()

Unnamed: 0,headline,url,publisher,date,stock,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,sentiment
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,70.658501,72.252502,70.300003,71.919502,71.837753,34698000,0.0,0.0,0.0
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,71.915001,72.327599,71.488853,71.819,71.737366,25124000,0.0,0.0,0.0
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,71.863503,72.050003,70.606499,70.850998,70.770462,41212000,0.0,0.0,0.0
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0,0.0
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0,0.0


In [26]:
def classify_sentiment(polarity):
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [27]:
aligned_df['sentiment_class'] = aligned_df['sentiment'].apply(classify_sentiment)

# Display the result
aligned_df[['headline', 'sentiment', 'sentiment_class']].head()

Unnamed: 0,headline,sentiment,sentiment_class
0,Stocks That Hit 52-Week Highs On Friday,0.0,Neutral
1,Stocks That Hit 52-Week Highs On Wednesday,0.0,Neutral
2,71 Biggest Movers From Friday,0.0,Neutral
3,46 Stocks Moving In Friday's Mid-Day Session,0.0,Neutral
4,B of A Securities Maintains Neutral on Agilent...,0.0,Neutral


The **polarity** value is a float within the range of [-1, 1]. Positive values indicate positive sentiment, negative values indicate negative sentiment, and values close to 0 indicate neutral sentiment.

-----

In [28]:
len(stock_prices)

5020

In [29]:
len(news)

1407328

In [30]:
len(aligned_df)

55230

In [31]:
news_date_counts = news.groupby('date').size()
multiple_articles_dates = news_date_counts[news_date_counts > 1]
print(multiple_articles_dates)

date
2011-04-28      2
2011-04-29      2
2011-05-02      9
2011-05-03      3
2011-05-05      3
             ... 
2020-06-07     25
2020-06-08    765
2020-06-09    804
2020-06-10    806
2020-06-11    544
Length: 2218, dtype: int64


In [32]:
aligned_date_counts = aligned_df.groupby('date').size()
multiple_articles_dates = aligned_date_counts[aligned_date_counts > 1]
print(multiple_articles_dates)

date
2011-04-28      2
2011-04-29      2
2011-05-02      9
2011-05-03      3
2011-05-05      3
             ... 
2020-06-05    932
2020-06-08    765
2020-06-09    804
2020-06-10    806
2020-06-11    544
Length: 2091, dtype: int64


----

## **Analysis**

The **daily return** represents the percentage change in the stock's closing price compared to the previous day's closing price.
Positive returns indicate a price increase, while negative returns indicate a price decrease.

In [34]:
aligned_df['Daily_Return'] = aligned_df['Close'].pct_change() * 100
print(aligned_df[['date', 'Close', 'Daily_Return']].head())

         date      Close  Daily_Return
0  2020-06-05  71.919502           NaN
1  2020-06-03  71.819000     -0.139742
2  2020-05-26  70.850998     -1.347836
3  2020-05-22  70.521004     -0.465758
4  2020-05-22  70.521004      0.000000


- **2004-08-20:** The stock's closing price is 2.697639. The daily return is 7.94%. This means that, compared to the previous day, the stock's price increased by 7.94% on 2004-08-20.

- **2004-08-24:** The stock's closing price is 2.611960. The daily return is -4.14%. This means the stock price decreased by 4.14% compared to 2004-08-23.



In [35]:
aligned_df.head()

Unnamed: 0,headline,url,publisher,date,stock,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,sentiment,sentiment_class,Daily_Return
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A,70.658501,72.252502,70.300003,71.919502,71.837753,34698000,0.0,0.0,0.0,Neutral,
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A,71.915001,72.327599,71.488853,71.819,71.737366,25124000,0.0,0.0,0.0,Neutral,-0.139742
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A,71.863503,72.050003,70.606499,70.850998,70.770462,41212000,0.0,0.0,0.0,Neutral,-1.347836
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0,0.0,Neutral,-0.465758
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A,69.835503,70.638,69.591499,70.521004,70.440842,26188000,0.0,0.0,0.0,Neutral,0.0


## **Correlation Analysis**

In [37]:
correlation = aligned_df[['sentiment', 'Daily_Return']].corr(method='pearson')
correlation

Unnamed: 0,sentiment,Daily_Return
sentiment,1.0,-0.001879
Daily_Return,-0.001879,1.0


## **Conclusion**
The correlation of -0.001879 between sentiment and stock returns indicates that there is almost no meaningful relationship between the sentiment scores and stock returns