In [34]:
# Import useful libraries 
import os 
import pandas as pd
import numpy as np 
from textblob import TextBlob 
os.chdir('../scripts/')
import utils as util


In [48]:
# Import Stock Data 
names = ["AAPL","AMZN","GOOG","META","MSFT","NVDA","TSLA"]
stock_data = pd.DataFrame()
for name in names: 
    # read data 
    data_path = f"../../data/week1/yfinance_data/{name}_historical_data.csv"
    curr_data = util.read_csv_file(data_path).get("data")
    curr_data["stock"] = name 

    # concatenate 
    stock_data = pd.concat([stock_data,curr_data],ignore_index=True)

In [49]:
# Import News Data 
data_path = "../../data/week1/raw_analyst_ratings.csv"
df = util.read_csv_file(data_path)
news_data = df.get("data")

### Normalize Dates 
- For both data normalize the dates 
- To normalize we have to change it date only with no time component

In [50]:
# Convert 'date' in news_data
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')  # Convert to datetime

# Check the data type and handle NaT values
if news_data['date'].isnull().any():
    print("There are NaT values in the 'date' column. Please check the original data.")
    # Optionally, you can drop NaT values or fill them
    # news_data = news_data.dropna(subset=['date'])  # Drop rows with NaT
    # news_data['date'] = news_data['date'].fillna(pd.Timestamp('some_default_date'))  # Fill with a default date

# Proceed only if the conversion was successful
if pd.api.types.is_datetime64_any_dtype(news_data['date']):
    news_data['date'] = news_data['date'].dt.tz_convert('UTC')  # Remove timezone
    news_data['date'] = news_data['date'].dt.date  # Extract only the date
else:
    print("The 'date' column is not in datetime format. Please check the conversion.")

The 'date' column is not in datetime format. Please check the conversion.


In [51]:
# Step 3: Sentiment Analysis on News Headlines
def analyze_sentiment(headline):
    """Analyzes the sentiment of a headline and returns a polarity score."""
    return TextBlob(headline).sentiment.polarity

In [None]:
# Step 4: Data Preparation
news_data['sentiment_score'] = news_data['headline'].apply(analyze_sentiment)

# Align the dates between news and stock data by merging on date and stock symbol
merged_data = pd.merge(news_data, stock_data, left_on=['date', 'stock'], right_on=['Date', 'stock'], how='inner')

In [None]:
# Step 5: Calculate Stock Movements
# Calculate daily returns as the percentage change in 'Close' prices
stock_data['daily_return'] = stock_data['Close'].pct_change()

In [None]:
# Step 6: Aggregate Sentiments
# Calculate the average daily sentiment score for each stock and each date
average_sentiment = news_data.groupby(['date', 'stock'])['sentiment_score'].mean().reset_index()

In [None]:
# Merge the aggregated sentiment data with stock data
final_merged_data = pd.merge(stock_data, average_sentiment, left_on=['Date', 'stock'], right_on=['date', 'stock'], how='left')

In [None]:
# Step 7: Correlation Analysis
# Drop rows with missing values to ensure clean correlation analysis
clean_data = final_merged_data.dropna(subset=['daily_return', 'sentiment_score'])

In [None]:
# Calculate the Pearson correlation coefficient between daily returns and sentiment scores
correlation = clean_data['daily_return'].corr(clean_data['sentiment_score'])
print(f"The Pearson correlation coefficient between daily stock returns and sentiment scores is: {correlation}")