CORRELATION ANALYSIS FOR THE NEWS AND STOCK DATASETS
FOR ALL STOCK DATASETS

In [2]:
# Data Manipulation
import pandas as pd  
import numpy as np  

# Data Visualization
import matplotlib.pyplot as plt  
import seaborn as sns            # For advanced and aesthetically pleasing visualizations

# Text Analysis (Sentiment and NLP)
from nltk.sentiment import SentimentIntensityAnalyzer  
from textblob import TextBlob                          
from wordcloud import WordCloud                       

# Time-Series Analysis
import datetime as dt  # For handling date and time-related operations

# Machine Learning (if needed for advanced analysis later)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  
from sklearn.decomposition import LatentDirichletAllocation                   


In [3]:
def Data_merger(df1_path, df2_path, ticker: str):
    '''Load news headlines and price history for one ticker and join them on calendar date.

    The news rows are filtered down to ``ticker`` and inner-joined with the
    price rows on ``date`` only (the price file is taken to belong to
    ``ticker``; it is not filtered by symbol here). Each merged row is then
    given a TextBlob polarity score and a negative / neutral / positive label.

    Parameters
    ----------
    df1_path : path of the news CSV. The columns ``date``, ``stock`` and
        ``headline`` are read; an ``Unnamed: 0`` column is dropped if present.
    df2_path : path of the price CSV. The columns ``Date`` and ``Close`` are read.
    ticker : stock symbol to keep from the news file. Symbols outside the
        fixed list below always produce an empty result.

    Returns
    -------
    pandas.DataFrame
        One row per headline whose date also occurs in the price file, with
        the price columns plus ``Daily_Return``, ``sentiment_score`` and
        ``sentiment_category``.

    Raises
    ------
    KeyError
        If the merged frame has no ``headline`` column.
    '''

    # --- News headlines ---
    news = pd.read_csv(df1_path)
    # utc=True converts every timestamp to UTC before it is truncated to a date.
    # NOTE(review): headlines published late in the local day can therefore land
    # on the following UTC date - confirm this is the intended alignment.
    news['date'] = pd.to_datetime(news['date'], format='ISO8601', utc=True).dt.date
    news = news.drop(columns=["Unnamed: 0"], errors='ignore')  # Drop if exists

    # --- Price history ---
    prices = pd.read_csv(df2_path)
    prices['Date'] = pd.to_datetime(prices['Date'], format='ISO8601', utc=True).dt.date
    prices = prices.rename(columns={'Date': 'date'})
    # pct_change() compares each row with the one physically above it, so the
    # rows must be in chronological order for the result to be a daily return.
    prices = prices.sort_values('date', kind='stable').reset_index(drop=True)
    prices['Daily_Return'] = prices['Close'].pct_change()

    # Keep only headlines for the requested ticker, restricted to the supported set
    selected_categories = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'NVDA', 'AMZN', 'META']
    is_supported = news['stock'].isin(selected_categories)
    ticker_news = news[is_supported & (news['stock'] == ticker)]

    # Inner join: headlines on dates without a price row are dropped
    Merged_df = pd.merge(ticker_news, prices, on='date', how='inner')

    if 'headline' not in Merged_df.columns:
        raise KeyError("'headline' column is missing in the merged DataFrame.")

    def calculate_sentiment(text):
        # Missing headlines are read from CSV as float NaN, which TextBlob
        # rejects; score them as NaN so they drop out of later correlations.
        if not isinstance(text, str):
            return float('nan')
        return TextBlob(text).sentiment.polarity

    # Cast to float so the column is numeric even when the merge is empty
    Merged_df['sentiment_score'] = (
        Merged_df['headline'].apply(calculate_sentiment).astype(float)
    )

    # Bins are right-closed; include_lowest=True makes the first bin closed on
    # the left as well, so a polarity of exactly -1.0 is labelled 'negative'
    # instead of falling outside every bin.
    Merged_df['sentiment_category'] = pd.cut(
        Merged_df['sentiment_score'],
        bins=[-1, -0.1, 0.1, 1],
        labels=['negative', 'neutral', 'positive'],
        include_lowest=True
    )

    return Merged_df


CORRELATION CALCULATION FOR ALL DATASETS

In [22]:
# Root folder holding the downloaded CSV files. Every path below is derived
# from it, so relocating the data means editing this one line.
DATA_DIR = "C:/Users/ibsan/Desktop/TenX/week-1/Data"

# Ticker -> filename stem of its price CSV (the GOOGL prices are stored in the
# file named "GOOG_historical_data.csv").
PRICE_FILE_STEMS = {
    'AAPL': 'AAPL',
    'AMZN': 'AMZN',
    'GOOGL': 'GOOG',
    'META': 'META',
    'MSFT': 'MSFT',
    'NVDA': 'NVDA',
    'TSLA': 'TSLA',
}

# Dictionary of paths, keyed as '<TICKER>_PATH'
Path_Dict = {
    f"{symbol}_PATH": f"{DATA_DIR}/yfinance_data/{stem}_historical_data.csv"
    for symbol, stem in PRICE_FILE_STEMS.items()
}

# List of tickers
tickers_list = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'NVDA', 'AMZN', 'META']

# Path to raw analyst ratings (the news headlines)
stock_path = f"{DATA_DIR}/raw_analyst_ratings.csv/raw_analyst_ratings.csv"

# Merge news and prices for each ticker, then report how sentiment correlates
# with the daily return and with the closing price.
for ticker in tickers_list:
    df2_path = Path_Dict[f"{ticker}_PATH"]  # Fetch correct file path

    # Call the Data_merger function
    temps_df = Data_merger(stock_path, df2_path, ticker=ticker)

    # A correlation needs at least two paired observations; say so explicitly
    # instead of leaving an unexplained nan in the output.
    if len(temps_df) < 2:
        print(f"Note: only {len(temps_df)} merged news/price rows for {ticker}; correlations below are undefined (nan).")

    # Calculate correlations
    corr_sent_score_and_daily_return = temps_df['sentiment_score'].corr(temps_df['Daily_Return'])
    corr_sent_score_and_closing_price = temps_df['sentiment_score'].corr(temps_df['Close'])

    # Print results
    print(f"Correlation between sentiment score and daily stock returns of {ticker}: {corr_sent_score_and_daily_return}")
    print(f"Correlation between sentiment score and closing price of {ticker}: {corr_sent_score_and_closing_price}")


Correlation between sentiment score and daily stock returns of AAPL: 0.06662652819832401
Correlation between sentiment score and closing price of AAPL: 0.05496089526508538
Correlation between sentiment score and daily stock returns of GOOGL: 0.02707079024818816
Correlation between sentiment score and closing price of GOOGL: -0.009027053700049099
Correlation between sentiment score and daily stock returns of MSFT: nan
Correlation between sentiment score and closing price of MSFT: nan
Correlation between sentiment score and daily stock returns of TSLA: 0.024454665434566786
Correlation between sentiment score and closing price of TSLA: -0.02632867605746522
Correlation between sentiment score and daily stock returns of NVDA: 0.08485831064829649
Correlation between sentiment score and closing price of NVDA: -0.011296696192418004
Correlation between sentiment score and daily stock returns of AMZN: 0.006157753052171696
Correlation between sentiment score and closing price of AMZN: 0.086167916