In [22]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Load news data
def load_news_data(file_path):
    """Read the news CSV and normalise its ``date`` column.

    The ``date`` column is parsed element-by-element (``format='mixed'``) into
    UTC timestamps and then converted to the fixed-offset zone ``Etc/GMT+4``
    (i.e. UTC-4).

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain a ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``date`` as timezone-aware timestamps.
    """
    frame = pd.read_csv(file_path)
    # Parse to UTC first so rows with and without an offset end up comparable.
    parsed_utc = pd.to_datetime(frame['date'], format='mixed', utc=True)
    # POSIX-style zone name: 'Etc/GMT+4' means four hours *behind* UTC.
    frame['date'] = parsed_utc.dt.tz_convert('Etc/GMT+4')
    return frame

In [25]:
# Load stock data
def load_stock_data(folder_path):
    """Load every ``.csv`` file in a folder into a dict of price DataFrames.

    Each file is read with ``pd.read_csv``; its ``Date`` column is parsed to
    datetimes and becomes the index of the resulting frame.

    The dictionary key is the file name with only the trailing ``.csv``
    removed, so names that themselves contain dots (e.g. ``BRK.B.csv``) keep
    their full stem instead of being cut at the first dot.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (non-recursive). Files not ending in ``.csv`` are
        ignored.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Mapping of file stem to its DataFrame indexed by ``Date``, inserted in
        sorted file-name order.
    """
    suffix = '.csv'
    stock_data = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the dict
    # (and anything plotted from it) reproducible between runs and machines.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(suffix):
            continue
        # Strip only the extension; splitting on the first '.' would truncate
        # names that contain dots.
        stock_symbol = file[:-len(suffix)]
        df = pd.read_csv(os.path.join(folder_path, file))
        # Use a more flexible date parser
        df['Date'] = pd.to_datetime(df['Date'], format='mixed')
        stock_data[stock_symbol] = df.set_index('Date')
    return stock_data

In [26]:
# Perform sentiment analysis
def analyze_sentiment(headline):
    """Return the TextBlob polarity score of a headline.

    Parameters
    ----------
    headline : str
        Text to score. ``None`` or a float NaN (how pandas represents an empty
        CSV cell) is treated as "no headline".

    Returns
    -------
    float
        ``TextBlob(headline).sentiment.polarity`` for text input, or NaN when
        the headline is missing, so that one empty cell does not abort a whole
        ``Series.apply`` and the row can be dropped or skipped downstream.
    """
    # `x != x` is true only for NaN; checked on floats only so that other
    # unexpected types still reach TextBlob and fail loudly as before.
    if headline is None or (isinstance(headline, float) and headline != headline):
        return float('nan')
    return TextBlob(headline).sentiment.polarity

In [27]:
# Calculate daily stock returns
def calculate_daily_returns(stock_data):
    """Add a ``Daily_Return`` column to every price frame in ``stock_data``.

    The column is the row-over-row percentage change of ``Close``
    (``Series.pct_change``), so the first row of each frame is NaN.

    Parameters
    ----------
    stock_data : dict[str, pandas.DataFrame]
        Frames keyed by symbol; each must have a ``Close`` column.

    Returns
    -------
    dict[str, pandas.DataFrame]
        The same dict object that was passed in; its frames are modified in
        place.
    """
    for prices in stock_data.values():
        # Written straight onto the caller's frame (no copy is made).
        prices['Daily_Return'] = prices['Close'].pct_change()
    return stock_data

In [28]:
# Align news data with stock data
def align_data(news_data, stock_data):
    """Join per-day mean news sentiment onto each symbol's daily returns.

    For every symbol in ``stock_data`` the news rows whose ``stock`` column
    equals that symbol are averaged per calendar date, and the result is
    aligned against the symbol's ``Daily_Return`` by date. Days that lack
    either a return or a sentiment value are dropped.

    Neither ``news_data`` nor the frames inside ``stock_data`` are modified,
    so the function can be called repeatedly on the same inputs (e.g. when a
    notebook cell is re-run).

    Parameters
    ----------
    news_data : pandas.DataFrame
        Must contain ``stock``, ``date`` (datetime-like) and ``sentiment``
        columns.
    stock_data : dict[str, pandas.DataFrame]
        Frames keyed by symbol, each with a date-like index and a
        ``Daily_Return`` column.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Per symbol, a frame indexed by ``datetime.date`` with columns
        ``Daily_Return`` and ``Sentiment``.
    """
    aligned_data = {}
    for symbol, symbol_stock in stock_data.items():
        symbol_news = news_data.loc[news_data['stock'] == symbol]

        # Reduce both sides to plain calendar dates for alignment. These are
        # computed as new objects rather than assigned back, which would
        # rewrite the caller's index and break a second call.
        stock_days = pd.to_datetime(symbol_stock.index).date
        news_days = symbol_news['date'].dt.date

        aligned_df = pd.DataFrame(
            {'Daily_Return': symbol_stock['Daily_Return'].to_numpy()},
            index=stock_days,
        )

        # Mean sentiment of all headlines published on the same calendar date.
        daily_sentiment = symbol_news['sentiment'].groupby(news_days).mean()
        # Index-aligned assignment: dates with no headlines become NaN.
        aligned_df['Sentiment'] = daily_sentiment

        aligned_data[symbol] = aligned_df.dropna()

    return aligned_data

In [29]:
# Analyze correlation
def analyze_correlation(aligned_data):
    """Compute the return/sentiment correlation for each symbol.

    Parameters
    ----------
    aligned_data : dict[str, pandas.DataFrame]
        Frames keyed by symbol, each with ``Daily_Return`` and ``Sentiment``
        columns.

    Returns
    -------
    dict
        Symbol mapped to ``Series.corr`` of ``Daily_Return`` against
        ``Sentiment`` (pandas' default method).
    """
    return {
        ticker: frame['Daily_Return'].corr(frame['Sentiment'])
        for ticker, frame in aligned_data.items()
    }

In [30]:
# Visualize correlation
def visualize_correlation(correlations):
    """Show a bar chart of correlation coefficient per stock symbol.

    Parameters
    ----------
    correlations : dict
        Symbol mapped to its correlation value; keys become the x categories
        and values the bar heights.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    symbols = list(correlations.keys())
    coefficients = list(correlations.values())

    plt.figure(figsize=(10, 6))
    # barplot draws on the current figure and hands back the Axes it used.
    ax = sns.barplot(x=symbols, y=coefficients)
    ax.set_title('Correlation between News Sentiment and Stock Returns')
    ax.set_xlabel('Stock Symbol')
    ax.set_ylabel('Correlation Coefficient')
    plt.show()

In [23]:
def main(
    news_path='/home/kali/Desktop/git/Financial-News-and-Stock-Price-Integration/Data/raw_analyst_ratings.csv',
    stock_dir='/home/kali/Desktop/git/Financial-News-and-Stock-Price-Integration/Data/yfinance_data',
):
    """Run the full sentiment-vs-returns pipeline and report the results.

    Steps: load the news and stock CSVs, score each headline, compute daily
    returns, align the two by date, compute the per-symbol correlation, print
    it and plot it.

    Parameters
    ----------
    news_path : str or path-like, optional
        News CSV passed to ``load_news_data``. The default is the original
        author's machine-specific absolute path; pass your own location when
        running elsewhere.
    stock_dir : str or path-like, optional
        Folder of per-symbol CSVs passed to ``load_stock_data``. Same caveat
        about the default as ``news_path``.

    Returns
    -------
    None
    """
    # Load data
    news_data = load_news_data(news_path)
    stock_data = load_stock_data(stock_dir)

    # Perform sentiment analysis
    news_data['sentiment'] = news_data['headline'].apply(analyze_sentiment)

    # Calculate daily stock returns
    stock_data = calculate_daily_returns(stock_data)

    # Align news and stock data
    aligned_data = align_data(news_data, stock_data)

    # Analyze correlation
    correlations = analyze_correlation(aligned_data)

    # Print correlations
    for symbol, correlation in correlations.items():
        print(f"Correlation for {symbol}: {correlation:.4f}")

    # Visualize correlation
    visualize_correlation(correlations)

if __name__ == "__main__":
    main()