In [1]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Paths
# Input locations are relative to the notebook's working directory.
news_path = "../data/raw_analyst_ratings.csv"
stock_folder = "./"

# Output locations for figures and CSV artefacts; created up front so later
# cells can write into them without checking for existence.
plot_dir, out_dir = "plots/task3", "outputs/task3"
for directory in (plot_dir, out_dir):
    os.makedirs(directory, exist_ok=True)

In [3]:
# Load and clean news data
news_df = pd.read_csv(news_path, parse_dates=["date"])

# Parse the date column explicitly: format="mixed" parses each value on its
# own, errors="coerce" turns unparseable values into NaT, and utc=True puts
# every timestamp on a single UTC timeline.
utc_timestamps = pd.to_datetime(
    news_df["date"], format="mixed", errors="coerce", utc=True)

# Keep only the calendar date of the UTC timestamp.
# NOTE(review): timestamps carrying a non-UTC offset can roll over to the
# neighbouring day after the UTC conversion — confirm this matches the dates
# used in the price files.
news_df["date"] = utc_timestamps.dt.date

# Rows without a headline cannot be scored, so discard them.
news_df = news_df.dropna(subset=["headline"])


def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    The argument is converted with ``str()`` before analysis, so non-string
    values are scored on their string representation.
    """
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    return polarity

In [4]:
# Sentiment polarity
# Score every headline individually and store the result next to it.
headline_polarity = news_df["headline"].map(get_sentiment)
news_df["sentiment"] = headline_polarity

In [5]:
# Sentiment classes
def classify_sentiment(score, threshold=0.1):
    """Bucket a polarity score into "positive", "negative" or "neutral".

    Parameters
    ----------
    score : float
        Polarity value to classify.
    threshold : float, default 0.1
        Half-width of the neutral band. Scores strictly above ``threshold``
        are "positive", scores strictly below ``-threshold`` are "negative".
        The default reproduces the previously hard-coded cut-offs, so
        existing single-argument calls (e.g. ``Series.apply``) are unchanged.

    Returns
    -------
    str
        One of "positive", "negative" or "neutral".
    """
    if score > threshold:
        return "positive"
    if score < -threshold:
        return "negative"
    # Everything in the closed band [-threshold, threshold] is neutral. NaN
    # also ends up here because both comparisons above are False for it.
    return "neutral"


# Label each headline with its sentiment bucket.
news_df["sentiment_class"] = news_df["sentiment"].map(classify_sentiment)

# Mean polarity of all headlines sharing a date: one row per date with the
# columns "date" and "avg_sentiment".
daily_sentiment = (
    news_df.groupby("date")
    .agg(avg_sentiment=("sentiment", "mean"))
    .reset_index()
)

In [6]:
# Sentiment class distribution
# Count headlines per (date, class) pair, then pivot the classes into columns;
# date/class combinations that never occur are filled with 0.
per_day_class_counts = news_df.groupby(["date", "sentiment_class"]).size()
class_dist = per_day_class_counts.unstack(fill_value=0)
class_dist.to_csv(f"{out_dir}/sentiment_class_distribution.csv")

In [7]:
# Sentiment class distribution (preview)
# The preceding cell already builds class_dist and writes it to
# sentiment_class_distribution.csv. This cell used to repeat that groupby and
# file write verbatim; it now only shows the first rows of the existing table.
class_dist.head()

In [13]:
# Ticker files (update to relative path as needed)
# Maps each ticker symbol to the CSV holding its historical prices.
ticker_files = dict([
    ("AAPL", "../data/AAPL_historical_data.csv"),
    ("AMZN", "../data/AMZN_historical_data.csv"),
    ("GOOG", "../data/GOOG_historical_data.csv"),
    ("META", "../data/META_historical_data.csv"),
    ("MSFT", "../data/MSFT_historical_data.csv"),
    ("NVDA", "../data/NVDA_historical_data.csv"),
    ("TSLA", "../data/TSLA_historical_data.csv"),
])

# Convert the sentiment dates with pd.to_datetime so they can be merged
# against the datetime "date" column built from each price file.
daily_sentiment = daily_sentiment.assign(
    date=pd.to_datetime(daily_sentiment["date"]))

# One dict of lag correlations per ticker is appended to this list.
summary = list()

# Offsets, in calendar days, by which sentiment dates are moved forward before
# being matched with returns. Identical for every ticker, so defined once.
lags = [0, 1, 2]

# The lag-shifted sentiment frames do not depend on the ticker either; build
# them once instead of copying daily_sentiment for every ticker/lag pair.
# NOTE(review): the shift is in calendar days, so any sentiment row whose
# shifted date is absent from a price file finds no partner in the inner join
# and is left out of that lag's correlation — confirm a calendar-day lag
# (rather than a trading-day lag) is what is intended.
sentiment_dates = pd.to_datetime(daily_sentiment["date"])
lagged_sentiment = {
    lag: daily_sentiment.assign(
        date=sentiment_dates + pd.to_timedelta(lag, unit="D"))
    for lag in lags
}

for ticker, file in ticker_files.items():
    # Load prices, order them chronologically and derive the close-to-close
    # percentage change (the first row has no predecessor and is NaN).
    df = pd.read_csv(file, parse_dates=["Date"])
    df["date"] = pd.to_datetime(df["Date"])  # Ensure datetime64[ns]
    df = df.sort_values("date")
    df["daily_return"] = df["Close"].pct_change()
    returns = df[["date", "daily_return"]]

    # --- Lagged Correlations ---
    # For each lag, correlate average sentiment with the return recorded on
    # the shifted date; rows with a missing value on either side are dropped.
    lag_corrs = {}
    for lag, shifted in lagged_sentiment.items():
        merged_lagged = pd.merge(
            shifted, returns, on="date", how="inner").dropna()
        lag_corrs[f"lag_{lag}"] = merged_lagged["avg_sentiment"].corr(
            merged_lagged["daily_return"])

    # --- Rolling Correlation ---
    merged = pd.merge(
        daily_sentiment, returns, on="date", how="inner").dropna()

    # The window is 30 consecutive rows of the merged frame (dates that have
    # both sentiment and a return), which is not necessarily 30 calendar days.
    merged["rolling_corr"] = merged["avg_sentiment"].rolling(
        30).corr(merged["daily_return"])
    merged.to_csv(f"{out_dir}/{ticker}_advanced_correlation.csv", index=False)

    # Plot Rolling Correlation
    # Explicit figure/axes keep each ticker's plot isolated; the legend and
    # axis labels make the saved image readable on its own.
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(merged["date"], merged["rolling_corr"],
            label="30-day Rolling Correlation", color="purple")
    ax.axhline(0, linestyle="--", color="black")
    ax.set_title(f"{ticker} – Rolling Correlation (Sentiment vs. Return)")
    ax.set_xlabel("Date")
    ax.set_ylabel("Correlation")
    ax.legend()
    fig.tight_layout()
    fig.savefig(f"{plot_dir}/{ticker}_rolling_corr.png")
    plt.close(fig)  # release the figure so the loop does not accumulate them

    lag_corrs["Ticker"] = ticker
    summary.append(lag_corrs)

In [14]:
# Save summary
# One row per ticker, one column per lag plus the "Ticker" label.
lag_summary = pd.DataFrame(data=summary)
lag_summary_csv = f"{out_dir}/lagged_sentiment_correlation.csv"
lag_summary.to_csv(lag_summary_csv, index=False)

In [15]:
# Plot lagged correlations
# Reshape to long form (one row per ticker/lag pair) so seaborn can group the
# bars by ticker and colour them by lag.
lag_melted = pd.melt(
    lag_summary, id_vars="Ticker", var_name="Lag", value_name="Correlation")

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=lag_melted, x="Ticker",
            y="Correlation", hue="Lag", palette="Set2", ax=ax)
ax.set_title("Lagged Correlations (Sentiment → Return)")
fig.tight_layout()
fig.savefig(f"{plot_dir}/lagged_correlation_barplot.png")
plt.close(fig)

print("🎯 Advanced Task 3 complete with lag, rolling, and sentiment classification.")

🎯 Advanced Task 3 complete with lag, rolling, and sentiment classification.
