In [79]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error as mse
from datetime import datetime
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
import xgboost as xgb
from sklearn.metrics import f1_score, roc_auc_score

In [3]:
def balanced_weighted_average(sentiment_scores, decay_factor):
    """
    Calculate a balanced weighted average of sentiment scores without biasing towards negative values.
    
    Parameters:
    - sentiment_scores: A list or pandas Series of sentiment scores (positive and negative).
    - decay_factor: A value between 0 and 1 to control the decay rate of weights; defaults to 0.95.
    
    Returns:
    - A single balanced weighted average score.
    """
    # Initialize positive and negative scores with respective weights
    positive_scores = sentiment_scores[sentiment_scores > 0]
    negative_scores = sentiment_scores[sentiment_scores < 0]
    
    # Calculate decay weights for each score in reverse order (older scores get smaller weights)
    decay_weights = decay_factor ** np.arange(len(sentiment_scores))[::-1]

    # Separate weights for positive and negative scores
    pos_weights = decay_weights[:len(positive_scores)]
    neg_weights = decay_weights[:len(negative_scores)]

    # Calculate the weighted average for positive and negative scores separately
    pos_weighted_avg = (positive_scores * pos_weights).sum() / pos_weights.sum() if len(pos_weights) > 0 else 0
    neg_weighted_avg = (negative_scores * neg_weights).sum() / neg_weights.sum() if len(neg_weights) > 0 else 0

    # Return the balanced average by combining positive and negative averages equally
    balanced_avg = (pos_weighted_avg + neg_weighted_avg) / 2

    return balanced_avg

In [123]:
def compute_stock_sentiment_changes(stock_symbols, interval_days, decay_factor, start_date, end_date, include_news_sentiment):
    """
    Computes stock sentiment and related metrics, with an option to include/exclude news sentiment scores.

    Parameters:
    - stock_symbols: List of stock symbols to process.
    - interval_days: The interval of days over which metrics are calculated.
    - decay_factor: Decay factor for sentiment weighting.
    - start_date: Start date for data filtering.
    - end_date: End date for data filtering.
    - include_news_sentiment: Boolean indicating whether to include news sentiment scores.

    Returns:
    - Dictionary containing results DataFrame for each stock symbol.
    """
    # Initialize dictionaries to hold data
    df_stock_news_sentiment_scores_dict = {}
    df_news_sentiment_scores_dict = {}
    df_stock_data_dict = {}
    results = {}

    # Ensure start_date and end_date are datetime.date objects
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d').date()
    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, '%Y-%m-%d').date()

    for symbol in stock_symbols:
        # Load data and convert date columns to datetime.date
        try:
            df_stock_news_sentiment_scores_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/Sentiment_scores/stock_news_sentiment_scores/stock_news_sentiment_analysis_results_{symbol}.csv')
            if include_news_sentiment:
                df_news_sentiment_scores_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/Sentiment_scores/news_sentiment_scores/2000-2024/sentiment_analysis_results_{symbol}.csv')
            df_stock_data_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/stocks data/stock_data_{symbol}.csv')
        except FileNotFoundError as e:
            print(f"Error loading data for {symbol}: {e}")
            continue

        # Rename columns to distinguish sentiment sources
        if include_news_sentiment:
            df_news_sentiment_scores_dict[symbol] = df_news_sentiment_scores_dict[symbol].rename(columns={'weighted compound sentiment score': 'weighted compound news sentiment score'})
        df_stock_news_sentiment_scores_dict[symbol] = df_stock_news_sentiment_scores_dict[symbol].rename(columns={'weighted compound sentiment score': 'weighted compound stock sentiment score'})

        # Convert 'Date' to datetime.date format
        dataframes = [df_stock_news_sentiment_scores_dict[symbol], df_stock_data_dict[symbol]]
        if include_news_sentiment:
            dataframes.append(df_news_sentiment_scores_dict[symbol])

        for df in dataframes:
            df['Date'] = pd.to_datetime(df['Date']).dt.date

        # Filter data within date range
        filtered_stock_sentiment = df_stock_news_sentiment_scores_dict[symbol][
            (df_stock_news_sentiment_scores_dict[symbol]['Date'] >= start_date) & 
            (df_stock_news_sentiment_scores_dict[symbol]['Date'] <= end_date)
        ]
        if include_news_sentiment:
            filtered_news_sentiment = df_news_sentiment_scores_dict[symbol][
                (df_news_sentiment_scores_dict[symbol]['Date'] >= start_date) & 
                (df_news_sentiment_scores_dict[symbol]['Date'] <= end_date)
            ]
        filtered_stock = df_stock_data_dict[symbol][
            (df_stock_data_dict[symbol]['Date'] >= start_date) & 
            (df_stock_data_dict[symbol]['Date'] <= end_date)
        ]

        # Merge data on 'Date'
        merged_data = filtered_stock_sentiment
        if include_news_sentiment:
            merged_data = pd.merge(merged_data, filtered_news_sentiment, on='Date', how='inner')
        merged_data = pd.merge(merged_data, filtered_stock, on='Date', how='inner')
        merged_data.sort_values(by='Date', inplace=True)

        # Check if data is sufficient
        if len(merged_data) < interval_days:
            print(f"Not enough data for {symbol} with interval_days = {interval_days}. Skipping.")
            continue

        # Initialize lists to store results
        price_diff_list = []
        balanced_avg_stock_sentiment_list = []
        balanced_avg_news_sentiment_list = [] if include_news_sentiment else None
        sma_list = []
        sma_diff_list = []
        date_list = []

        # Calculate metrics
        for i in range(len(merged_data) - interval_days + 1):
            date_d = merged_data['Date'].iloc[i + interval_days - 1]

            # Calculate price difference
            close_d = merged_data['Close'].iloc[i]
            close_d_T = merged_data['Close'].iloc[i + interval_days - 1]
            price_diff = close_d_T - close_d

            # Calculate SMA
            sma = merged_data['Close'].iloc[i:i + interval_days].mean()

            # Calculate balanced weighted averages for sentiments
            stock_sentiment_scores = merged_data['weighted compound stock sentiment score'].iloc[i:i + interval_days]
            balanced_avg_stock_sentiment = balanced_weighted_average(stock_sentiment_scores, decay_factor)

            if include_news_sentiment:
                news_sentiment_scores = merged_data['weighted compound news sentiment score'].iloc[i:i + interval_days]
                balanced_avg_news_sentiment = balanced_weighted_average(news_sentiment_scores, decay_factor)

            # Calculate SMA difference (classification)
            if len(sma_list) > 0:  # Ensure there is a previous SMA value to compare
                prev_sma = sma_list[-1]
                sma_diff = 0 if prev_sma > sma else 1
            else:
                sma_diff = np.nan  # For the first interval, there is no previous SMA to compare

            # Append results
            date_list.append(date_d)
            price_diff_list.append(price_diff)
            sma_list.append(sma)
            sma_diff_list.append(sma_diff)
            balanced_avg_stock_sentiment_list.append(balanced_avg_stock_sentiment)
            if include_news_sentiment:
                balanced_avg_news_sentiment_list.append(balanced_avg_news_sentiment)

        # Store results for the symbol
        results[symbol] = pd.DataFrame({
            'Date': date_list,
            f'{symbol}_Price_Diff_{interval_days}d': price_diff_list,
            f'{symbol}_SMA_{interval_days}d': sma_list,
            f'{symbol}_SMA_Diff_{interval_days}d': sma_diff_list,
            f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d': balanced_avg_stock_sentiment_list
        })

        if include_news_sentiment:
            results[symbol][f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'] = balanced_avg_news_sentiment_list

    return results

In [132]:
stock_symbols = ['GOOG', 'MSFT', 'NVDA','AMZN','AAPL']
interval_days = 14
decay_factor = 0.60
results = compute_stock_sentiment_changes(stock_symbols, interval_days, decay_factor, start_date='2011-05-16', end_date='2024-09-21', include_news_sentiment = True)

In [133]:
results['AAPL']

Unnamed: 0,Date,AAPL_Price_Diff_14d,AAPL_SMA_14d,AAPL_SMA_Diff_14d,AAPL_Balanced_Avg_Stock_Sentiment_14d,AAPL_Balanced_Avg_News_Sentiment_14d
0,2014-04-23,6.451735,15.218639,,0.082914,0.101592
1,2014-07-15,7.060843,16.007199,1.0,0.046834,0.077620
2,2014-10-01,5.655769,16.580228,1.0,0.122351,0.078132
3,2015-01-15,5.732059,17.112691,1.0,0.028113,0.078684
4,2015-07-20,12.186655,17.941331,1.0,0.028216,0.005379
...,...,...,...,...,...,...
132,2024-07-19,54.826340,192.633856,1.0,0.092503,-0.079817
133,2024-07-24,49.132828,196.138356,1.0,0.030154,-0.079631
134,2024-07-26,48.463730,199.606464,1.0,0.015207,-0.077080
135,2024-08-06,39.411926,202.302616,1.0,-0.024605,-0.084794


In [107]:
def fit_sentiments_vs_metric_as_mlr(stock_symbols, results, interval_days, metric):
    """
    Fits multilinear regression of sentiment scores against either SMA or price difference.

    Parameters:
    - stock_symbols: List of stock symbols to process.
    - results: Dictionary with stock data DataFrames.
    - interval_days: The interval of days over which metrics are calculated.
    - metric: Specify either 'sma' or 'price_diff' to choose which metric to plot.

    Returns:
    - Dictionary containing models and MSEs for each stock symbol.
    """
    # Initialize dictionaries to store linear models and MSEs
    mlr_model = {}
    mlr_mses = {}

    for symbol in stock_symbols:
        # Construct column names based on the actual interval_days and selected metric
        news_sentiment_col = f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
        stock_sentiment_col = f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
        if metric == 'sma':
            metric_col = f'{symbol}_SMA_{interval_days}d'
        elif metric == 'price_diff':
            metric_col = f'{symbol}_Price_Diff_{interval_days}d'
        else:
            print(f"Unknown metric '{metric}'. Choose 'sma' or 'price_diff'. Skipping.")
            continue

        # Check if the expected columns exist in the DataFrame for the current symbol
        if (
            symbol in results and 
            news_sentiment_col in results[symbol].columns and 
            stock_sentiment_col in results[symbol].columns and 
            metric_col in results[symbol].columns
        ):
            # Extract features (sentiment columns) and target (metric column)
            X = results[symbol][[news_sentiment_col, stock_sentiment_col]].values
            y = results[symbol][metric_col].values

            # Fit a multilinear regression model
            pipeline = Pipeline([
                ('scale', StandardScaler()),
                ('mlr', LinearRegression())
            ])
            pipeline.fit(X, y)
            y_pred = pipeline.predict(X)

            # Store the model and MSE
            mlr_model[symbol] = pipeline
            mlr_mses[symbol] = mse(y, y_pred)

            print(f"{symbol}: Model fitted. MSE = {mlr_mses[symbol]:.4f}")
        else:
            print(f"Required columns for {symbol} with interval {interval_days} days not found. Skipping.")

    return {'models': mlr_model, 'mse': mlr_mses}

In [108]:
fit_sentiments_vs_metric_as_mlr(stock_symbols, results, interval_days, 'sma')

Required columns for GOOG with interval 14 days not found. Skipping.
Required columns for MSFT with interval 14 days not found. Skipping.
Required columns for NVDA with interval 14 days not found. Skipping.
Required columns for AMZN with interval 14 days not found. Skipping.
Required columns for AAPL with interval 14 days not found. Skipping.


{'models': {}, 'mse': {}}

In [29]:
def run_multilinear_regression(results, stock_symbol, interval_days, metric):
    """
    Runs a multilinear regression of balanced average news and stock sentiment scores
    against either SMA or price difference.

    Parameters:
    - results: Dictionary with stock data DataFrames.
    - stock_symbol: The symbol of the stock to analyze.
    - interval_days: The interval of days over which metrics are calculated.
    - metric: Specify either 'sma' or 'price_diff' to choose which metric to analyze.
    """
    # Define column names based on interval_days and selected metric
    balanced_avg_news_sentiment_col = f'{stock_symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
    balanced_avg_stock_sentiment_col = f'{stock_symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
    if metric == 'sma':
        metric_col = f'{stock_symbol}_SMA_{interval_days}d'
    elif metric == 'price_diff':
        metric_col = f'{stock_symbol}_Price_Diff_{interval_days}d'
    else:
        print(f"Unknown metric '{metric}'. Choose 'sma' or 'price_diff'.")
        return

    # Check if the required columns exist in the DataFrame
    if stock_symbol in results and \
       balanced_avg_news_sentiment_col in results[stock_symbol].columns and \
       balanced_avg_stock_sentiment_col in results[stock_symbol].columns and \
       metric_col in results[stock_symbol].columns:
        
        # Extract predictor variables and the dependent variable
        X_news = np.array(results[stock_symbol][balanced_avg_news_sentiment_col])
        X_stock = np.array(results[stock_symbol][balanced_avg_stock_sentiment_col])
        y_stats = np.array(results[stock_symbol][metric_col])
        
        # Combine predictors into a single 2D array
        X_combined = np.column_stack((X_news, X_stock))
        
        # Add a constant to the predictors for the intercept
        X_with_const = sm.add_constant(X_combined)

        # Fit the OLS model
        model_ols = sm.OLS(y_stats, X_with_const)
        results_ols = model_ols.fit()

        # Print the summary to see coefficients and other statistics
        print(results_ols.summary())

        # Get the confidence intervals for the coefficients
        confidence_intervals = results_ols.conf_int(alpha=0.05)  # 95% CI by default
        print("Confidence intervals:\n", confidence_intervals)
    else:
        print(f"Required columns for {stock_symbol} with interval {interval_days} days and metric '{metric}' not found in results.")


In [30]:
# Define the parameters
stock_symbol = 'AAPL'
metric = 'sma'  

# Run the multilinear regression
run_multilinear_regression(results, stock_symbol, interval_days, metric)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.336
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     33.89
Date:                Sun, 17 Nov 2024   Prob (F-statistic):           1.23e-12
Time:                        22:45:40   Log-Likelihood:                -737.66
No. Observations:                 137   AIC:                             1481.
Df Residuals:                     134   BIC:                             1490.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        100.2348      5.400     18.561      0.0

In [128]:
def fit_sentiments_vs_metric_as_log_reg(stock_symbols, results, interval_days, include_news_sentiment):
    """
    Fits logistic regression and calculates accuracy and specificity with an option to include/exclude news sentiment scores.

    Parameters:
    - stock_symbols: List of stock symbols to process.
    - results: Dictionary with stock data DataFrames.
    - interval_days: The interval of days over which metrics are calculated.
    - include_news_sentiment: Boolean indicating whether to include news sentiment scores as features.

    Returns:
    - Dictionary containing models and confusion matrices for each stock symbol.
    """
    # Initialize dictionaries to store models, confusion matrices, and metrics
    model = {}
    metrics = {}

    for symbol in stock_symbols:
        # Construct column names
        stock_sentiment_col = f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
        news_sentiment_col = f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
        metric_col = f'{symbol}_SMA_Diff_{interval_days}d'

        # Check if required columns exist
        if symbol in results and stock_sentiment_col in results[symbol].columns and metric_col in results[symbol].columns:
            if include_news_sentiment and news_sentiment_col in results[symbol].columns:
                # Include both stock and news sentiment as features
                data = results[symbol][[news_sentiment_col, stock_sentiment_col, metric_col]].dropna()
                X = data[[news_sentiment_col, stock_sentiment_col]].values
            else:
                # Include only stock sentiment as features
                data = results[symbol][[stock_sentiment_col, metric_col]].dropna()
                X = data[[stock_sentiment_col]].values

            y = data[metric_col].values

            # Fit logistic regression
            pipeline = Pipeline([
                ('scale', StandardScaler()),
                ('log_reg', LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000))
            ])
            pipeline.fit(X, y)
            y_pred = pipeline.predict(X)

            # Compute confusion matrix
            tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

            # Calculate metrics
            accuracy = accuracy_score(y, y_pred)
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

            # Store results
            model[symbol] = pipeline
            metrics[symbol] = {
                "accuracy": accuracy,
                "specificity": specificity
            }

            # Print results
            print(f"{symbol}:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Specificity: {specificity:.4f}")
        else:
            print(f"Required columns for {symbol} with interval {interval_days} days not found. Skipping.")


In [134]:
fit_sentiments_vs_metric_as_log_reg(stock_symbols, results, interval_days, include_news_sentiment=True)

GOOG:
Accuracy: 0.8732
Specificity: 0.0000
MSFT:
Accuracy: 0.8323
Specificity: 0.0000
NVDA:
Accuracy: 0.9853
Specificity: 0.0000
AMZN:
Accuracy: 0.6926
Specificity: 0.0870
AAPL:
Accuracy: 0.7794
Specificity: 0.0000


In [130]:
def fit_sentiments_vs_metric_as_xgb(stock_symbols, results, interval_days, include_news_sentiment):
    """
    Fits XGBoost classifiers and calculates accuracy and specificity with an option to include/exclude news sentiment scores.

    Parameters:
    - stock_symbols: List of stock symbols to process.
    - results: Dictionary with stock data DataFrames.
    - interval_days: The interval of days over which metrics are calculated.
    - include_news_sentiment: Boolean indicating whether to include news sentiment scores as features.

    Returns:
    - Dictionary containing models and metrics for each stock symbol.
    """
    # Initialize dictionaries to store models and metrics
    model = {}
    metrics = {}

    for symbol in stock_symbols:
        # Construct column names
        stock_sentiment_col = f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
        news_sentiment_col = f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
        metric_col = f'{symbol}_SMA_Diff_{interval_days}d'

        # Check if required columns exist
        if symbol in results and stock_sentiment_col in results[symbol].columns and metric_col in results[symbol].columns:
            # Build features and target
            if include_news_sentiment and news_sentiment_col in results[symbol].columns:
                data = results[symbol][[news_sentiment_col, stock_sentiment_col, metric_col]].dropna()
                X = data[[news_sentiment_col, stock_sentiment_col]].values
            else:
                data = results[symbol][[stock_sentiment_col, metric_col]].dropna()
                X = data[[stock_sentiment_col]].values
            
            y = data[metric_col].values

            # Compute scale_pos_weight for class imbalance
            scale_pos_weight = len(y[y == 0]) / len(y[y == 1]) if len(y[y == 1]) > 0 else 1

            # Fit XGBoost model
            xgb_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric="logloss")
            xgb_model.fit(X, y)
            y_pred = xgb_model.predict(X)

            # Compute confusion matrix
            tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

            # Calculate metrics
            accuracy = accuracy_score(y, y_pred)
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            f1 = f1_score(y, y_pred)
            roc_auc = roc_auc_score(y, xgb_model.predict_proba(X)[:, 1])

            # Store results
            model[symbol] = xgb_model
            metrics[symbol] = {
                "accuracy": accuracy,
                "specificity": specificity,
                "f1_score": f1,
                "roc_auc": roc_auc
            }

            # Print results
            print(f"{symbol}:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Specificity: {specificity:.4f}")
            print(f"F1-Score: {f1:.4f}")
            print(f"ROC-AUC: {roc_auc:.4f}")
        else:
            print(f"Required columns for {symbol} with interval {interval_days} days not found. Skipping.")

In [135]:
fit_sentiments_vs_metric_as_xgb(stock_symbols, results, interval_days, include_news_sentiment = True)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



GOOG:
Accuracy: 0.9085
Specificity: 0.9444
F1-Score: 0.9451
ROC-AUC: 0.9828
MSFT:
Accuracy: 0.9701
Specificity: 1.0000
F1-Score: 0.9817
ROC-AUC: 0.9979
NVDA:
Accuracy: 0.0147
Specificity: 1.0000
F1-Score: 0.0000
ROC-AUC: 0.5000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



AMZN:
Accuracy: 0.9654
Specificity: 0.9928
F1-Score: 0.9748
ROC-AUC: 0.9981
AAPL:
Accuracy: 0.9706
Specificity: 1.0000
F1-Score: 0.9811
ROC-AUC: 0.9983
