In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [3]:
df = pd.read_csv("final_data.csv") 
df.dropna(inplace=True)
df.describe()

Unnamed: 0,angry,disgust,fear,happy,neutral,sad,surprise,emotional_entry_count,average_sentiment_score,sentiment_entry_count,percentage_difference
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,14.409784,0.128373,18.941708,1.100666,30.176518,31.570345,3.672607,160.30303,0.110505,240.878788,0.074466
std,13.031845,0.229245,11.933211,1.339909,22.783064,13.006028,5.525137,138.065991,0.254808,124.789262,0.56666
min,2.467037,3e-05,1.423784,0.010571,5.996684,1.6104,0.00097,2.0,-0.209905,1.0,-1.325644
25%,4.703229,0.001753,7.888783,0.183319,11.157334,27.037318,0.188173,48.0,-0.014135,158.0,-0.160781
50%,8.932758,0.00803,19.896419,0.511851,25.238812,31.67565,0.699688,125.0,0.088435,304.0,0.060079
75%,27.093717,0.224939,27.611899,1.222938,39.618834,40.806352,7.026155,250.0,0.119526,336.0,0.425471
max,53.364653,0.953955,45.354121,4.768783,92.969578,56.430499,18.289802,627.0,0.999845,393.0,1.633034


### Predicting Sentiment Analysis Score using Multivariate Regression for Emotions

In [4]:
import statsmodels.api as sm

# All emotion columns together (multivariate regression)
X = df[["angry", "disgust", "fear", "happy", "neutral", "sad"]]
X = sm.add_constant(X)  # Include intercept

y = df["average_sentiment_score"]

# Fit the OLS model
model = sm.OLS(y, X)
results = model.fit()

# Print the full regression summary (this is very "econometrics textbook")
print(results.summary())

                               OLS Regression Results                              
Dep. Variable:     average_sentiment_score   R-squared:                       0.145
Model:                                 OLS   Adj. R-squared:                 -0.052
Method:                      Least Squares   F-statistic:                    0.7363
Date:                     Thu, 13 Mar 2025   Prob (F-statistic):              0.625
Time:                             11:23:12   Log-Likelihood:                 1.3912
No. Observations:                       33   AIC:                             11.22
Df Residuals:                           26   BIC:                             21.69
Df Model:                                6                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

### Predicting Sentiment Analysis Score using Linear Regression for Each Emotion Separately

In [5]:
import statsmodels.api as sm

emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad"]

print('OLS Regression Results: Predicting average_sentiment_score from each emotion')
results = {}

for emotion in emotions:
    X = df[[emotion]]  # Feature (independent variable)
    X = sm.add_constant(X)  # Add intercept term
    y = df["average_sentiment_score"]  # Target (dependent variable)

    model = sm.OLS(y, X)
    result = model.fit()
    
    # Store full-sample R² and coefficient details
    results[emotion] = {
        "R²": result.rsquared,
        "coefficient": result.params[emotion],
        "t_stat": result.tvalues[emotion],
        "p_value": result.pvalues[emotion]
    }

# Print regression summaries
for emotion, metrics in results.items():
    print(f"\nEmotion: {emotion}")
    print(f"    R²: {metrics['R²']:.4f}")
    print(f"    Coefficient: {metrics['coefficient']:.4f}")
    print(f"    t-statistic: {metrics['t_stat']:.4f}")
    print(f"    p-value: {metrics['p_value']:.4f}")


OLS Regression Results: Predicting average_sentiment_score from each emotion

Emotion: angry
    R²: 0.0033
    Coefficient: 0.0011
    t-statistic: 0.3218
    p-value: 0.7498

Emotion: disgust
    R²: 0.0034
    Coefficient: 0.0648
    t-statistic: 0.3252
    p-value: 0.7472

Emotion: fear
    R²: 0.0351
    Coefficient: 0.0040
    t-statistic: 1.0619
    p-value: 0.2965

Emotion: happy
    R²: 0.0138
    Coefficient: 0.0224
    t-statistic: 0.6590
    p-value: 0.5148

Emotion: neutral
    R²: 0.0229
    Coefficient: -0.0017
    t-statistic: -0.8519
    p-value: 0.4008

Emotion: sad
    R²: 0.0092
    Coefficient: -0.0019
    t-statistic: -0.5357
    p-value: 0.5960


### Predicting Stock Percentage Change using Linear Regression for Each Emotion Separately

In [6]:
import statsmodels.api as sm

emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

print('OLS Regression Results: Predicting percentage_change from each emotion')
results = {}

for emotion in emotions:
    X = df[[emotion]]  # Feature (independent variable)
    X = sm.add_constant(X)  # Add intercept term
    y = df["percentage_difference"]  # Target (dependent variable)

    model = sm.OLS(y, X)
    result = model.fit()

    # Store full-sample R² and coefficient details
    results[emotion] = {
        "R²": result.rsquared,
        "coefficient": result.params[emotion],
        "t_stat": result.tvalues[emotion],
        "p_value": result.pvalues[emotion]
    }

# Print regression summaries
for emotion, metrics in results.items():
    print(f"\nEmotion: {emotion}")
    print(f"    R²: {metrics['R²']:.4f}")
    print(f"    Coefficient: {metrics['coefficient']:.4f}")
    print(f"    t-statistic: {metrics['t_stat']:.4f}")
    print(f"    p-value: {metrics['p_value']:.4f}")


OLS Regression Results: Predicting percentage_change from each emotion

Emotion: angry
    R²: 0.0062
    Coefficient: 0.0034
    t-statistic: 0.4406
    p-value: 0.6626

Emotion: disgust
    R²: 0.0005
    Coefficient: -0.0572
    t-statistic: -0.1288
    p-value: 0.8983

Emotion: fear
    R²: 0.0230
    Coefficient: -0.0072
    t-statistic: -0.8545
    p-value: 0.3994

Emotion: happy
    R²: 0.0739
    Coefficient: -0.1150
    t-statistic: -1.5726
    p-value: 0.1260

Emotion: neutral
    R²: 0.0381
    Coefficient: 0.0049
    t-statistic: 1.1081
    p-value: 0.2763

Emotion: sad
    R²: 0.0354
    Coefficient: -0.0082
    t-statistic: -1.0663
    p-value: 0.2945

Emotion: surprise
    R²: 0.0236
    Coefficient: -0.0158
    t-statistic: -0.8658
    p-value: 0.3932


### Predicting Stock Percentage Change using Multivariate Regression on Emotion (excluding surprise)

In [7]:
import statsmodels.api as sm

emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad"]

print('OLS Regression Results: Predicting stock percentage change from each emotion')

X = df[emotions]  # Feature (independent variable)
X = sm.add_constant(X)  # Add intercept term
y = df["percentage_difference"]  # Target (dependent variable)

model = sm.OLS(y, X)
result = model.fit()

print(result.summary())

OLS Regression Results: Predicting stock percentage change from each emotion
                              OLS Regression Results                             
Dep. Variable:     percentage_difference   R-squared:                       0.175
Model:                               OLS   Adj. R-squared:                 -0.015
Method:                    Least Squares   F-statistic:                    0.9195
Date:                   Thu, 13 Mar 2025   Prob (F-statistic):              0.497
Time:                           11:23:12   Log-Likelihood:                -24.398
No. Observations:                     33   AIC:                             62.80
Df Residuals:                         26   BIC:                             73.27
Df Model:                              6                                         
Covariance Type:               nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------

### Predicting Stock Percentage Change using Linear Regression on Sentiment Analysis

In [8]:
print('r score for sentiment / percentage_difference')
correlation = df['average_sentiment_score'].corr(df['percentage_difference'])
print('Pearson correlation:', correlation)

# Set up features (X) and target (y)
X = df[['average_sentiment_score']]  # Note double brackets to make X a DataFrame (2D)
y = df["percentage_difference"]      # y can stay as a Series (1D)

# Create and train the linear regression model
model = LinearRegression()
model.fit(X, y)

# Predictions on the test set
y_pred = model.predict(X)

# Calculate and print the R^2 score
r2 = r2_score(y, y_pred)
print('R^2:', r2)

r score for sentiment / percentage_difference
Pearson correlation: -0.23909425351329797
R^2: 0.05716606206308117
