In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import time
import yfinance as yf
import statsmodels.api as sm
import numpy as np

## Import all data

In [3]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes (the previous ``json.load(open(...))`` form left every
    handle open until garbage collection).
    """
    with open(path, "r") as handle:
        return json.load(handle)


# Accession numbers to exclude from the bag-of-words table (used only as an
# exclusion filter below).
short_filings = _load_json("metastore/short_filings.json")

# Bag-of-words features, indexed by accession number, without the excluded filings.
bow = pd.read_csv("metastore/bag_of_words_2001-2021.csv").set_index("acc_nr")
bow = bow[~bow.index.isin(short_filings)].copy()

# Nested mapping cik -> acc_nr -> {"positive": ..., "negative": ...} (as read
# by the cell that assembles final_frame).
mean_sentiment_store_item7 = _load_json("metastore/mean_sentiment_store_item7.json")

# Mapping cik -> JSON-serialised frame containing an "accessionNumber" column.
cik_store_priced = _load_json("metastore/cik_store_priced.json")

# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
financial_indicator = pd.read_pickle("data/financial_indicator.p").set_index("cik")


In [4]:
def get_fin_indicator(cik):
    """Look up the "financial (REIT, insurance)" flag for ``cik``.

    Reads the module-level ``financial_indicator`` frame (indexed by ``cik``).

    Args:
        cik: Index label to look up.

    Returns:
        The value stored in the "financial (REIT, insurance)" column for
        ``cik``, or ``0`` when the label (or the column) is not present.

    NOTE(review): callers pass the 10-digit string prefix of the accession
    number. If the ``financial_indicator`` index uses a different dtype or
    padding, every lookup falls through to ``0`` - confirm the index format.
    """
    try:
        return financial_indicator.loc[cik, "financial (REIT, insurance)"]
    except KeyError:
        # Only a missing label falls back to the default; any other failure
        # (e.g. the frame not being loaded) now surfaces instead of being
        # silently turned into 0 by a bare ``except``.
        return 0

In [5]:
# --- FinBERT sentiment -------------------------------------------------------
# Flatten cik -> acc_nr -> scores into one record per accession number and
# build the frame in a single step (previously every value was written with an
# individual ``.loc`` assignment, enlarging the frame row by row).
sentiment_records = {
    acc_nr: {
        "finbert_pos": scores["positive"],
        "finbert_neg": scores["negative"],
        # Net tone: positive score minus negative score.
        "finbert": scores["positive"] - scores["negative"],
    }
    for filings in mean_sentiment_store_item7.values()
    for acc_nr, scores in filings.items()
}
sentiment_frame = pd.DataFrame.from_dict(sentiment_records, orient="index")

# Left join: filings that have sentiment scores but no bag-of-words row are not
# added. The old row-enlarging loop created such rows with NaN bag-of-words
# values, and the final dropna() removed them again, so the saved result
# contains the same filings.
final_frame = bow.join(sentiment_frame)

# --- Price-based variables ---------------------------------------------------
# Collect the per-cik frames and concatenate once. DataFrame.append in a loop
# is quadratic and no longer exists in pandas >= 2.0.
priced_frames = [pd.read_json(cik_store_priced[cik]) for cik in cik_store_priced]
merge_frame = pd.concat(priced_frames) if priced_frames else pd.DataFrame()

final_frame = final_frame.join(merge_frame.set_index("accessionNumber")).reset_index()

# --- Fields derived from the accession number --------------------------------
# "year" is the two-digit middle segment of acc_nr; "cik" is its 10-digit prefix.
final_frame["year"] = (
    final_frame["acc_nr"]
    .replace(r"(\d{10})-", "", regex=True)
    .replace(r"-(\d{6})", "", regex=True)
)
# NOTE(review): the accession-number prefix identifies the entity that
# submitted the filing, which may be a filing agent rather than the company -
# verify against the cik keys of mean_sentiment_store_item7 / cik_store_priced.
final_frame["cik"] = final_frame["acc_nr"].replace(r"-(\d{2})-(\d{6})", "", regex=True)
final_frame["financial_indicator"] = final_frame["cik"].apply(get_fin_indicator)

# Keep only filings with a value in every column (any NaN drops the row).
final_frame = final_frame.set_index("acc_nr").dropna()
final_frame.to_pickle("data/final_frame.p")

## Define general regression variables

In [334]:
# FinBERT sentiment regressors.
# NOTE(review): "finbert" is constructed as finbert_pos - finbert_neg when
# final_frame is assembled, so these three columns are linearly dependent.
# Entering all of them in one OLS makes their individual coefficients
# unidentified - consider using either the net score or the pos/neg pair.
finbert_vars = [
    "finbert",
    "finbert_pos",
    "finbert_neg",
]

# Bag-of-words regressors, summed-weight variant (one column per word list).
bow_summed_vars = [
    "summed_negative_weights",
    "summed_positive_weights",
    "summed_uncertain_weights",
    "summed_litigious_weights",
    "summed_strong_m_weights",
    "summed_weak_m_weights",
    "summed_constraining_weights",
]

# Bag-of-words regressors, proportion variant (one column per word list).
bow_proportion_vars = [
    "total_negative_proportion",
    "total_positive_proportion",
    "total_uncertain_proportion",
    "total_litigious_proportion",
    "total_strong_m_proportion",
    "total_weak_m_proportion",
    "total_constraining_proportion",
]

# Control variables appended to every specification.
control_vars = [
    "pre_filing_alpha_2",
    "pre_filing_alpha_6",
    "pre_filing_alpha_20",
    "volume_momentum",
    "financial_indicator",
]

# Dependent variables, each kept as a one-element list so that
# frame[y_*] yields a single-column DataFrame.
y_bhar = ["stockBHAR_4"]
y_alpha = ["jensens_alpha_value_4"]
y_bhr = ["stockBHR_4"]
y_vol = ["abnormal_volume"]

In [335]:
def regression(X, y):
    """Fit an OLS model of ``y`` on ``X`` plus an intercept and return its summary.

    Args:
        X: Regressors; cast to float before a constant column is added.
        y: Dependent variable; cast to float.

    Returns:
        The object produced by ``summary()`` on the fitted statsmodels OLS
        result (fitted with the default ``fit()`` settings).
    """
    design = sm.add_constant(X.astype(float))  # prepend the intercept column
    fitted = sm.OLS(y.astype(float), design).fit()
    return fitted.summary()

In [336]:
# Two-digit year segment of the accession number, as an integer.
filing_year = final_frame["year"].astype(int)

# Sub-period samples: years below 09 and years above 10.
# Years 09 and 10 fall into neither sub-sample; the full sample keeps everything.
final_frame_2008 = final_frame[filing_year < 9].copy()
final_frame_2022 = final_frame[filing_year > 10].copy()
final_frame_total = final_frame.copy()

## BoW only

### summed vars

#### 2000-2022

In [338]:
# Summed BoW weights + controls on the full sample; one design matrix serves
# both the jensens_alpha_value_4 and the stockBHAR_4 regression.
X_bow_summed_total = final_frame_total[bow_summed_vars + control_vars]
regression_bow_summed_total_alpha = regression(X_bow_summed_total, final_frame_total[y_alpha])
regression_bow_summed_total_bhar = regression(X_bow_summed_total, final_frame_total[y_bhar])

In [367]:
# Show the OLS summary: jensens_alpha_value_4 on summed BoW weights + controls (final_frame_total).
regression_bow_summed_total_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.035
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,4.255
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,1.22e-06
Time:,20:34:50,Log-Likelihood:,2485.4
No. Observations:,1414,AIC:,-4945.0
Df Residuals:,1401,BIC:,-4877.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0025,0.009,0.270,0.787,-0.016,0.021
summed_negative_weights,0.0003,0.001,0.213,0.831,-0.003,0.003
summed_positive_weights,0.0008,0.000,2.674,0.008,0.000,0.001
summed_uncertain_weights,-2.11e-05,0.001,-0.017,0.986,-0.002,0.002
summed_litigious_weights,-9.162e-05,0.000,-0.357,0.721,-0.001,0.000
summed_strong_m_weights,-0.0014,0.006,-0.222,0.824,-0.014,0.011
summed_weak_m_weights,-0.0021,0.006,-0.345,0.730,-0.014,0.010
summed_constraining_weights,-0.0024,0.009,-0.271,0.786,-0.020,0.015
pre_filing_alpha_2,0.1169,0.032,3.682,0.000,0.055,0.179

0,1,2,3
Omnibus:,364.566,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14486.247
Skew:,0.426,Prob(JB):,0.0
Kurtosis:,18.657,Cond. No.,5630.0


In [348]:
# Show the OLS summary: stockBHAR_4 on summed BoW weights + controls (final_frame_total).
regression_bow_summed_total_bhar

0,1,2,3
Dep. Variable:,stockBHAR_4,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,2.305
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.00657
Time:,20:34:50,Log-Likelihood:,2683.5
No. Observations:,1414,AIC:,-5341.0
Df Residuals:,1401,BIC:,-5273.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0075,0.008,0.933,0.351,-0.008,0.023
summed_negative_weights,-0.0016,0.001,-1.244,0.214,-0.004,0.001
summed_positive_weights,0.0001,0.000,0.534,0.593,-0.000,0.001
summed_uncertain_weights,-0.0012,0.001,-1.125,0.261,-0.003,0.001
summed_litigious_weights,-0.0003,0.000,-1.217,0.224,-0.001,0.000
summed_strong_m_weights,0.0072,0.005,1.321,0.187,-0.004,0.018
summed_weak_m_weights,0.0055,0.005,1.042,0.298,-0.005,0.016
summed_constraining_weights,0.0098,0.008,1.255,0.210,-0.006,0.025
pre_filing_alpha_2,0.0248,0.028,0.900,0.369,-0.029,0.079

0,1,2,3
Omnibus:,435.393,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8175.872
Skew:,-0.943,Prob(JB):,0.0
Kurtosis:,14.628,Cond. No.,5630.0


#### 2000-2008

In [361]:
# Summed BoW weights + controls on the early sub-sample (final_frame_2008).
X_bow_summed_2008 = final_frame_2008[bow_summed_vars + control_vars]
regression_bow_summed_2008_alpha = regression(X_bow_summed_2008, final_frame_2008[y_alpha])
regression_bow_summed_2008_bhar = regression(X_bow_summed_2008, final_frame_2008[y_bhar])

In [362]:
# Show the OLS summary: jensens_alpha_value_4 on summed BoW weights + controls (final_frame_2008).
regression_bow_summed_2008_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.02
Model:,OLS,Adj. R-squared:,-0.006
Method:,Least Squares,F-statistic:,0.7621
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.678
Time:,21:35:04,Log-Likelihood:,760.38
No. Observations:,420,AIC:,-1497.0
Df Residuals:,408,BIC:,-1448.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0123,0.013,-0.943,0.346,-0.038,0.013
summed_negative_weights,5.614e-06,9.41e-05,0.060,0.952,-0.000,0.000
summed_positive_weights,0.0003,0.000,0.894,0.372,-0.000,0.001
summed_uncertain_weights,0.0003,0.001,0.248,0.804,-0.002,0.003
summed_litigious_weights,-2.776e-05,0.000,-0.083,0.934,-0.001,0.001
summed_strong_m_weights,-0.0033,0.007,-0.468,0.640,-0.017,0.011
summed_weak_m_weights,-0.0005,0.007,-0.075,0.940,-0.014,0.013
summed_constraining_weights,9.468e-07,1.59e-05,0.060,0.952,-3.03e-05,3.22e-05
pre_filing_alpha_2,0.0265,0.071,0.371,0.711,-0.114,0.167

0,1,2,3
Omnibus:,131.719,Durbin-Watson:,2.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1343.837
Skew:,-1.022,Prob(JB):,1.55e-292
Kurtosis:,11.521,Cond. No.,1.44e+16


In [215]:
# Show the OLS summary: stockBHAR_4 on summed BoW weights + controls (final_frame_2008).
# NOTE(review): the output saved under this cell has an older execution count
# and date than the cell that fits this model - re-run to refresh it.
regression_bow_summed_2008_bhar

0,1,2,3
Dep. Variable:,stockBHAR_4,R-squared:,0.009
Model:,OLS,Adj. R-squared:,-0.011
Method:,Least Squares,F-statistic:,0.4454
Date:,"Wed, 16 Mar 2022",Prob (F-statistic):,0.893
Time:,19:59:08,Log-Likelihood:,801.96
No. Observations:,420,AIC:,-1586.0
Df Residuals:,411,BIC:,-1550.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0023,0.012,-0.198,0.843,-0.025,0.021
summed_negative_weights,-5.54e-05,8.46e-05,-0.655,0.513,-0.000,0.000
summed_positive_weights,0.0003,0.000,0.930,0.353,-0.000,0.001
summed_uncertain_weights,-0.0005,0.001,-0.405,0.685,-0.003,0.002
summed_litigious_weights,-4.947e-05,0.000,-0.164,0.870,-0.001,0.001
summed_strong_m_weights,0.0038,0.006,0.591,0.555,-0.009,0.016
summed_weak_m_weights,0.0036,0.006,0.606,0.545,-0.008,0.015
summed_constraining_weights,-9.343e-06,1.43e-05,-0.655,0.513,-3.74e-05,1.87e-05
pre_filing_alpha_6,0.0260,0.044,0.593,0.553,-0.060,0.112

0,1,2,3
Omnibus:,80.21,Durbin-Watson:,2.199
Prob(Omnibus):,0.0,Jarque-Bera (JB):,717.985
Skew:,-0.496,Prob(JB):,1.23e-156
Kurtosis:,9.328,Cond. No.,1.44e+16


#### 2011-2022

In [364]:
# Summed BoW weights + controls on the late sub-sample (final_frame_2022).
X_bow_summed_2022 = final_frame_2022[bow_summed_vars + control_vars]
regression_bow_summed_2022_alpha = regression(X_bow_summed_2022, final_frame_2022[y_alpha])
regression_bow_summed_2022_bhar = regression(X_bow_summed_2022, final_frame_2022[y_bhar])

In [365]:
# Show the OLS summary: jensens_alpha_value_4 on summed BoW weights + controls (final_frame_2022).
regression_bow_summed_2022_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.033
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,2.436
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.00406
Time:,21:37:23,Log-Likelihood:,1617.3
No. Observations:,880,AIC:,-3209.0
Df Residuals:,867,BIC:,-3146.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0138,0.016,0.884,0.377,-0.017,0.045
summed_negative_weights,0.0013,0.003,0.495,0.621,-0.004,0.006
summed_positive_weights,0.0009,0.000,1.862,0.063,-4.65e-05,0.002
summed_uncertain_weights,-0.0009,0.002,-0.359,0.719,-0.006,0.004
summed_litigious_weights,-0.0003,0.000,-0.648,0.517,-0.001,0.001
summed_strong_m_weights,0.0107,0.015,0.714,0.475,-0.019,0.040
summed_weak_m_weights,-0.0003,0.014,-0.022,0.982,-0.027,0.026
summed_constraining_weights,-0.0084,0.015,-0.545,0.586,-0.039,0.022
pre_filing_alpha_2,0.0746,0.035,2.101,0.036,0.005,0.144

0,1,2,3
Omnibus:,191.249,Durbin-Watson:,1.838
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5727.375
Skew:,0.179,Prob(JB):,0.0
Kurtosis:,15.493,Cond. No.,4380.0


In [None]:
# Show the OLS summary: stockBHAR_4 on summed BoW weights + controls (final_frame_2022).
regression_bow_summed_2022_bhar

### proportion vars

#### 2000-2022

In [349]:
# BoW proportions + controls on the full sample; one design matrix, two targets.
X_bow_prop_total = final_frame_total[bow_proportion_vars + control_vars]
regression_bow_prop_total_alpha = regression(X_bow_prop_total, final_frame_total[y_alpha])
regression_bow_prop_total_bhar = regression(X_bow_prop_total, final_frame_total[y_bhar])

In [368]:
# Show the OLS summary: jensens_alpha_value_4 on BoW proportions + controls (final_frame_total).
regression_bow_prop_total_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.032
Model:,OLS,Adj. R-squared:,0.023
Method:,Least Squares,F-statistic:,3.828
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,8.94e-06
Time:,21:15:03,Log-Likelihood:,2482.9
No. Observations:,1414,AIC:,-4940.0
Df Residuals:,1401,BIC:,-4872.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0048,0.009,0.517,0.605,-0.013,0.023
total_negative_proportion,0.1981,0.346,0.573,0.566,-0.480,0.876
total_positive_proportion,-1.0532,0.669,-1.575,0.116,-2.365,0.259
total_uncertain_proportion,0.2728,0.689,0.396,0.692,-1.079,1.625
total_litigious_proportion,0.0508,0.311,0.163,0.870,-0.559,0.660
total_strong_m_proportion,0.6455,1.324,0.488,0.626,-1.952,3.243
total_weak_m_proportion,0.1022,0.875,0.117,0.907,-1.614,1.819
total_constraining_proportion,0.0089,0.828,0.011,0.991,-1.615,1.632
pre_filing_alpha_2,0.1160,0.032,3.651,0.000,0.054,0.178

0,1,2,3
Omnibus:,365.225,Durbin-Watson:,1.903
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14096.828
Skew:,0.442,Prob(JB):,0.0
Kurtosis:,18.443,Cond. No.,1790.0


In [350]:
# Show the OLS summary: stockBHAR_4 on BoW proportions + controls (final_frame_total).
regression_bow_prop_total_bhar

0,1,2,3
Dep. Variable:,stockBHAR_4,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,2.248
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.00821
Time:,21:15:03,Log-Likelihood:,2683.2
No. Observations:,1414,AIC:,-5340.0
Df Residuals:,1401,BIC:,-5272.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0111,0.008,1.389,0.165,-0.005,0.027
total_negative_proportion,-0.4986,0.300,-1.662,0.097,-1.087,0.090
total_positive_proportion,-0.7906,0.581,-1.362,0.173,-1.929,0.348
total_uncertain_proportion,0.8936,0.598,1.494,0.135,-0.280,2.067
total_litigious_proportion,0.4551,0.270,1.688,0.092,-0.074,0.984
total_strong_m_proportion,-0.4752,1.149,-0.413,0.679,-2.729,1.779
total_weak_m_proportion,-0.7378,0.759,-0.972,0.331,-2.228,0.752
total_constraining_proportion,-0.3260,0.718,-0.454,0.650,-1.735,1.083
pre_filing_alpha_2,0.0211,0.028,0.765,0.444,-0.033,0.075

0,1,2,3
Omnibus:,423.877,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7863.826
Skew:,-0.908,Prob(JB):,0.0
Kurtosis:,14.409,Cond. No.,1790.0


#### 2000-2008

In [None]:
# BoW proportions + controls on the early sub-sample (final_frame_2008).
X_bow_prop_2008 = final_frame_2008[bow_proportion_vars + control_vars]
regression_bow_prop_2008_alpha = regression(X_bow_prop_2008, final_frame_2008[y_alpha])
regression_bow_prop_2008_bhar = regression(X_bow_prop_2008, final_frame_2008[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on BoW proportions + controls (final_frame_2008).
regression_bow_prop_2008_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on BoW proportions + controls (final_frame_2008).
regression_bow_prop_2008_bhar

#### 2011-2022

In [None]:
# BoW proportions + controls on the late sub-sample (final_frame_2022).
X_bow_prop_2022 = final_frame_2022[bow_proportion_vars + control_vars]
regression_bow_prop_2022_alpha = regression(X_bow_prop_2022, final_frame_2022[y_alpha])
regression_bow_prop_2022_bhar = regression(X_bow_prop_2022, final_frame_2022[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on BoW proportions + controls (final_frame_2022).
regression_bow_prop_2022_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on BoW proportions + controls (final_frame_2022).
regression_bow_prop_2022_bhar

## FinBERT only

#### 2000-2022

In [351]:
# FinBERT scores + controls on the full sample; one design matrix, two targets.
X_finbert_total = final_frame_total[finbert_vars + control_vars]
regression_finbert_total_alpha = regression(X_finbert_total, final_frame_total[y_alpha])
regression_finbert_total_bhar = regression(X_finbert_total, final_frame_total[y_bhar])

In [352]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT scores + controls (final_frame_total).
regression_finbert_total_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,5.976
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,7.1e-07
Time:,21:15:51,Log-Likelihood:,2480.8
No. Observations:,1414,AIC:,-4946.0
Df Residuals:,1406,BIC:,-4904.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0055,0.005,1.142,0.254,-0.004,0.015
finbert,-0.0056,0.007,-0.767,0.444,-0.020,0.009
finbert_pos,0.0011,0.011,0.101,0.920,-0.020,0.022
finbert_neg,0.0067,0.010,0.640,0.522,-0.014,0.027
pre_filing_alpha_2,0.1142,0.032,3.604,0.000,0.052,0.176
pre_filing_alpha_6,0.0590,0.027,2.151,0.032,0.005,0.113
pre_filing_alpha_20,-0.0380,0.010,-3.747,0.000,-0.058,-0.018
volume_momentum,-0.0045,0.003,-1.771,0.077,-0.009,0.000
financial_indicator,-0.0007,0.004,-0.185,0.853,-0.008,0.007

0,1,2,3
Omnibus:,370.562,Durbin-Watson:,1.895
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14653.016
Skew:,0.456,Prob(JB):,0.0
Kurtosis:,18.744,Cond. No.,4.04e+16


In [353]:
# Show the OLS summary: stockBHAR_4 on FinBERT scores + controls (final_frame_total).
regression_finbert_total_bhar

0,1,2,3
Dep. Variable:,stockBHAR_4,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,3.546
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.000875
Time:,21:15:51,Log-Likelihood:,2682.1
No. Observations:,1414,AIC:,-5348.0
Df Residuals:,1406,BIC:,-5306.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0011,0.004,0.256,0.798,-0.007,0.009
finbert,0.0071,0.006,1.132,0.258,-0.005,0.020
finbert_pos,0.0216,0.009,2.353,0.019,0.004,0.040
finbert_neg,0.0145,0.009,1.609,0.108,-0.003,0.032
pre_filing_alpha_2,0.0220,0.027,0.801,0.423,-0.032,0.076
pre_filing_alpha_6,-0.0792,0.024,-3.328,0.001,-0.126,-0.033
pre_filing_alpha_20,0.0141,0.009,1.599,0.110,-0.003,0.031
volume_momentum,-0.0050,0.002,-2.311,0.021,-0.009,-0.001
financial_indicator,-0.0004,0.003,-0.131,0.896,-0.007,0.006

0,1,2,3
Omnibus:,432.271,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8315.768
Skew:,-0.925,Prob(JB):,0.0
Kurtosis:,14.736,Cond. No.,4.04e+16


#### 2000-2008

In [None]:
# FinBERT scores + controls on the early sub-sample (final_frame_2008).
X_finbert_2008 = final_frame_2008[finbert_vars + control_vars]
regression_finbert_2008_alpha = regression(X_finbert_2008, final_frame_2008[y_alpha])
regression_finbert_2008_bhar = regression(X_finbert_2008, final_frame_2008[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT scores + controls (final_frame_2008).
regression_finbert_2008_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT scores + controls (final_frame_2008).
regression_finbert_2008_bhar

#### 2011-2022

In [None]:
# FinBERT scores + controls on the late sub-sample (final_frame_2022).
X_finbert_2022 = final_frame_2022[finbert_vars + control_vars]
regression_finbert_2022_alpha = regression(X_finbert_2022, final_frame_2022[y_alpha])
regression_finbert_2022_bhar = regression(X_finbert_2022, final_frame_2022[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT scores + controls (final_frame_2022).
regression_finbert_2022_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT scores + controls (final_frame_2022).
regression_finbert_2022_bhar

## FinBERT + BoW

### summed vars

#### 2000-2022

In [358]:
# FinBERT scores + summed BoW weights + controls on the full sample.
X_finbert_bow_summed_total = final_frame_total[finbert_vars + bow_summed_vars + control_vars]
regression_finbert_bow_summed_total_alpha = regression(X_finbert_bow_summed_total, final_frame_total[y_alpha])
regression_finbert_bow_summed_total_bhar = regression(X_finbert_bow_summed_total, final_frame_total[y_bhar])

In [359]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + summed BoW weights + controls (final_frame_total).
regression_finbert_bow_summed_total_alpha

0,1,2,3
Dep. Variable:,jensens_alpha_value_4,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,3.721
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,3.6e-06
Time:,21:32:06,Log-Likelihood:,2486.0
No. Observations:,1414,AIC:,-4942.0
Df Residuals:,1399,BIC:,-4863.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0034,0.011,-0.315,0.753,-0.025,0.018
finbert,-0.0009,0.008,-0.116,0.908,-0.017,0.015
finbert_pos,0.0103,0.011,0.924,0.356,-0.012,0.032
finbert_neg,0.0112,0.011,0.997,0.319,-0.011,0.033
summed_negative_weights,0.0003,0.002,0.167,0.867,-0.003,0.003
summed_positive_weights,0.0008,0.000,2.666,0.008,0.000,0.001
summed_uncertain_weights,2.552e-05,0.001,0.021,0.983,-0.002,0.002
summed_litigious_weights,-0.0001,0.000,-0.536,0.592,-0.001,0.000
summed_strong_m_weights,-0.0018,0.006,-0.290,0.772,-0.014,0.011

0,1,2,3
Omnibus:,364.022,Durbin-Watson:,1.907
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14741.351
Skew:,0.416,Prob(JB):,0.0
Kurtosis:,18.796,Cond. No.,1.09e+18


In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + summed BoW weights + controls (final_frame_total).
regression_finbert_bow_summed_total_bhar

#### 2000-2008

In [None]:
# FinBERT scores + summed BoW weights + controls on the early sub-sample (final_frame_2008).
X_finbert_bow_summed_2008 = final_frame_2008[finbert_vars + bow_summed_vars + control_vars]
regression_finbert_bow_summed_2008_alpha = regression(X_finbert_bow_summed_2008, final_frame_2008[y_alpha])
regression_finbert_bow_summed_2008_bhar = regression(X_finbert_bow_summed_2008, final_frame_2008[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + summed BoW weights + controls (final_frame_2008).
regression_finbert_bow_summed_2008_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + summed BoW weights + controls (final_frame_2008).
regression_finbert_bow_summed_2008_bhar

#### 2011-2022

In [None]:
# FinBERT scores + summed BoW weights + controls on the late sub-sample (final_frame_2022).
X_finbert_bow_summed_2022 = final_frame_2022[finbert_vars + bow_summed_vars + control_vars]
regression_finbert_bow_summed_2022_alpha = regression(X_finbert_bow_summed_2022, final_frame_2022[y_alpha])
regression_finbert_bow_summed_2022_bhar = regression(X_finbert_bow_summed_2022, final_frame_2022[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + summed BoW weights + controls (final_frame_2022).
regression_finbert_bow_summed_2022_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + summed BoW weights + controls (final_frame_2022).
regression_finbert_bow_summed_2022_bhar

### proportion vars

#### 2000-2022

In [None]:
# FinBERT scores + BoW proportions + controls on the full sample.
X_finbert_bow_prop_total = final_frame_total[finbert_vars + bow_proportion_vars + control_vars]
regression_finbert_bow_prop_total_alpha = regression(X_finbert_bow_prop_total, final_frame_total[y_alpha])
regression_finbert_bow_prop_total_bhar = regression(X_finbert_bow_prop_total, final_frame_total[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + BoW proportions + controls (final_frame_total).
regression_finbert_bow_prop_total_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + BoW proportions + controls (final_frame_total).
regression_finbert_bow_prop_total_bhar

#### 2000-2008

In [None]:
# FinBERT scores + BoW proportions + controls on the early sub-sample (final_frame_2008).
X_finbert_bow_prop_2008 = final_frame_2008[finbert_vars + bow_proportion_vars + control_vars]
regression_finbert_bow_prop_2008_alpha = regression(X_finbert_bow_prop_2008, final_frame_2008[y_alpha])
regression_finbert_bow_prop_2008_bhar = regression(X_finbert_bow_prop_2008, final_frame_2008[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + BoW proportions + controls (final_frame_2008).
regression_finbert_bow_prop_2008_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + BoW proportions + controls (final_frame_2008).
regression_finbert_bow_prop_2008_bhar

#### 2011-2022

In [None]:
# FinBERT scores + BoW proportions + controls on the late sub-sample (final_frame_2022).
X_finbert_bow_prop_2022 = final_frame_2022[finbert_vars + bow_proportion_vars + control_vars]
regression_finbert_bow_prop_2022_alpha = regression(X_finbert_bow_prop_2022, final_frame_2022[y_alpha])
regression_finbert_bow_prop_2022_bhar = regression(X_finbert_bow_prop_2022, final_frame_2022[y_bhar])

In [None]:
# Show the OLS summary: jensens_alpha_value_4 on FinBERT + BoW proportions + controls (final_frame_2022).
regression_finbert_bow_prop_2022_alpha

In [None]:
# Show the OLS summary: stockBHAR_4 on FinBERT + BoW proportions + controls (final_frame_2022).
regression_finbert_bow_prop_2022_bhar

## Look at extreme sentiment

In [196]:
def _upper_quartile_rows(frame, column):
    """Return the rows of ``frame`` whose ``column`` is strictly above its 75th percentile.

    Uses ``Series.quantile(0.75)`` directly instead of computing the whole
    ``describe()`` table just to read its "75%" entry.
    """
    return frame[frame[column] > frame[column].quantile(0.75)]


# Filings in the top quartile of each negative-tone measure.
bow_prop_neg = _upper_quartile_rows(final_frame_total, "total_negative_proportion")
finbert_neg = _upper_quartile_rows(final_frame_total, "finbert_neg")
bow_sum_neg = _upper_quartile_rows(final_frame_total, "summed_negative_weights")

# Filings in the top quartile of each positive-tone measure.
bow_prop_pos = _upper_quartile_rows(final_frame_total, "total_positive_proportion")
finbert_pos = _upper_quartile_rows(final_frame_total, "finbert_pos")
bow_sum_pos = _upper_quartile_rows(final_frame_total, "summed_positive_weights")



In [197]:
# Correlation between jensens_alpha_value_4 and the FinBERT score inside each
# top-quartile subsample. Only the two columns involved are correlated:
# DataFrame.corr() would build the full matrix over every column and its
# handling of non-numeric columns (e.g. the string "year"/"cik" columns)
# depends on the pandas version.
print(finbert_pos["jensens_alpha_value_4"].corr(finbert_pos["finbert_pos"]))
print(finbert_neg["jensens_alpha_value_4"].corr(finbert_neg["finbert_neg"]))

0.04442801646046545
0.06688159825533774


In [198]:
# Correlation between jensens_alpha_value_4 and the BoW proportion inside each
# top-quartile subsample. Only the two columns involved are correlated:
# DataFrame.corr() would build the full matrix over every column and its
# handling of non-numeric columns depends on the pandas version.
print(bow_prop_pos["jensens_alpha_value_4"].corr(bow_prop_pos["total_positive_proportion"]))
print(bow_prop_neg["jensens_alpha_value_4"].corr(bow_prop_neg["total_negative_proportion"]))

-0.029022122521243213
-0.034850917877053016


In [199]:
# Correlation between jensens_alpha_value_4 and the summed BoW weight inside
# each top-quartile subsample. Only the two columns involved are correlated:
# DataFrame.corr() would build the full matrix over every column and its
# handling of non-numeric columns depends on the pandas version.
print(bow_sum_pos["jensens_alpha_value_4"].corr(bow_sum_pos["summed_positive_weights"]))
print(bow_sum_neg["jensens_alpha_value_4"].corr(bow_sum_neg["summed_negative_weights"]))

0.15210257445401607
0.09767406342867517


## Look at buy-and-hold returns

In [219]:
# Buy-and-hold return (stockBHR_4) on FinBERT scores + BoW proportions + controls, full sample.
# NOTE(review): the output saved under this cell has an execution count lower
# than the cells that define the variable lists and sample frames - re-run to refresh it.
bhr_design = final_frame_total[finbert_vars + bow_proportion_vars + control_vars]
bhr_target = final_frame_total[y_bhr]
regression_finbert_bow_prop_total_bhr = regression(bhr_design, bhr_target)
regression_finbert_bow_prop_total_bhr

0,1,2,3
Dep. Variable:,stockBHR_4,R-squared:,0.026
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,3.378
Date:,"Wed, 16 Mar 2022",Prob (F-statistic):,0.000125
Time:,20:00:09,Log-Likelihood:,2419.6
No. Observations:,1415,AIC:,-4815.0
Df Residuals:,1403,BIC:,-4752.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0067,0.010,-0.659,0.510,-0.027,0.013
finbert,-0.0053,0.009,-0.608,0.543,-0.023,0.012
finbert_pos,0.0339,0.012,2.822,0.005,0.010,0.057
finbert_neg,0.0392,0.012,3.206,0.001,0.015,0.063
total_negative_proportion,-0.5491,0.409,-1.344,0.179,-1.350,0.252
total_positive_proportion,-0.7620,0.730,-1.044,0.297,-2.194,0.670
total_uncertain_proportion,0.5756,0.728,0.790,0.430,-0.853,2.005
total_litigious_proportion,0.3269,0.345,0.948,0.344,-0.350,1.004
total_strong_m_proportion,0.2327,1.394,0.167,0.867,-2.503,2.968

0,1,2,3
Omnibus:,492.821,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6012.176
Skew:,-1.263,Prob(JB):,0.0
Kurtosis:,12.777,Cond. No.,2.97e+16


## Look at abnormal volume

In [372]:
# abnormal_volume on BoW proportions + controls, full sample.
vol_design_bow_prop = final_frame_total[bow_proportion_vars + control_vars]
vol_target_bow_prop = final_frame_total[y_vol]
regression_bow_prop_total_vol = regression(vol_design_bow_prop, vol_target_bow_prop)
regression_bow_prop_total_vol

0,1,2,3
Dep. Variable:,abnormal_volume,R-squared:,0.212
Model:,OLS,Adj. R-squared:,0.205
Method:,Least Squares,F-statistic:,31.41
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,2.1e-64
Time:,22:03:31,Log-Likelihood:,-1811.1
No. Observations:,1414,AIC:,3648.0
Df Residuals:,1401,BIC:,3717.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6256,0.193,-3.247,0.001,-1.004,-0.248
total_negative_proportion,-1.7264,7.201,-0.240,0.811,-15.852,12.399
total_positive_proportion,-26.3512,13.939,-1.891,0.059,-53.694,0.992
total_uncertain_proportion,-16.4329,14.362,-1.144,0.253,-44.607,11.741
total_litigious_proportion,-3.6851,6.476,-0.569,0.569,-16.388,9.018
total_strong_m_proportion,8.3883,27.591,0.304,0.761,-45.736,62.512
total_weak_m_proportion,4.5959,18.234,0.252,0.801,-31.173,40.365
total_constraining_proportion,32.7735,17.246,1.900,0.058,-1.057,66.604
pre_filing_alpha_2,-0.5403,0.662,-0.816,0.415,-1.839,0.758

0,1,2,3
Omnibus:,1524.768,Durbin-Watson:,1.896
Prob(Omnibus):,0.0,Jarque-Bera (JB):,268610.643
Skew:,4.893,Prob(JB):,0.0
Kurtosis:,69.809,Cond. No.,1790.0


In [371]:
# abnormal_volume on summed BoW weights + controls, full sample.
vol_design_bow_summed = final_frame_total[bow_summed_vars + control_vars]
vol_target_bow_summed = final_frame_total[y_vol]
regression_bow_summed_total_vol = regression(vol_design_bow_summed, vol_target_bow_summed)
regression_bow_summed_total_vol

0,1,2,3
Dep. Variable:,abnormal_volume,R-squared:,0.214
Model:,OLS,Adj. R-squared:,0.207
Method:,Least Squares,F-statistic:,31.72
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,4.88e-65
Time:,22:03:28,Log-Likelihood:,-1809.6
No. Observations:,1414,AIC:,3645.0
Df Residuals:,1401,BIC:,3714.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.7357,0.192,-3.833,0.000,-1.112,-0.359
summed_negative_weights,-0.0456,0.031,-1.464,0.144,-0.107,0.016
summed_positive_weights,-0.0021,0.006,-0.340,0.734,-0.014,0.010
summed_uncertain_weights,-0.0258,0.025,-1.018,0.309,-0.076,0.024
summed_litigious_weights,0.0024,0.005,0.454,0.650,-0.008,0.013
summed_strong_m_weights,-0.1683,0.131,-1.280,0.201,-0.426,0.090
summed_weak_m_weights,0.2576,0.126,2.043,0.041,0.010,0.505
summed_constraining_weights,0.2683,0.187,1.434,0.152,-0.099,0.635
pre_filing_alpha_2,-0.5204,0.662,-0.786,0.432,-1.819,0.778

0,1,2,3
Omnibus:,1525.502,Durbin-Watson:,1.897
Prob(Omnibus):,0.0,Jarque-Bera (JB):,267387.051
Skew:,4.899,Prob(JB):,0.0
Kurtosis:,69.651,Cond. No.,5630.0


In [370]:
# abnormal_volume on FinBERT scores + controls, full sample.
vol_design_finbert = final_frame_total[finbert_vars + control_vars]
vol_target_finbert = final_frame_total[y_vol]
regression_finbert_total_vol = regression(vol_design_finbert, vol_target_finbert)
regression_finbert_total_vol

0,1,2,3
Dep. Variable:,abnormal_volume,R-squared:,0.211
Model:,OLS,Adj. R-squared:,0.207
Method:,Least Squares,F-statistic:,53.67
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,4.1099999999999995e-68
Time:,22:03:19,Log-Likelihood:,-1812.2
No. Observations:,1414,AIC:,3640.0
Df Residuals:,1406,BIC:,3682.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6111,0.100,-6.125,0.000,-0.807,-0.415
finbert,-0.0183,0.152,-0.121,0.904,-0.316,0.279
finbert_pos,-0.6260,0.221,-2.834,0.005,-1.059,-0.193
finbert_neg,-0.6077,0.216,-2.808,0.005,-1.032,-0.183
pre_filing_alpha_2,-0.5024,0.660,-0.761,0.447,-1.797,0.792
pre_filing_alpha_6,-0.0257,0.571,-0.045,0.964,-1.147,1.095
pre_filing_alpha_20,-0.3130,0.211,-1.482,0.139,-0.727,0.101
volume_momentum,0.9939,0.052,18.992,0.000,0.891,1.097
financial_indicator,0.1108,0.080,1.387,0.166,-0.046,0.267

0,1,2,3
Omnibus:,1515.701,Durbin-Watson:,1.882
Prob(Omnibus):,0.0,Jarque-Bera (JB):,262334.501
Skew:,4.846,Prob(JB):,0.0
Kurtosis:,69.02,Cond. No.,4.04e+16
