In [1]:
# matplotlib inline plotting
%matplotlib inline
# make inline plotting higher resolution
%config InlineBackend.figure_format ='svg'

In [2]:
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from helpers.portfolio_sorts import estimate_portfolio_sorts, last_of_month
from helpers.expand_to_daily import expand_to_daily
from helpers.hml import high_minus_low, construct_portfolio
from helpers.sql import connect_to_db, read_db, vacuum_db, update_database

plt.style.use('ggplot')

In [3]:
engine = connect_to_db()

# setting newey west lags to 3. This is using the rule-of-thumb (T^(1/4)=60^(1/4)=2.78 -> 3)
#NW_LAGS: int = 3
NW_LAGS: int = 7  # this is **really** just testing something

# define looking-back period (number of months)
LOOK_BACK: int = 3

# number of obs in a year
YEARLY_BUSINESS_DAYS: int = 250

# define important dates
START_DATE_MISSING = datetime(year=2010, month=1, day=1)

# Actual data-ranges
START_DATE = datetime(year=2010, month=1, day=4)
END_DATE = datetime(year=2023, month=1, day=1)

# Models
AP_MODELS = {
    "CAPM": ["mkt-rf"],
    "FF3": ["mkt-rf", "smb", "hml"],
    "FF3_C": ["mkt-rf", "smb", "hml", "mom"],
    "FF5": ["mkt-rf", "smb", "hml", "rmw", "cma"],
}

# Select sentiment measures 
SENTMENT_MEASURES = [
    "aggregate_transformed_residuals",
    "politics_transformed_residuals",
    "importance_of_human_intervantion_transformed_residuals",
    "weather_extremes_transformed_residuals",
]

# Splits
SPLITS = [0.1, 0.2]


In [4]:
# fetch series of business days (will be used to clean E portfolio and factors)
business_days = read_db(engine=engine, statement='select "index", date from returns')

business_days = business_days.set_index("date", drop=True).index


In [5]:
# fetch data
sentiment = read_db(
    # statement="select * from climate_sum_ar1", engine=engine, idx_col="date"
    statement="select * from climate_mean_n_ar1", engine=engine, idx_col="date"
)

market_cap = read_db(
    engine=engine, statement="select * from market_cap", idx_col="date"
)

returns = read_db(engine=engine, statement="select * from returns", idx_col="date")

snp500 = read_db(
    engine=engine, statement="select * from snp", idx_col="date"
).sort_index()

factors = read_db(engine=engine, statement="select * from factors", idx_col="date")

riskfree = read_db(engine=engine, statement="select * from riskfree", idx_col="date")


In [6]:
sentiment.tail()

Unnamed: 0,aggregate_average,aggregate_sum,aggregate_count,aggregate_count_transformed,aggregate_transformed,aggregate_transformed_predictions,aggregate_transformed_observed,aggregate_transformed_residuals,aggregate_transformed_dw_statistic,aggregate_transformed_f_statistic,...,undefined_one_word_hashtags_count,undefined_one_word_hashtags_count_transformed,undefined_one_word_hashtags_transformed,undefined_one_word_hashtags_transformed_predictions,undefined_one_word_hashtags_transformed_observed,undefined_one_word_hashtags_transformed_residuals,undefined_one_word_hashtags_transformed_dw_statistic,undefined_one_word_hashtags_transformed_f_statistic,undefined_one_word_hashtags_transformed_f_pval,undefined_one_word_hashtags_transformed_lags
2019-09-27,-0.067066,-85.442356,1274,7.150701,-0.47957,-0.331355,-0.47957,-0.148216,2.007809,0.417581,...,26.0,3.295837,1.118689,1.070639,1.118689,0.04805,2.004494,0.748919,0.523072,3.0
2019-09-28,0.01961,21.845393,1114,7.01661,0.137595,-0.366468,0.137595,0.504063,2.005986,0.407484,...,14.0,2.70805,0.94707,1.110104,0.94707,-0.163034,2.007872,0.707352,0.547705,3.0
2019-09-29,0.059978,46.063407,768,6.645091,0.398562,0.043965,0.398562,0.354596,2.008096,0.411058,...,20.0,3.044522,0.285845,1.010453,0.285845,-0.724608,1.997953,0.757939,0.517837,3.0
2019-09-30,-0.167222,-174.078142,1041,6.948897,-1.162009,0.176293,-1.162009,-1.338301,2.003567,0.438758,...,13.0,2.639057,0.384506,0.513717,0.384506,-0.129211,2.009564,0.779152,0.505682,3.0
2019-10-01,-0.401588,-112.043045,279,5.63479,-2.262864,-0.872028,-2.262864,-1.390836,2.003273,0.442052,...,5.0,1.791759,0.422367,0.486721,0.422367,-0.064354,2.009494,0.794605,0.496966,3.0


## Calculate excess returns r_{i}-r_{rf}

In [7]:
# calculate excess returns
returns = pd.merge(left=returns, right=riskfree, how='left', right_index=True, left_index=True)

returns.tail()

Unnamed: 0,POOL.OQ,CHRW.OQ,AJG.N,CNP.N,AMCR.N,WM.N,BA.N,FOX.OQ,LIN.N,WY.N,...,CIC.N^E95,ESY.N^E95,SK.N^F00,MXS.N^F95,LK.N^C95,ML.N^C95,NEC.N^F97,PT.N^E95,E.N^E95,rf
2023-01-26,0.002503,0.006609,0.000507,-0.001332,0.004244,-0.003984,0.000235,0.012448,0.009652,0.027481,...,,,,,,,,,,0.000122
2023-01-27,0.041108,0.005334,-0.01744,-0.008667,0.010144,0.000721,-0.007333,-0.003153,-0.014597,0.019911,...,,,,,,,,,,0.000122
2023-01-30,-0.00767,-0.014184,0.007688,0.003362,-0.014226,0.003211,-0.007103,-0.006641,0.001016,-0.015152,...,,,,,,,,,,0.000123
2023-01-31,0.024251,0.036849,0.002151,0.009383,0.023769,0.010647,0.015882,0.009233,0.018183,0.018639,...,,,,,,,,,,0.000123
2023-02-01,0.032494,0.019467,0.002861,-0.000664,0.000829,-0.002068,0.008216,0.009779,0.000423,0.016846,...,,,,,,,,,,0.000122


In [8]:
for col in returns.columns:
    if col == "rf":
        continue

    returns[col] = returns[col] - returns["rf"]

returns = returns.drop(columns=["rf"])

returns.tail()


Unnamed: 0,POOL.OQ,CHRW.OQ,AJG.N,CNP.N,AMCR.N,WM.N,BA.N,FOX.OQ,LIN.N,WY.N,...,CKL.N^E95,CIC.N^E95,ESY.N^E95,SK.N^F00,MXS.N^F95,LK.N^C95,ML.N^C95,NEC.N^F97,PT.N^E95,E.N^E95
2023-01-26,0.002382,0.006487,0.000385,-0.001453,0.004123,-0.004106,0.000113,0.012326,0.00953,0.027359,...,,,,,,,,,,
2023-01-27,0.040985,0.005212,-0.017562,-0.008789,0.010021,0.000599,-0.007456,-0.003275,-0.01472,0.019788,...,,,,,,,,,,
2023-01-30,-0.007793,-0.014307,0.007565,0.00324,-0.014349,0.003088,-0.007226,-0.006764,0.000893,-0.015274,...,,,,,,,,,,
2023-01-31,0.024128,0.036726,0.002028,0.009261,0.023646,0.010524,0.015759,0.00911,0.01806,0.018516,...,,,,,,,,,,
2023-02-01,0.032372,0.019345,0.00274,-0.000786,0.000708,-0.00219,0.008094,0.009658,0.000301,0.016724,...,,,,,,,,,,


In [9]:
# Clean-up S&P data. Here we remove all S&P observations that we
# do NOT have observations for
snp500 = snp500[snp500["ric"].isin(returns.columns)]


In [10]:
# limiting factor space $X_t$
factors = factors.loc[START_DATE:]

factors

Unnamed: 0,mkt-rf,smb,hml,rmw,cma,mom,wti,ng,prop,epu,vix,ted,gb,e-hml
2010-01-04,0.0169,0.0079,0.0113,-0.0017,0.0021,0.0059,0.026830,0.046392,0.043313,-0.306983,-0.075646,0.000005,,0.005857
2010-01-05,0.0031,-0.0041,0.0124,-0.0019,0.0019,0.0064,0.002699,0.016420,0.011653,0.005387,-0.034431,0.000005,,0.003043
2010-01-06,0.0013,-0.0013,0.0057,-0.0005,0.0020,-0.0004,0.016883,0.045234,0.040317,-0.365242,-0.009819,0.000005,,-0.000488
2010-01-07,0.0040,0.0025,0.0098,-0.0069,0.0022,-0.0085,-0.006256,0.160742,-0.007612,-0.039083,-0.005219,0.000005,,0.002359
2010-01-08,0.0033,0.0032,0.0001,0.0022,-0.0037,0.0020,0.001695,-0.126498,-0.023710,0.539262,-0.048793,0.000005,,-0.002140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-28,,,,,,,,,,0.030504,,,,
2023-01-29,,,,,,,,,,0.970399,,,,
2023-01-30,,,,,,,-0.022075,-0.003534,-0.021622,-0.151020,0.077256,,-0.001759,
2023-01-31,,,,,,,0.012569,-0.060284,-0.016575,-0.172902,-0.027081,,0.003259,


In [11]:
# expand to daily for market cap. Fill between but don't forward fill
market_cap = expand_to_daily(
    market_cap,
    start_date=START_DATE,
    end_date=END_DATE,
    start_date_missing=START_DATE_MISSING,
    last_valid=True,
)

# expand to daily for stock-returns
returns = expand_to_daily(
    returns,
    start_date=START_DATE,
    end_date=END_DATE,
    start_date_missing=START_DATE_MISSING,
    ffill=True,
    cut_sample=True,
    last_valid=True,
    ffill_limit=None,
)
riskfree = expand_to_daily(
    riskfree,
    start_date=START_DATE,
    end_date=END_DATE,
    start_date_missing=START_DATE_MISSING,
    ffill=True,
    cut_sample=True,
    last_valid=True,
    ffill_limit=None,
)

# drop columns that are all NaN
market_cap = market_cap.dropna(how="all", axis=1)
returns = returns.dropna(how="all", axis=1)

# Portfolio sorts

This notebook follows the procedure from Matin et al. (2021). The text below is the procedure as described in the article.

For every asset (S&P 500) constituents $i$ we estimate

$$
\begin{align*}
r_{i, t}-r_{f, t}=\alpha_i+\beta_i F_t+\gamma_i X_t+\epsilon_t,
\end{align*}
$$
where $r_{i, t}$ is the daily return on security $i, r_{f, t}$ is the risk-free return, $F_t$ is the textual sentiment (for each topic) factor, $X_t$ is a vector that includes standard controls that have been found to explain the cross section of U.S. stock returns and $\epsilon_t$ is an i.i.d. error term with zero mean. At the end of every month, we estimate equation (1) recursively, using a rolling window consisting of daily observations over the previous three months. We roll forward the starting date of the window by one month at each iteration. At the end of any given month, given the estimated betas across stocks, we rank stocks according to their estimated betas and group them in portfolios; we form decile and quintile portfolios, separately. Then, for each portfolio, we compute the portfolio's post-ranking value-weighted monthly returns. Next, we compute the long-short spread portfolio's monthly return. We repeat the process until we exhaust our sample.

In [12]:
estimated_portfolio_sorts = {}

for model in AP_MODELS.keys():
    for split in SPLITS:
        for sent_measure in SENTMENT_MEASURES:

            df = estimate_portfolio_sorts(
                fac=factors,
                sent=sentiment,
                snp=snp500,
                ret=returns,
                nw_lags=NW_LAGS,
                look_back=LOOK_BACK,
                fac_cols=AP_MODELS[model],
                sent_col=sent_measure,
                prettify=True,
            )

            # from Panel to Time-Series
            pf = pd.DataFrame(
                columns=snp500["ric"].unique(),
                index=pd.date_range(
                    start=df["date"].min(), end=last_of_month(df["date"].max())
                ),
            ).transpose()

            # Join each individual t-statistic onto an empty HML data-frame
            for date, panel in df.groupby("date"):

                # display(panel)

                panel = panel[["ticker", "sentiment_t"]].set_index("ticker")
                panel.index.name = None

                pf[date] = pf[date].fillna(panel.squeeze()).dropna()

            pf = pf.transpose()

            # expand time-series to daily observations (i.e. forward fill, etc.)
            pf = expand_to_daily(
                pf,
                start_date=pf.index.min(),
                end_date=pf.index.max(),
                start_date_missing=pf.index.min(),
                last_valid=True,
            )

            # Construct HML (high-minus-low) groups
            hml = high_minus_low(
                sorts=pf, snp=snp500, slice_method="tiles", q1=split, q2=1 - split
            )

            # Construct HML returns
            hml_results = construct_portfolio(
                returns, riskfree, market_cap, hml, weights="cap", calc_excess=False
            )

            estimated_portfolio_sorts[f"{split}:{model}:{sent_measure}"] = hml_results
            #break

        #break

    #break


100%|██████████| 158/158 [00:46<00:00,  3.39it/s]
100%|██████████| 3502/3502 [00:26<00:00, 132.74it/s]
100%|██████████| 3502/3502 [00:26<00:00, 134.31it/s]
100%|██████████| 158/158 [00:47<00:00,  3.35it/s]
100%|██████████| 3502/3502 [00:24<00:00, 144.23it/s]
100%|██████████| 3502/3502 [00:26<00:00, 130.26it/s]
100%|██████████| 158/158 [00:47<00:00,  3.35it/s]
100%|██████████| 3502/3502 [00:23<00:00, 150.24it/s]
100%|██████████| 3502/3502 [00:26<00:00, 133.79it/s]
100%|██████████| 158/158 [00:46<00:00,  3.37it/s]
100%|██████████| 3502/3502 [00:22<00:00, 153.14it/s]
100%|██████████| 3502/3502 [00:26<00:00, 130.92it/s]
100%|██████████| 158/158 [00:47<00:00,  3.33it/s]
100%|██████████| 3502/3502 [00:23<00:00, 150.19it/s]
100%|██████████| 3502/3502 [00:41<00:00, 83.68it/s] 
100%|██████████| 158/158 [00:46<00:00,  3.37it/s]
100%|██████████| 3502/3502 [00:24<00:00, 145.24it/s]
100%|██████████| 3502/3502 [00:40<00:00, 86.27it/s] 
100%|██████████| 158/158 [00:47<00:00,  3.33it/s]
100%|█████████

In [13]:
# Dump portfolio sort results to disk

import pickle

with open('pf_sorts_dump.bin', 'wb') as file:
    # dump information to that file
    pickle.dump(estimated_portfolio_sorts, file)

    file.close()
