In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../../')

In [3]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
import statsmodels.formula.api as smf

from config import STOCKTWITS_TICKER_LIST
from util.file_util import (
    StockTwitsFileReader, StockDataFileReader
)
from util.plot_util import plot_twit_series_for_ticker
from util.ts_util import resample_weekly
from util.ff_util import read_ff_factors_daily, get_ff_factors_with_freq

In [4]:
with open('./ticker_at_least_2_median.pkl', 'rb') as f:
    all_ticker_list = pickle.load(f)

In [5]:
twit_file_reader = StockTwitsFileReader()
stock_data_reader = StockDataFileReader()

In [6]:
start_date = '2017-01-01'
end_date = '2019-08-10'

In [7]:
def get_weekly_twits_for_ticker(ticker,
                                start_date,
                                end_date):
    twit_df = twit_file_reader.read_daily_sentiment_summary_prelim(ticker,
                                                                   start_date=start_date,
                                                                   end_date=end_date)
    weekly_twit_df = resample_weekly(twit_df)
    weekly_twit_df['B_m_B'] = weekly_twit_df['Bullish'] - weekly_twit_df['Bearish']
    weekly_twit_df['ticker'] = ticker
    
    return weekly_twit_df

def get_weekly_returns_for_ticker(ticker,
                                  start_date,
                                  end_date):
    price_ts = stock_data_reader.read_stockdata_in_range(ticker,
                                                         start_date,
                                                         end_date)
    weekly_price_ts = price_ts.resample('W-FRI').last()
    weekly_return_ts = weekly_price_ts.pct_change()
    weekly_return_ts['ticker'] = ticker
    return weekly_return_ts

In [8]:
def get_weekly_returns_for_ticker_list(ticker_list,
                                       start_date,
                                       end_date):
    return_dfs = []

    for ticker in ticker_list:
        return_dfs.append(get_weekly_returns_for_ticker(ticker,
                                                        start_date,
                                                        end_date))

    return_df = pd.concat(return_dfs)
    return_df = return_df.pivot_table(values='adjusted close', 
                                      columns=['ticker'], 
                                      index=return_df.index).sort_index()
    return return_df

In [9]:
def get_weekly_sentiment_and_return_merged_for_ticker(ticker,
                                                      start_date,
                                                      end_date,
                                                      sentiment_col='B_m_B'):
    return_df = get_weekly_returns_for_ticker(ticker, 
                                              start_date,
                                              end_date).dropna()
    twit_df = get_weekly_twits_for_ticker(ticker, 
                                          start_date,
                                          end_date)[[sentiment_col]].dropna()
    merged = twit_df.merge(return_df, left_index=True, right_index=True)
    return merged

In [10]:
stacks = []

for t in tqdm(all_ticker_list):
    weekly_info_for_ticker = get_weekly_sentiment_and_return_merged_for_ticker(
        t, start_date, end_date)
    weekly_info_for_ticker['shifted_return'] =\
        weekly_info_for_ticker['adjusted close'].shift(1)
    stacks.append(weekly_info_for_ticker)
    
stacked_df = pd.concat(stacks)

100%|██████████| 201/201 [00:12<00:00, 16.05it/s]


In [11]:
lm = smf.ols(
        "B_m_B ~ Q('adjusted close') + C(ticker) - 1", data=stacked_df).fit()

In [12]:
lm.summary()

0,1,2,3
Dep. Variable:,B_m_B,R-squared:,0.598
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,196.4
Date:,"Tue, 05 Nov 2019",Prob (F-statistic):,0.0
Time:,06:35:19,Log-Likelihood:,-178760.0
No. Observations:,26766,AIC:,357900.0
Df Residuals:,26564,BIC:,359600.0
Df Model:,201,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
C(ticker)[AAL],17.7344,16.626,1.067,0.286,-14.853,50.322
C(ticker)[AAPL],851.1709,16.626,51.194,0.000,818.582,883.759
C(ticker)[ABBV],39.9767,16.626,2.404,0.016,7.389,72.564
C(ticker)[ABMD],5.7348,16.688,0.344,0.731,-26.975,38.445
C(ticker)[ABT],3.9675,16.627,0.239,0.811,-28.622,36.557
C(ticker)[ACN],2.2257,16.626,0.134,0.894,-30.363,34.814
C(ticker)[ADBE],27.3263,16.627,1.643,0.100,-5.264,59.917
C(ticker)[ADSK],2.6591,16.627,0.160,0.873,-29.930,35.248
C(ticker)[AGN],20.9107,16.626,1.258,0.209,-11.677,53.498

0,1,2,3
Omnibus:,47608.278,Durbin-Watson:,1.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,211053560.584
Skew:,12.287,Prob(JB):,0.0
Kurtosis:,437.326,Cond. No.,2.6


In [13]:
stacked_df['residual_sentiment'] = lm.resid.values

In [34]:
stacked_df['residual_sentiment'].corr(stacked_df['shifted_return'])

0.025574314013163485

In [14]:
weekly_signal_df = stacked_df.pivot_table(values='residual_sentiment',
                                          columns=['ticker'],
                                          index=stacked_df.index)

In [15]:
weekly_signal_df = weekly_signal_df.fillna(0)

In [16]:
quantile_lables = ['q1', 'q2', 'q3', 'q4', 'q5']

def label_rows_by_quantile(row):
    result = pd.qcut(row, 5, labels=quantile_lables,
                     duplicates='drop')
    return result

def get_portfolio_signal_for_quantile(weekly_signal_df, quantile):
    
    q_label_df = weekly_signal_df.apply(
        label_rows_by_quantile, axis=1).sort_index()
    
    portfolio_ts = (q_label_df == quantile).astype(int)
    portfolio_ts_one_time_ahead = portfolio_ts.shift(1)
    return portfolio_ts_one_time_ahead

def calc_portfolio_returns(portfolio_holding_df,
                           return_df):
    assert np.all(return_df.columns == portfolio_holding_df.columns)
    n_holding_ts = portfolio_holding_df.sum(axis=1)
    
    portfolio_returns = return_df * portfolio_holding_df
    portfolio_returns = (portfolio_returns.sum(axis=1) 
                         / n_holding_ts).iloc[1:]
    return portfolio_returns

In [17]:
portfolio_holding_df_q1 = get_portfolio_signal_for_quantile(weekly_signal_df,
                                                            'q1')
portfolio_holding_df_q5 = get_portfolio_signal_for_quantile(weekly_signal_df,
                                                            'q5')

In [18]:
return_df = get_weekly_returns_for_ticker_list(all_ticker_list,
                                               start_date,
                                               end_date)

In [19]:
portfolio_return_df_q1 = calc_portfolio_returns(portfolio_holding_df_q1,
                                                return_df)
portfolio_return_df_q5 = calc_portfolio_returns(portfolio_holding_df_q5,
                                                return_df)

In [20]:
# strategy - long q1, short q5
portfolio_return_df_strategy = portfolio_return_df_q1 - portfolio_return_df_q5

### Read Fama French Data

In [21]:
ff_df_wkly = get_ff_factors_with_freq()

In [22]:
ff_df_wkly.head()

Unnamed: 0_level_0,Mkt-RF,SMB,HML,Mom
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1926-11-05,0.008617,-0.004793,0.00639,0.011976
1926-11-12,0.011203,0.00079,-0.005505,-0.008418
1926-11-19,-0.011878,0.004678,0.002586,0.011541
1926-11-26,0.015893,-0.000504,0.003499,-0.005317
1926-12-03,0.009307,-0.0015,-0.008983,-0.00222


### Compare Q5 vs. Q1 portfolio
- Assume equal (dollar) weighting for now (which means we avg the returns)

In [23]:
def calc_portfolio_performance(portfolio_return_weekly_df,
                               ff_df_weekly):
    avg_annualized_return = (1 + portfolio_return_weekly_df.mean()) ** 52
    
    all_df = portfolio_return_weekly_df.to_frame(
        'portfolio_return').merge(ff_df_wkly,
                                  left_index=True, 
                                  right_index=True)
    ff_lm = smf.ols(
        "portfolio_return ~ Q('Mkt-RF') + SMB + HML + Mom", data=all_df).fit()
    avg_annualized_alpha = (1 + ff_lm.params['Intercept']) ** 52 - 1
    
    return {
        'lm': ff_lm,
        'annualized_return': avg_annualized_return,
        'annualized_alpha': avg_annualized_alpha,
    }

In [24]:
performance_q1 = calc_portfolio_performance(portfolio_return_df_q1,
                                            ff_df_wkly)
performance_q5 = calc_portfolio_performance(portfolio_return_df_q5,
                                            ff_df_wkly)
performance_strategy = calc_portfolio_performance(portfolio_return_df_strategy,
                                                  ff_df_wkly)

In [25]:
performance_strategy['annualized_alpha']

0.11378996656204787

In [26]:
performance_q5['annualized_return']

1.098346073567932

In [27]:
performance_strategy['lm'].summary()

0,1,2,3
Dep. Variable:,portfolio_return,R-squared:,0.025
Model:,OLS,Adj. R-squared:,-0.005
Method:,Least Squares,F-statistic:,0.825
Date:,"Tue, 05 Nov 2019",Prob (F-statistic):,0.512
Time:,06:35:34,Log-Likelihood:,410.53
No. Observations:,134,AIC:,-811.1
Df Residuals:,129,BIC:,-796.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0021,0.001,2.006,0.047,2.86e-05,0.004
Q('Mkt-RF'),0.0205,0.053,0.387,0.700,-0.084,0.125
SMB,0.0147,0.095,0.155,0.877,-0.174,0.203
HML,-0.1405,0.097,-1.450,0.150,-0.332,0.051
Mom,-0.1092,0.084,-1.302,0.195,-0.275,0.057

0,1,2,3
Omnibus:,10.566,Durbin-Watson:,1.944
Prob(Omnibus):,0.005,Jarque-Bera (JB):,11.12
Skew:,-0.581,Prob(JB):,0.00385
Kurtosis:,3.802,Cond. No.,112.0


In [None]:
1.0021 ** 52 - 1