In [224]:
from pathlib import Path
from collections import Counter
from datetime import date

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels as sm
from statsmodels.regression.linear_model import OLS
from datetime import datetime
import calendar
import math

In [225]:
DATA_PATH = Path.cwd().parent.joinpath('factoring_consumer_staples-master','data', 'processed', 'three_factor_model.txt')

In [226]:
df = pd.read_csv(DATA_PATH, delimiter = "\t")

In [227]:
df.head()

Unnamed: 0,datadate,tic,conm,prccq,chng,chng_cs,excess,Price_tb,book_value,bv_per_share,ptb,cshoc,prccd,prcod,mkvaltq
0,2000-03-31,ABF,AIRBORNE INC,24.0,,-0.014137,-0.072737,5.86,894267000.0,18.339048,1.308683,48763000.0,24.0,,1170312000.0
1,2000-06-30,ABF,AIRBORNE INC,18.9375,-0.210938,0.008527,-0.050073,5.86,884944000.0,18.054923,1.048883,49014000.0,18.9375,,928202600.0
2,2001-12-31,ABF,AIRBORNE INC,14.83,0.55288,0.00497,-0.01239,1.736,834216000.0,17.341926,0.855153,48104000.0,14.83,,713382300.0
3,2002-09-30,ABF,AIRBORNE INC,11.34,-0.409375,-0.07499,-0.0905,1.551,829688000.0,17.143377,0.66148,48397000.0,11.34,,548822000.0
4,2002-12-31,ABF,AIRBORNE INC,14.83,0.30776,0.002302,-0.009888,1.219,839163000.0,17.329843,0.855749,48423000.0,14.83,,718113100.0


In [228]:
df["Price_tb"] = df["Price_tb"]/100

In [229]:
df.head()

Unnamed: 0,datadate,tic,conm,prccq,chng,chng_cs,excess,Price_tb,book_value,bv_per_share,ptb,cshoc,prccd,prcod,mkvaltq
0,2000-03-31,ABF,AIRBORNE INC,24.0,,-0.014137,-0.072737,0.0586,894267000.0,18.339048,1.308683,48763000.0,24.0,,1170312000.0
1,2000-06-30,ABF,AIRBORNE INC,18.9375,-0.210938,0.008527,-0.050073,0.0586,884944000.0,18.054923,1.048883,49014000.0,18.9375,,928202600.0
2,2001-12-31,ABF,AIRBORNE INC,14.83,0.55288,0.00497,-0.01239,0.01736,834216000.0,17.341926,0.855153,48104000.0,14.83,,713382300.0
3,2002-09-30,ABF,AIRBORNE INC,11.34,-0.409375,-0.07499,-0.0905,0.01551,829688000.0,17.143377,0.66148,48397000.0,11.34,,548822000.0
4,2002-12-31,ABF,AIRBORNE INC,14.83,0.30776,0.002302,-0.009888,0.01219,839163000.0,17.329843,0.855749,48423000.0,14.83,,718113100.0


In [230]:
#excess is market excess
df.rename(columns = {"excess":"mkt_excess"}, inplace = True)

In [231]:
df.head()

Unnamed: 0,datadate,tic,conm,prccq,chng,chng_cs,mkt_excess,Price_tb,book_value,bv_per_share,ptb,cshoc,prccd,prcod,mkvaltq
0,2000-03-31,ABF,AIRBORNE INC,24.0,,-0.014137,-0.072737,0.0586,894267000.0,18.339048,1.308683,48763000.0,24.0,,1170312000.0
1,2000-06-30,ABF,AIRBORNE INC,18.9375,-0.210938,0.008527,-0.050073,0.0586,884944000.0,18.054923,1.048883,49014000.0,18.9375,,928202600.0
2,2001-12-31,ABF,AIRBORNE INC,14.83,0.55288,0.00497,-0.01239,0.01736,834216000.0,17.341926,0.855153,48104000.0,14.83,,713382300.0
3,2002-09-30,ABF,AIRBORNE INC,11.34,-0.409375,-0.07499,-0.0905,0.01551,829688000.0,17.143377,0.66148,48397000.0,11.34,,548822000.0
4,2002-12-31,ABF,AIRBORNE INC,14.83,0.30776,0.002302,-0.009888,0.01219,839163000.0,17.329843,0.855749,48423000.0,14.83,,718113100.0


In [232]:
#need to make column of chng - tbill --> stock excess. probs can be done in temp files for OLS

#need to create a separate object to tabulate small versus big stocks and each time period, 
#since stocks can grow and become big
#
# first step: get mkt value and returns of all stocks available at each quarter q 
# second step: get percentiles at each time in terms of q
# third step: take returns of bottom 10% less top 10% (regular average) to get factor at each time step q

# need similar process for HML, tbd

In [233]:
##making SMB
dat = {'datadate':[],'SMB value':[]}
SMB = pd.DataFrame(data = dat)
j = 0
for i in np.unique(df["datadate"]):
    temp = df[df.datadate == i][["chng","mkvaltq"]]
 
    ten = np.nanpercentile(temp["mkvaltq"],10)
    ninety = np.nanpercentile(temp["mkvaltq"],90)
    small = temp[temp["mkvaltq"]< ten]
    big = temp[temp["mkvaltq"]> ninety]
    small_rets = np.nanmean(small["chng"])
    big_rets = np.nanmean(big["chng"])
    factor =  small_rets - big_rets
    SMB.loc[j] = [i,factor]
    j+=1

In [None]:
SMB.head()
#first row is 0 because all stocks on q1 have a nan change percent lol

In [235]:
##making HML
dat = {'datadate':[],'HML value':[]}
HML = pd.DataFrame(data = dat)
j = 0
for i in np.unique(df["datadate"]):
    temp = df[df.datadate == i][["tic","chng","ptb"]]
 
    ten = np.nanpercentile(temp["ptb"],10)
    ninety = np.nanpercentile(temp["ptb"],90)
    value = temp[temp["ptb"]< ten]
    growth = temp[temp["ptb"]> ninety]
    val_rets = np.nanmean(value["chng"])
    growth_rets = np.nanmean(growth["chng"])
    factor =  val_rets - growth_rets
    HML.loc[j] = [i,factor]
    j+=1

In [None]:
HML.head()
##same thing with HML, first row isnt technically usable

In [237]:
df = df.merge(SMB, how = "left", on = "datadate")

In [238]:
df = df.merge(HML, how = "left", on = "datadate")

In [248]:
df.head()

Unnamed: 0,datadate,tic,conm,prccq,chng,chng_cs,mkt_excess,Price_tb,book_value,bv_per_share,ptb,cshoc,prccd,prcod,mkvaltq,SMB value,HML value
0,2000-03-31,ABF,AIRBORNE INC,24.0,,-0.014137,-0.072737,0.0586,894267000.0,18.339048,1.308683,48763000.0,24.0,,1170312000.0,0.0,0.0
1,2000-06-30,ABF,AIRBORNE INC,18.9375,-0.210938,0.008527,-0.050073,0.0586,884944000.0,18.054923,1.048883,49014000.0,18.9375,,928202600.0,-0.193951,0.0322
2,2001-12-31,ABF,AIRBORNE INC,14.83,0.55288,0.00497,-0.01239,0.01736,834216000.0,17.341926,0.855153,48104000.0,14.83,,713382300.0,0.161618,0.014985
3,2002-09-30,ABF,AIRBORNE INC,11.34,-0.409375,-0.07499,-0.0905,0.01551,829688000.0,17.143377,0.66148,48397000.0,11.34,,548822000.0,0.028483,-0.099567
4,2002-12-31,ABF,AIRBORNE INC,14.83,0.30776,0.002302,-0.009888,0.01219,839163000.0,17.329843,0.855749,48423000.0,14.83,,718113100.0,0.138784,-0.10085


In [253]:
factors = []
for tic in np.unique(df["tic"]):
    try:
        temp_df = df[df["tic"] == tic]
        temp_df = temp_df.reset_index(drop = True)
        temp_df = temp_df.dropna()
        Y = temp_df["chng"]-(temp_df["Price_tb"])
        X = temp_df[["mkt_excess","SMB value","HML value"]]
        model = OLS(Y,sm.tools.add_constant(X)).fit()
        factors.append((tic,model.params,model.pvalues))
    except:
        continue

  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid


In [254]:
factors

##plot expected value versus actual returns of a couple stocks, some with at least one (hoepfully more)
#statisitically signifcaint betas and those with none

[('ABC', const         0.008186
  mkt_excess    0.550345
  SMB value     0.015906
  HML value    -0.195249
  dtype: float64, const         0.709525
  mkt_excess    0.354908
  SMB value     0.708255
  HML value     0.110917
  dtype: float64), ('ABEV', const        -0.008382
  mkt_excess    0.942745
  SMB value     0.098566
  HML value     0.330901
  dtype: float64, const         0.822690
  mkt_excess    0.349754
  SMB value     0.166063
  HML value     0.102756
  dtype: float64), ('AD', const         0.416541
  mkt_excess    0.104675
  SMB value     0.063904
  HML value     4.258915
  dtype: float64, const         0.019133
  mkt_excess    0.941991
  SMB value     0.093260
  HML value     0.020163
  dtype: float64), ('ADM', const         0.023444
  mkt_excess    1.537686
  SMB value     0.016037
  HML value    -0.197350
  dtype: float64, const         0.370700
  mkt_excess    0.032345
  SMB value     0.748597
  HML value     0.169646
  dtype: float64), ('ADNT', const         0.195346
  m

In [242]:
sm.tools.add_constant(X)

Unnamed: 0,const,mkt_excess,SMB value,HML value
0,1.0,-0.025853,-0.018299,-0.024703
1,1.0,0.009498,0.024959,-0.173008
2,1.0,0.001625,-0.038558,-0.12422
3,1.0,0.018329,0.059078,-0.030667
4,1.0,-0.005766,-0.045982,-0.055276
5,1.0,0.00253,-0.018379,-0.088626
6,1.0,-0.014072,0.026653,-0.035172
7,1.0,-0.024668,1.5778,-0.027132
8,1.0,-0.021836,0.012877,-0.014714
9,1.0,0.0012,-0.051729,-0.233485
