In [1]:
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import datetime

import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

### (change data directory below if you need to)

In [2]:
dir_sec = 'data/sec_13f_new.csv'
dir_price = 'data/prices_new.csv'

# Funds data processing

In [3]:
df_sec = pd.read_csv(dir_sec)
print(df_sec.shape)

(30643987, 11)


In [4]:
# df_sec[(df_sec['iFILING_DATE'] == '2013-08-01') & (df_sec['iCIK'] == 1349353)]
# iLong_Fraction sums up to one for each asset manager at each filing date
df_sec

Unnamed: 0,iRECORD_ID,iCIK,iCUSIP,iPERIOD_END,iFILING_DATE,iAMEND,iRESTATEMENT,iTYPE,iQTY,iMARKET_VALUE,iLONG_FRACTION
0,38587073,1349353,002824100,2013-06-30,2013-08-01,0,0,0,96510.0,3366000.0,0.023443
1,38587074,1349353,025816109,2013-06-30,2013-08-01,0,0,0,70214.0,5249000.0,0.036557
2,38587075,1349353,026874784,2013-06-30,2013-08-01,0,0,0,346115.0,15471000.0,0.107748
3,38587076,1349353,060505104,2013-06-30,2013-08-01,0,0,0,882340.0,11347000.0,0.079026
4,38587077,1349353,064058100,2013-06-30,2013-08-01,0,0,0,129065.0,3620000.0,0.025212
...,...,...,...,...,...,...,...,...,...,...,...
30643982,74561590,1730479,039483102,2020-03-31,2020-04-23,0,0,0,300.0,11000.0,0.000086
30643983,74561591,1730479,88160R101,2020-03-31,2020-04-23,0,0,0,25.0,13000.0,0.000102
30643984,74561592,1730479,921937793,2020-03-31,2020-04-23,0,0,0,230.0,25000.0,0.000196
30643985,74561593,1730479,863667101,2020-03-31,2020-04-23,0,0,0,1030.0,171000.0,0.001341


In [5]:
df_funds = df_sec.copy()
df_funds['iFILING_DATE']= pd.to_datetime(df_funds['iFILING_DATE']).dt.date
df_funds['iPERIOD_END']= pd.to_datetime(df_funds['iPERIOD_END']).dt.date

In [6]:
# Data processing filters
print('---Filtering records that are duplicates---')
df_funds = df_funds.drop_duplicates()
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

print('---Filtering records wherein QTY is zero---')
df_funds = df_funds[df_funds['iQTY'] != 0]
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

print('---Filtering records wherein MARKET_VALUE is zero---')
df_funds = df_funds[df_funds['iMARKET_VALUE'] != 0]
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Filtering records that are duplicates---
Remaining funds:  7221
---Filtering records wherein QTY is zero---
Remaining funds:  7166
---Filtering records wherein MARKET_VALUE is zero---
Remaining funds:  7166


In [7]:
# Exhibit 3 filters: Criterion 1
print('---Fitering funds with less than 20 holdings or more than 200 holdings---')
df_funds = df_funds.groupby(["iCIK", "iPERIOD_END"]).filter(lambda x: ((20 <= len(x))&(len(x) <= 200)))
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Fitering funds with less than 20 holdings or more than 200 holdings---
Remaining funds:  4959


In [8]:
# Exhibit 3 filters: Criterion 2
print('---Filtering funds complete filings not within 46 days---')
df_funds['FILING_INTERVAL'] = df_funds['iFILING_DATE'] - df_funds['iPERIOD_END']
df_funds = df_funds.groupby(["iCIK", "iPERIOD_END"]).filter(lambda x:(x.FILING_INTERVAL <= datetime.timedelta(days = 45)).all())
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Filtering funds complete filings not within 46 days---
Remaining funds:  4817


In [9]:
# Exhibit 3 filters: Criterion 3
print('---Fitering funds with less than $100M or more than $500M---')
df_funds = df_funds.groupby(["iCIK", "iPERIOD_END"]).filter(lambda x:((1e8 <= sum(x.iMARKET_VALUE))&(sum(x.iMARKET_VALUE) <= 5e8)))
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Fitering funds with less than $100M or more than $500M---
Remaining funds:  3500


In [10]:
# Exhibit 3 filters: Criterion 4
print('---Fitering funds with more than one filing point in each quarter')
df_funds = df_funds.groupby(["iCIK", "iPERIOD_END"]).filter(lambda x:((x.iAMEND == 0)&(x.iRESTATEMENT == 0)&(x.iTYPE == 0)).all())
df_funds = df_funds.drop(['iAMEND','iRESTATEMENT','iTYPE'], axis=1)
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Fitering funds with more than one filing point in each quarter
Remaining funds:  3440


In [11]:
# Data processing, select survival funds
print('---Fitering funds do not meet criteria more than one year')
df_funds = df_funds.groupby("iCIK").filter(lambda x:(x.iPERIOD_END.nunique() >= 4))
print('Remaining funds: ', len(df_funds['iCIK'].unique()))

---Fitering funds do not meet criteria more than one year
Remaining funds:  2292


In [12]:
# Backtest time interval
PERIOD_TRAIN = datetime.date(2018,6,30)
print('---Selecting training period data')
df_funds_train = df_funds[df_funds.iPERIOD_END <= PERIOD_TRAIN]
print('Remaining funds: ', len(df_funds['iCIK'].unique()))
df_funds_train

---Selecting training period data
Remaining funds:  2292


Unnamed: 0,iRECORD_ID,iCIK,iCUSIP,iPERIOD_END,iFILING_DATE,iQTY,iMARKET_VALUE,iLONG_FRACTION,FILING_INTERVAL
0,38587073,1349353,002824100,2013-06-30,2013-08-01,96510.0,3366000.0,0.023443,32 days
1,38587074,1349353,025816109,2013-06-30,2013-08-01,70214.0,5249000.0,0.036557,32 days
2,38587075,1349353,026874784,2013-06-30,2013-08-01,346115.0,15471000.0,0.107748,32 days
3,38587076,1349353,060505104,2013-06-30,2013-08-01,882340.0,11347000.0,0.079026,32 days
4,38587077,1349353,064058100,2013-06-30,2013-08-01,129065.0,3620000.0,0.025212,32 days
...,...,...,...,...,...,...,...,...,...
23185133,67098517,1641042,731094108,2018-06-30,2018-08-13,29000.0,683000.0,0.003574,44 days
23185134,67098518,1641042,85570W100,2018-06-30,2018-08-13,150000.0,5445000.0,0.028494,44 days
23185135,67098519,1641042,056752108,2018-06-30,2018-08-13,48222.0,11720000.0,0.061331,44 days
23185136,67098520,1641042,427746102,2018-06-30,2018-08-13,334331.0,12989000.0,0.067971,44 days


In [13]:
df_funds_train.iPERIOD_END.unique()

array([datetime.date(2013, 6, 30), datetime.date(2014, 12, 31),
       datetime.date(2014, 6, 30), datetime.date(2013, 12, 31),
       datetime.date(2013, 9, 30), datetime.date(2014, 3, 31),
       datetime.date(2014, 9, 30), datetime.date(2015, 3, 31),
       datetime.date(2015, 6, 30), datetime.date(2015, 9, 30),
       datetime.date(2015, 12, 31), datetime.date(2016, 3, 31),
       datetime.date(2016, 6, 30), datetime.date(2016, 9, 30),
       datetime.date(2016, 12, 31), datetime.date(2017, 3, 31),
       datetime.date(2017, 6, 30), datetime.date(2017, 9, 30),
       datetime.date(2017, 12, 31), datetime.date(2018, 3, 31),
       datetime.date(2018, 6, 30)], dtype=object)

In [14]:
df_funds

Unnamed: 0,iRECORD_ID,iCIK,iCUSIP,iPERIOD_END,iFILING_DATE,iQTY,iMARKET_VALUE,iLONG_FRACTION,FILING_INTERVAL
0,38587073,1349353,002824100,2013-06-30,2013-08-01,96510.0,3366000.0,0.023443,32 days
1,38587074,1349353,025816109,2013-06-30,2013-08-01,70214.0,5249000.0,0.036557,32 days
2,38587075,1349353,026874784,2013-06-30,2013-08-01,346115.0,15471000.0,0.107748,32 days
3,38587076,1349353,060505104,2013-06-30,2013-08-01,882340.0,11347000.0,0.079026,32 days
4,38587077,1349353,064058100,2013-06-30,2013-08-01,129065.0,3620000.0,0.025212,32 days
...,...,...,...,...,...,...,...,...,...
30643159,74560767,1670053,478160104,2020-03-31,2020-04-23,30167.0,3956000.0,0.034295,23 days
30643160,74560768,1670053,922908769,2020-03-31,2020-04-23,2287.0,295000.0,0.002557,23 days
30643161,74560769,1670053,941848103,2020-03-31,2020-04-23,12301.0,2239000.0,0.019410,23 days
30643162,74560770,1670053,502431109,2020-03-31,2020-04-23,5373.0,968000.0,0.008392,23 days


# Feature X19: Idiosyncratic Risk

In [15]:
df_price = pd.read_csv(dir_price)
print(df_price.shape)

(89139787, 9)


In [16]:
last_day_of_month = []
for year in range(2011,2019):
    for month in range(1,13):
        last_day_of_month.append(datetime.date(year, month, 1) - datetime.timedelta(days=1))
last_day_of_month = pd.Series(last_day_of_month).apply(str)

df_price = df_price[df_price.pSP_DATE.isin(last_day_of_month)]
df_price.sort_values(by=["pSP_DATE"], inplace = True, ascending = True)
df_price = df_price[df_price.pSP_CUSIP.isin(df_funds.iCUSIP.unique())]
df_price['X19_IDIO'] = np.nan
df_price['MONTH'] = pd.to_datetime(df_price.pSP_DATE.copy()).dt.to_period('M')
df_price['DAILY_RETURN'] = (df_price.pSP_CLOSE - df_price.pSP_OPEN)/df_price.pSP_OPEN

In [17]:
df_fama = pdr.get_data_famafrench("F-F_Research_Data_Factors", start="1-1-2011")[0].rename(columns={'Mkt-RF':'Mkt_RF'})
df_fama['RF'] = df_fama['RF']/100

In [18]:
stocks = df_price.pSP_CUSIP.unique()
NUM_STOCKS = len(stocks)

In [19]:
df_price

Unnamed: 0,pSP_CUSIP,pSP_EXCHANGE,pSP_TICKER,pSP_DATE,pSP_VOLUME,pSP_OPEN,pSP_HIGH,pSP_LOW,pSP_CLOSE,X19_IDIO,MONTH,DAILY_RETURN
36153736,872275102,N,TCB,2010-12-31,511530,14.8500,14.9700,14.8100,14.8100,,2010-12,-0.002694
36153735,895848109,N,TCAP,2010-12-31,40552,19.1500,19.3500,19.0000,19.0000,,2010-12,-0.007833
36153732,89785X101,N,TBI,2010-12-31,176733,17.9700,18.2700,17.8200,17.9900,,2010-12,0.001113
36153731,60871R209,N,TAP,2010-12-31,376870,50.3600,50.4400,50.1700,50.1900,,2010-12,-0.003376
36153729,874083108,N,TAL,2010-12-31,59100,31.1600,31.2800,30.8000,30.8700,,2010-12,-0.009307
...,...,...,...,...,...,...,...,...,...,...,...,...
80532014,73933G202,PK,PSIX,2018-11-30,548,9.2000,9.2000,9.2000,9.2000,,2018-11,0.000000
80531630,73936K200,PK,PSFT,2018-11-30,0,0.0023,0.0023,0.0023,0.0023,,2018-11,0.000000
80531390,69362L100,PK,PSBP,2018-11-30,0,23.2960,23.2960,23.2960,23.2960,,2018-11,0.000000
80530928,74273Y100,PK,PRZM,2018-11-30,3200,0.0600,0.0900,0.0600,0.0775,,2018-11,0.291667


In [20]:
df_fama

Unnamed: 0_level_0,Mkt_RF,SMB,HML,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01,1.99,-2.44,0.77,0.0001
2011-02,3.49,1.47,1.24,0.0001
2011-03,0.46,2.56,-1.88,0.0001
2011-04,2.90,-0.32,-2.48,0.0000
2011-05,-1.27,-0.65,-2.00,0.0000
...,...,...,...,...
2021-12,3.10,-1.60,3.22,0.0001
2022-01,-6.25,-5.93,12.74,0.0000
2022-02,-2.29,2.18,3.09,0.0000
2022-03,3.06,-1.61,-1.82,0.0000


In [23]:
ROLLING_WINDOW = 6
stocks_nope = []

PROGRESS_CUR = 0
PROGRESS_TOTAL = 0
for stock in stocks:
    df_stock = df_price[df_price.pSP_CUSIP == stock]
    if df_stock.shape[0] >= ROLLING_WINDOW:
        try:
            stock_fama = df_stock.join(df_fama, on='MONTH')
            stock_fama['S_RF'] = stock_fama['DAILY_RETURN'] - stock_fama['RF']
            mod = RollingOLS.from_formula("S_RF ~ Mkt_RF + SMB + HML", data=stock_fama, window=ROLLING_WINDOW)
            rres = mod.fit()
            df_price['X19_IDIO'] = 1-rres.rsquared
        except ValueError:
            stocks_nope.append(df_stock.pSP_CUSIP.unique()[0])
            #print(print(df_stock.pSP_CUSIP.unique()[0], ' NOPE---'))
            pass
    else:
        stocks_nope.append(df_stock.pSP_CUSIP.unique()[0])
        #print(df_stock.pSP_CUSIP.unique()[0], ' NOPE---')
        
    PROGRESS_CUR += 1
    if PROGRESS_CUR/NUM_STOCKS >= 0.05:
        PROGRESS_TOTAL += PROGRESS_CUR/NUM_STOCKS
        PROGRESS_CUR = 0
        print('Progress: {:.2%} '.format(PROGRESS_TOTAL))

Progress: 5.01% 
Progress: 10.02% 


  llf = -np.log(ssr) * nobs2  # concentrated likelihood


Progress: 15.02% 
Progress: 20.03% 
Progress: 25.04% 
Progress: 30.05% 
Progress: 35.05% 
Progress: 40.06% 
Progress: 45.07% 
Progress: 50.08% 
Progress: 55.08% 
Progress: 60.09% 
Progress: 65.10% 
Progress: 70.11% 
Progress: 75.12% 
Progress: 80.12% 
Progress: 85.13% 
Progress: 90.14% 
Progress: 95.15% 
