In [3]:
%%capture
%run test_data.ipynb
%run utils.ipynb

In [4]:
import pandas as pd
import numpy as np

## Wrangling

Lag data

In [5]:
pd.options.mode.chained_assignment = None

def lag_return_permno(data,n,permno):
    # Return df with added lagged returns (up to n lags) of a stock (PERMNO)
    # Loses n first observations 
    permno_data = data.loc[data['PERMNO'] == permno]
    for lag in range(1,n+1):
        col = 'LRET_{}'.format(lag)
        permno_data[col] = permno_data['RET'].shift(lag)
    return permno_data.iloc[3:]

def lag_returns(data,n):
    # Adds lagged returns (up to n lags) of each stock to the dataset
    # Loses n first observations for each stock
    permnos = data['PERMNO'].unique()
    dfs = threadify(lambda permno: lag_return_permno(data,n,permno),permnos)
    return pd.concat(dfs, ignore_index=True)

Form quantiles

In [None]:
def quantiles_for_date(data,wrt,date,n):
    # Helper for quantile_table
    r = 100//n
    data = data.loc[data['date'] == date]
    qs = [q/100 for q in range(r,100+r,r)]
    breakpoints = [np.quantile(data[wrt].values,q) for q in qs]
    return {**{'date':date}, **dict(zip(range(1,r+1),breakpoints))}

def quantile_table(data,wrt,n):
    # Returns DataFrame of quantile breakpoints with respect to variable (wrt).
    # n is the number breakpoints formed, e.g. with n = 10, 10 breakpoints are formed
    # which can used to assign stock into 10 portfolios with respect to variable (wrt)
    dates = data['date'].unique()
    qs_dates = threadify(lambda date: quantiles_for_date(data,wrt,date,n),dates)
    return pd.DataFrame(qs_dates).set_index('date')

Form portfolios

In [38]:
from bisect import bisect

def assign_portfolio(breakpoints,date,var):
    # Helper for portfolios_for_date (form_portfolios)
    s = len(breakpoints.columns)
    bp = breakpoints.loc[breakpoints['date'] == date][range(1,s+1)].values.flatten()
    pos = bisect(bp,me) + 1
    if pos > s: pos = s
    return pos

from collections import OrderedDict

def portfolios_for_date(data,breakpoints,date,wrt):
    # Helper for form_portfolios
    df = data.loc[data['date'] == date]
    name = 'PORT_' + str(wrt)
    df[name] = df['ME'].map(lambda e: assign_portfolio(breakpoints,date,e))
    print("{} ".format(date), end='')
    return df

import time

def form_portfolios(data,months,wrt,n):
    # Maps (parallel portfolios_for_date) all months' stocks into market value portfolios
    start = time.time()
    print('Progress...')
    breakpoints = quantile_table(data,wrt,n)
    #months = data['date'].unique()
    dfs = threadify(lambda d: portfolios_for_date(data,breakpoints,d,wrt),months)
    print('Done. Execution time: {}s'.format(round(time.time()-start,3)))
    return pd.concat(dfs, ignore_index=True)

Portfolios summary

In [None]:
def portfolios_summary_month(data,date,wrt):
    # Helper for portfolios_summary_table
    name = 'PORT_' + str(wrt)
    return {**{'date':date}, **data.loc[data['date'] == date][name].value_counts().to_dict()}

def portfolios_summary_table(data,months,wrt):
    # Forms a summary table of number of stocks in portfolios
    df = pd.DataFrame(threadify(lambda d: portfolios_summary_month(data,d),months)).set_index('date')
    return df.sort_index(axis=1)

In [31]:
def filter_by_me(data):
    pass

In [41]:
h = form_portfolios(lagged_data,lagged_data['date'].unique(),'LRET_1',10)

Progress...


KeyError: 'date'

In [None]:
lagged_data = lag_returns(data,1)

In [34]:
breakpoints_mom = quantile_table(lagged_data,'LRET_1',10)
breakpoints_mom.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
201504,-0.104745,-0.055241,-0.028374,-0.010547,0.003913,0.020808,0.03852,0.064205,0.112004,2.025783
201505,-0.111171,-0.069818,-0.046071,-0.028331,-0.012776,0.002763,0.023269,0.051835,0.111181,9.564357
201506,-0.102128,-0.051048,-0.023609,-0.00546,0.008096,0.023127,0.041088,0.069362,0.128765,1.793594
201507,-0.100592,-0.062148,-0.0398,-0.020046,-0.002456,0.016041,0.036203,0.062969,0.103392,0.930233
201508,-0.160441,-0.096924,-0.063009,-0.035843,-0.014799,0.003454,0.024891,0.053396,0.097946,1.65102


In [37]:
data['date'].unique()

array(['201501', '201502', '201503', '201504', '201505', '201506',
       '201507', '201508', '201509', '201510', '201511', '201512',
       '201601', '201602', '201603', '201604', '201605', '201606',
       '201607', '201608', '201609', '201610', '201611', '201612',
       '201701', '201702', '201703', '201704', '201705', '201706',
       '201707', '201708', '201709', '201710', '201711', '201712',
       '201801', '201802', '201803', '201804', '201805', '201806',
       '201807', '201808', '201809', '201810', '201811', '201812',
       '201901', '201902', '201903', '201904', '201905', '201906',
       '201907', '201908', '201909', '201910', '201911', '201912'],
      dtype=object)