# core

> Extends pandas with common functions used in finance and economics research

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#|exports
from __future__ import annotations
from typing import List 
import pandas as pd
import numpy as np

First, we set up an example dataset to showcase the functions in this module.

In [None]:
# Example panel used throughout this notebook: two panel ids ('permno'), each with
# four monthly observations ('Mdate') and two columns of random values ('A', 'B').
# The dates deliberately contain a duplicate (2010-02 appears twice) and a gap
# (there is no 2010-03), which are the cases the lag functions below are meant to handle.
df = pd.DataFrame(np.random.rand(8,2), 
                  columns=list('AB'), 
                  index=pd.MultiIndex.from_product(
                      [[1,2],
                       pd.to_datetime(['2010-01','2010-02','2010-02','2010-04']
                                      ).to_period('M')],
                      names=['permno','Mdate']))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2010-01,0.769378,0.902521
1,2010-02,0.774357,0.779548
1,2010-02,0.973493,0.435882
1,2010-04,0.967311,0.524849
2,2010-01,0.816255,0.008192
2,2010-02,0.217013,0.94766
2,2010-02,0.20114,0.623742
2,2010-04,0.676889,0.657162


### Robust lagging

Note how `shift` fails when we have (1) panel data, (2) duplicate dates, or (3) gaps in the time-series

In [None]:
df.shift()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2010-01,,
1,2010-02,0.769378,0.902521
1,2010-02,0.774357,0.779548
1,2010-04,0.973493,0.435882
2,2010-01,0.967311,0.524849
2,2010-02,0.816255,0.008192
2,2010-02,0.217013,0.94766
2,2010-04,0.20114,0.623742


In [None]:
#|export
def fast_lag(df: pd.Series|pd.DataFrame, # Index (or level 1 of MultiIndex) must be period date
        n: int=1, # Number of periods to lag based on frequency of df.index; Negative values means lead.
        ) -> pd.Series: # Series with lagged values; Name is taken from `df`, with _lag{n} or _lead{n} added
    """Lag data in 'df' by 'n' periods.
    ASSUMES DATA IS SORTED BY DATES AND HAS NO DUPLICATE OR MISSING DATES.

    The values are shifted by `n` rows, and a shifted value is kept only when the
    row it came from is exactly `n` periods earlier (and, for a two-level
    MultiIndex, belongs to the same level-0 panel id). All other rows get NaN.

    Always returns a Series aligned with the index of `df` (also for one-row input).

    Raises `ValueError` if `df` has more than one column, if a MultiIndex does not
    have exactly two levels, or if the date index/level is not of period dtype.
    """

    if isinstance(df, pd.Series): df = df.to_frame()
    if len(df.columns) > 1: raise ValueError("<df> must have a single column")
    old_name = str(df.columns[0])
    new_varname = old_name + f'_lag{n}' if n>=0 else old_name + f'_lead{-n}'

    # Work on the index level values positionally instead of resetting the index:
    # this does not depend on the index levels (or the column) having string names,
    # and cannot collide with columns the caller already has.
    index = df.index
    if isinstance(index, pd.MultiIndex):
        if index.nlevels != 2:
            raise ValueError('MultiIndex must have exactly two levels: (panel, period date)')
        if not f'{index.levels[1].dtype}'.startswith('period'):
            raise ValueError('Dimension 1 of multiindex must be period date')
        panel = pd.Series(index.get_level_values(0))
        time = pd.Series(index.get_level_values(1))
        # The row `n` positions back must be the same panel and exactly `n` periods earlier
        is_valid = (panel == panel.shift(n)) & (time == time.shift(n) + n)
    else:
        if not f'{index.dtype}'.startswith('period'):
            raise ValueError('Index must be period date')
        time = pd.Series(index)
        is_valid = time == time.shift(n) + n

    # Select by position (not by label) so non-string column labels work;
    # rows failing the check above are set to NaN by `where`.
    lagged = df.iloc[:, 0].shift(n).where(is_valid.to_numpy())
    return lagged.rename(new_varname)

In [None]:
#|export
def lag(df: pd.Series|pd.DataFrame, # Index (or level 1 of MultiIndex) must be period date with no missing values.
        n: int=1, # Number of periods to lag based on frequency of df.index; Negative values means lead.
        fast: bool=True, # Assumes data is sorted by date and no duplicate or missing dates
        ) -> pd.Series: # Series with lagged values; Name is taken from `df`, with _lag{n} or _lead{n} added
    """Lag data in 'df' by 'n' periods. ASSUMES NO MISSING DATES

    With `fast=True` the work is delegated to `fast_lag`. Otherwise a copy of the
    data has its period dates moved forward by `n` and is joined back onto `df`
    on the index, so each row picks up the value observed `n` periods earlier.

    Always returns a Series (also for one-row input).

    Raises `ValueError` if `df` has more than one column or if the date
    index/level is not of period dtype.

    NOTE: with `fast=False`, if several rows share a date and that date shifted by
    `n` also exists in `df`, the join produces one output row per match, so the
    result can have more rows than `df`.
    """

    if fast: return fast_lag(df,n)

    if isinstance(df,pd.Series): df = df.to_frame()
    if len(df.columns) > 1: raise ValueError("'df' parameter must have a single column")

    # Build the new name from the stringified label in both branches, so that
    # non-string column labels (e.g. an unnamed Series -> column 0) work for leads too.
    old_name = str(df.columns[0])
    new_name = old_name + f'_lag{n}' if n>=0 else old_name + f'_lead{-n}'
    dfl = df.copy()
    dfl.columns = [new_name]

    if isinstance(df.index, pd.MultiIndex):
        if f'{df.index.levels[1].dtype}'.startswith('period'):
            # Move every date in level 1 forward by n periods
            dfl.index = dfl.index.set_levels(df.index.levels[1]+n, level=1)
        else:
            raise ValueError('Dimension 1 of multiindex must be period date')
    else:
        if f'{df.index.dtype}'.startswith('period'):
            dfl.index = dfl.index + n
        else:
            raise ValueError('Index must be period date')

    # Left join keeps the rows of `df`; dates without a match n periods earlier get NaN
    dfl = df.join(dfl).drop(columns=df.columns)
    # Select the single remaining column by position: unlike squeeze(), this
    # never collapses a one-row result to a scalar.
    return dfl.iloc[:, 0]

The index of the `df` parameter cannot contain missing values.

In [None]:
lag(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    0.769378
        2010-02         NaN
        2010-04         NaN
2       2010-01         NaN
        2010-02    0.816255
        2010-02         NaN
        2010-04         NaN
Name: A_lag1, dtype: float64

In [None]:
lag(df['A'],fast=False)

permno  Mdate  
1       2010-01         NaN
        2010-02    0.769378
        2010-02    0.769378
        2010-04         NaN
2       2010-01         NaN
        2010-02    0.816255
        2010-02    0.816255
        2010-04         NaN
Name: A_lag1, dtype: float64

In [None]:
#|export
def add_lags(df: pd.Series|pd.DataFrame, # If series, it must have a name equal to 'vars' parameter
             vars: str|List[str], # Variables to be lagged; must be a subset of df.columns
             lags: int|List[int]=1, # Which lags to be added
             lag_suffix: str='_lag',
             lead_suffix: str='_lead',
             fast: bool=True, # Whether to use fast_lag function
             ) -> pd.DataFrame:
    """Return a copy of 'df' extended with one new column per (variable, lag) pair.

    Non-negative lags are named '{var}{lag_suffix}{n}'; negative lags (leads)
    are named '{var}{lead_suffix}{-n}'. The input 'df' itself is not modified.
    """

    out = df.copy()
    if isinstance(out, pd.Series):
        out = out.to_frame()

    # Accept a single name / a single lag as shorthand for one-element lists
    var_names = [vars] if isinstance(vars, str) else vars
    lag_orders = [lags] if isinstance(lags, int) else lags

    for name in var_names:
        for k in lag_orders:
            if k >= 0:
                tag = f'{lag_suffix}{k}'
            else:
                tag = f'{lead_suffix}{-k}'
            out[f'{name}{tag}'] = lag(out[name], k, fast)
    return out

Because this makes a copy of `df`, when `df` is a large dataset the result should be assigned back to the same name, e.g. `df = add_lags(df, vars='A')`, so that two copies are not kept around.

In [None]:
add_lags(df['A'], vars='A')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A_lag1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2010-01,0.769378,
1,2010-02,0.774357,0.769378
1,2010-02,0.973493,
1,2010-04,0.967311,
2,2010-01,0.816255,
2,2010-02,0.217013,0.816255
2,2010-02,0.20114,
2,2010-04,0.676889,


In [None]:
add_lags(df, vars=['A','B'], lags=[3,-1])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,A_lag3,A_lead1,B_lag3,B_lead1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2010-01,0.769378,0.902521,,0.774357,,0.779548
1,2010-02,0.774357,0.779548,,,,
1,2010-02,0.973493,0.435882,,,,
1,2010-04,0.967311,0.524849,0.769378,,0.902521,
2,2010-01,0.816255,0.008192,,0.217013,,0.94766
2,2010-02,0.217013,0.94766,,,,
2,2010-02,0.20114,0.623742,,,,
2,2010-04,0.676889,0.657162,0.816255,,0.008192,


In [None]:
add_lags(df,vars=['A','B'],lags=[2,-2], lag_suffix='_lg', lead_suffix='_ld')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,A_lg2,A_ld2,B_lg2,B_ld2
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2010-01,0.769378,0.902521,,,,
1,2010-02,0.774357,0.779548,,0.967311,,0.524849
1,2010-02,0.973493,0.435882,,,,
1,2010-04,0.967311,0.524849,0.774357,,0.779548,
2,2010-01,0.816255,0.008192,,,,
2,2010-02,0.217013,0.94766,,0.676889,,0.657162
2,2010-02,0.20114,0.623742,,,,
2,2010-04,0.676889,0.657162,0.217013,,0.94766,


And remember that by default, `lag` uses `fast=True`, which is not robust to duplicate dates (or unsorted dates).

In [None]:
display(add_lags(df, vars='A', fast=False))
display(add_lags(df, vars='A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,A_lag1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2010-01,0.769378,0.902521,
1,2010-02,0.774357,0.779548,0.769378
1,2010-02,0.973493,0.435882,0.769378
1,2010-04,0.967311,0.524849,
2,2010-01,0.816255,0.008192,
2,2010-02,0.217013,0.94766,0.816255
2,2010-02,0.20114,0.623742,0.816255
2,2010-04,0.676889,0.657162,


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,A_lag1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2010-01,0.769378,0.902521,
1,2010-02,0.774357,0.779548,0.769378
1,2010-02,0.973493,0.435882,
1,2010-04,0.967311,0.524849,
2,2010-01,0.816255,0.008192,
2,2010-02,0.217013,0.94766,0.816255
2,2010-02,0.20114,0.623742,
2,2010-04,0.676889,0.657162,


### Utilities using robust lagging

In [None]:
#|export
def rpct_change(df: pd.Series, n: int=1, fast=True):
    """Percentage change of 'df' relative to its value 'n' periods earlier, using `lag`"""
    previous = lag(df, n, fast)
    ratio = df / previous
    return ratio - 1

In [None]:
rpct_change(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    0.006472
        2010-02         NaN
        2010-04         NaN
2       2010-01         NaN
        2010-02   -0.734136
        2010-02         NaN
        2010-04         NaN
dtype: float64

In [None]:
#|export
def rdiff(df: pd.Series, n: int=1, fast=True):
    """Difference between 'df' and its value 'n' periods earlier, using `lag`"""
    previous = lag(df, n, fast)
    return df - previous

In [None]:
rdiff(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    0.004979
        2010-02         NaN
        2010-04         NaN
2       2010-01         NaN
        2010-02   -0.599242
        2010-02         NaN
        2010-04         NaN
dtype: float64

In [None]:
def order_columns(df: pd.DataFrame, these_first: List[str]) -> pd.DataFrame:
    """Return 'df' with the columns in 'these_first' moved to the front.

    The remaining columns follow in their existing order.
    Use as df = order_columns(df,_)
    """
    trailing = []
    for col in df.columns:
        if col in these_first:
            continue
        trailing.append(col)
    new_order = these_first + trailing
    return df[new_order]

Note that this function does not modify 'df' in place; it returns the reordered DataFrame, so the result must be assigned (back to 'df' or to a new name), as below:

In [None]:
df2 = order_columns(df, ['B','A'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,B,A
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2010-01,0.902521,0.769378
1,2010-02,0.779548,0.774357
1,2010-02,0.435882,0.973493
1,2010-04,0.524849,0.967311
2,2010-01,0.008192,0.816255
2,2010-02,0.94766,0.217013
2,2010-02,0.623742,0.20114
2,2010-04,0.657162,0.676889


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()