# core

> Extends pandas with common functions used in finance and economics research

Almost all of these functions return a modified copy of the input DataFrame instead of changing it in place. When that DataFrame is large, reassign the result to the same name, i.e. use these functions as `df = func(df)`.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#|exports
from __future__ import annotations
from typing import List 
import pandas as pd
import numpy as np

First, we set up an example dataset to showcase the functions in this module.

In [None]:
# Example data: every combination of 3 `permno` values (1, 2, NaN) and 5 `date`
# values, giving 15 rows of random numbers in columns `A` and `B`.
# The `date` values deliberately include a missing date (NaN), a duplicate
# ('2010-02' appears twice) and a gap (there is no '2010-03').
raw = pd.DataFrame(np.random.rand(15,2), 
                  columns=list('AB'), 
                  index=pd.MultiIndex.from_product(
                      [[1,2, np.nan],[np.nan,'2010-01','2010-02','2010-02','2010-04']],
                      names = ['permno','date'])
                    ).reset_index()  # turn the (`permno`, `date`) index levels into regular columns
raw

Unnamed: 0,permno,date,A,B
0,1.0,,0.913174,0.661243
1,1.0,2010-01,0.063473,0.878561
2,1.0,2010-02,0.479259,0.752895
3,1.0,2010-02,0.452011,0.531252
4,1.0,2010-04,0.946336,0.514326
5,2.0,,0.77639,0.859403
6,2.0,2010-01,0.856406,0.020527
7,2.0,2010-02,0.096957,0.078928
8,2.0,2010-02,0.225275,0.945342
9,2.0,2010-04,0.66256,0.0411


### Common panel setup procedures

In [None]:
#|export
def order_columns(df: pd.DataFrame, these_first: List[str]) -> pd.DataFrame:
    """Returns `df` with the `these_first` columns moved to the front. Use as `df = order_columns(df,_)`"""
    # Columns not named in `these_first` keep their original relative order
    trailing = []
    for col in df.columns:
        if col in these_first:
            continue
        trailing.append(col)
    new_order = these_first + trailing
    return df[new_order]

In [None]:
order_columns(raw, these_first=['B']).head()

Unnamed: 0,B,permno,date,A
0,0.661243,1.0,,0.913174
1,0.878561,1.0,2010-01,0.063473
2,0.752895,1.0,2010-02,0.479259
3,0.531252,1.0,2010-02,0.452011
4,0.514326,1.0,2010-04,0.946336


In [None]:
#|export
def process_dates(df: pd.DataFrame, # Function returns copy of this df with `dtdate_var` and `f'{freq}date'` cols added
                time_var: str='date', # This will be the date variable used to generate datetime var `dtdate_var`
                time_var_format: str='%Y-%m-%d', # Format of `time_var`; must be valid pandas `strftime`
                dtdate_var: str='dtdate', # Name of datetime var to be created from `time_var`
                freq: str=None, # Used to create `f'{freq}date'` period date; must be valid pandas offset string
                ) -> pd.DataFrame:
    """Makes datetime date `dtdate_var` from `time_var`; adds period date `f'{freq}date'`.
    The three date columns are placed first in the returned copy."""
    
    # NOTE(review): with the default `freq=None` the period column is literally named
    # 'Nonedate' and pandas has to infer the frequency -- callers should pass `freq`.
    period_var = f'{freq}date'

    df = df.copy()
    df[dtdate_var] = pd.to_datetime(df[time_var], format=time_var_format)
    # Build the period date from `dtdate_var` (not a hard-coded 'dtdate' column),
    # so that a custom `dtdate_var` name works.
    df[period_var] = df[dtdate_var].dt.to_period(freq)
    return order_columns(df, [time_var, dtdate_var, period_var])

In [None]:
newdf = process_dates(raw, time_var_format="%Y-%m", freq='M')
newdf.head()

Unnamed: 0,date,dtdate,Mdate,permno,A,B
0,,NaT,NaT,1.0,0.913174,0.661243
1,2010-01,2010-01-01,2010-01,1.0,0.063473,0.878561
2,2010-02,2010-02-01,2010-02,1.0,0.479259,0.752895
3,2010-02,2010-02-01,2010-02,1.0,0.452011,0.531252
4,2010-04,2010-04-01,2010-04,1.0,0.946336,0.514326


In [None]:
#|export
def setup_panel(df: pd.DataFrame, # Input DataFrame; a copy is returned
                panel_ids :str=None, # Name of variable that identifies panel entities
# Params passed to `process_dates`
                time_var: str='date', # This will be the date variable used to generate datetime var `dtdate_var`
                time_var_format: str='%Y-%m-%d', # Format of `time_var`; must be valid pandas `strftime`
                dtdate_var: str='dtdate', # Name of datetime var to be created from `time_var`
                freq: str=None, # Used to create `f'{freq}date'` period date; must be valid pandas offset string
# Params for cleaning                 
                drop_missing_index_vals: bool=True, # What to do with missing `panel_ids` or `f'{freq}date'`
                panel_ids_toint: str='Int64', # Integer dtype `panel_ids` is converted to; use falsy value if not wanted
                drop_index_duplicates: bool=True, # What to do with duplicates in (`panel_ids`, `f'{freq}date'`) values
                duplicates_which_keep: str='last', # If duplicates in index, which to keep; must be 'first', 'last' or `False`
                ) -> pd.DataFrame:
    """Applies `process_dates` to `df`; cleans up (`panel_ids` ,`f'{freq}date'`) and sets it as index.
    The returned copy is sorted by that index, with `time_var` and `dtdate_var` as its first columns."""

    df = process_dates(df, time_var=time_var, time_var_format=time_var_format, dtdate_var=dtdate_var, freq=freq)
    period_var = f'{freq}date'  # name of the period-date column added by `process_dates`

    if drop_missing_index_vals:
        # Drop on the columns that actually become the index, as documented
        df = df.dropna(subset=[panel_ids, period_var])
    if panel_ids_toint:
        # Honor the requested dtype; a bare `True` falls back to the nullable 'Int64'
        int_dtype = 'Int64' if panel_ids_toint is True else panel_ids_toint
        df[panel_ids] = df[panel_ids].astype(int_dtype)

    df = df.set_index([panel_ids, period_var]).sort_index()
    if drop_index_duplicates:
        # `duplicated` flags every repeated (panel, period) pair except the one to keep
        df = df[~df.index.duplicated(keep=duplicates_which_keep)]
    return order_columns(df, [time_var, dtdate_var])

In [None]:
# Build the example panel with the default cleaning options: rows with a missing
# `permno` or date are dropped, `permno` becomes an integer, and only the last
# row of each duplicated (`permno`, `Mdate`) pair is kept.
df = setup_panel(raw,
                 panel_ids='permno',
                 time_var='date', time_var_format="%Y-%m",
                 freq='M')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,A,B
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2010-01,2010-01,2010-01-01,0.063473,0.878561
1,2010-02,2010-02,2010-02-01,0.452011,0.531252
1,2010-04,2010-04,2010-04-01,0.946336,0.514326
2,2010-01,2010-01,2010-01-01,0.856406,0.020527
2,2010-02,2010-02,2010-02-01,0.225275,0.945342
2,2010-04,2010-04,2010-04-01,0.66256,0.0411


### Robust lagging

Lagging with `shift` fails when the data (1) is a panel, (2) has duplicate dates, (3) has gaps in the time series, (4) is not sorted by date, or (5) has NaN dates.

The `fast_lag` function below correctly lags data, assuming we do not have problems (2) to (4).

The `lag` function below correctly lags data, assuming we do not have problem (5).

In [None]:
#|export
def fast_lag(df: pd.Series|pd.DataFrame, # Index (or level 1 of MultiIndex) must be period date
        n: int=1, # Number of periods to lag based on frequency of df.index; Negative values means lead.
        ) -> pd.Series: # Series with lagged values; Name is taken from `df`, with _lag{n} or _lead{n} added
    """Lag data in 'df' by 'n' periods. 
    ASSUMES DATA IS SORTED BY DATES AND HAS NO DUPLICATE OR MISSING DATES.
    Rows are shifted by position; a shifted value is kept only if it comes from the same 
    panel and from exactly `n` periods earlier, otherwise the result is NaN.
    Raises ValueError if `df` has more than one column or the index is not a period date."""

    if isinstance(df,pd.Series): df = df.to_frame()
    if len(df.columns) > 1: raise ValueError("<df> must have a single column")
    new_varname = str(df.columns[0]) + (f'_lag{n}' if n>=0 else f'_lead{-n}')

    is_panel = isinstance(df.index, pd.MultiIndex)
    if is_panel:
        if not f'{df.index.levels[1].dtype}'.startswith('period'):
            raise ValueError('Dimension 1 of multiindex must be period date')
        if df.index.nlevels != 2:
            raise ValueError('MultiIndex must have exactly two levels: (panel id, period date)')
        dates = df.index.get_level_values(1)
    else:
        if not f'{df.index.dtype}'.startswith('period'):
            raise ValueError('Index must be period date')
        dates = df.index

    # Work on position-indexed Series so `shift` moves rows rather than labels.
    # Reading the index levels directly (instead of `reset_index`) also works for
    # unnamed levels and cannot clash with the name of the data column.
    dates = pd.Series(dates)
    values = df.iloc[:, 0].reset_index(drop=True)

    # The shifted row is a valid lag only if it is exactly `n` periods away ...
    keep = dates == dates.shift(n) + n
    if is_panel:
        # ... and belongs to the same panel entity
        panels = pd.Series(df.index.get_level_values(0))
        keep = (panels == panels.shift(n)) & keep

    lagged = np.where(keep, values.shift(n), np.nan)
    # Always a Series (no `squeeze`), even when `df` has a single row
    return pd.Series(lagged, index=df.index, name=new_varname)

In [None]:
#|export
def lag(df: pd.Series|pd.DataFrame, # Index (or level 1 of MultiIndex) must be period date with no missing values.
        n: int=1, # Number of periods to lag based on frequency of df.index; Negative values means lead.
        fast: bool=True, # Assumes data is sorted by date and no duplicate or missing dates
        ) -> pd.Series: # Series with lagged values; Name is taken from `df`, with _lag{n} or _lead{n} added
    """Lag data in 'df' by 'n' periods. ASSUMES NO MISSING DATES
    With `fast=True` this delegates to `fast_lag`; otherwise the period dates are moved 
    forward by `n` and the result is joined back on the original index.
    Raises ValueError if `df` has more than one column or the index is not a period date."""

    if fast: return fast_lag(df,n)

    if isinstance(df,pd.Series): df = df.to_frame()
    if len(df.columns) > 1: raise ValueError("'df' parameter must have a single column")
    dfl = df.copy()
    # Build the new name from `str(label)` in both branches, so non-string column
    # labels (e.g. an unnamed Series) work for leads as well as for lags
    suffix = f'_lag{n}' if n>=0 else f'_lead{-n}'
    dfl.columns = [str(df.columns[0]) + suffix]

    # Re-date each observation to the period in which it is the `n`-period lag
    if isinstance(df.index, pd.MultiIndex):
        if f'{df.index.levels[1].dtype}'.startswith('period'):
            dfl.index = dfl.index.set_levels(df.index.levels[1]+n, level=1)
        else:
            raise ValueError('Dimension 1 of multiindex must be period date')
    else:
        if f'{df.index.dtype}'.startswith('period'):
            dfl.index = dfl.index + n
        else:
            raise ValueError('Index must be period date')

    # Left join keeps exactly the rows of `df`; dates without a match get NaN
    dfl = df.join(dfl).drop(columns=df.columns)
    # Squeeze only the column axis so a one-row result is still a Series, not a scalar
    return dfl.squeeze(axis=1)

The index of the `df` parameter cannot contain missing values.

In [None]:
lag(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    0.063473
        2010-04         NaN
2       2010-01         NaN
        2010-02    0.856406
        2010-04         NaN
Name: A_lag1, dtype: float64

In [None]:
lag(df['A'],fast=False)

permno  Mdate  
1       2010-01         NaN
        2010-02    0.063473
        2010-04         NaN
2       2010-01         NaN
        2010-02    0.856406
        2010-04         NaN
Name: A_lag1, dtype: float64

In [None]:
#|export
def add_lags(df: pd.Series|pd.DataFrame, # If series, it must have a name equal to 'vars' parameter
             vars: str|List[str], # Variables to be lagged; must be a subset of `df.columns`
             lags: int|List[int]=1, # Which lags to be added; negative values add leads
             lag_suffix: str='_lag', # Suffix (followed by the lag length) for columns holding lags
             lead_suffix: str='_lead', # Suffix (followed by the lead length) for columns holding leads
             fast: bool=True, # Whether to use fast_lag function
             ) -> pd.DataFrame:
    """Returns a copy of 'df' with one new column for each requested lag/lead of each variable in 'vars'"""

    out = df.copy()
    if isinstance(out, pd.Series):
        out = out.to_frame()

    # Allow a single variable name or a single lag to be given without a list
    var_list = [vars] if isinstance(vars, str) else vars
    lag_list = [lags] if isinstance(lags, int) else lags

    for name in var_list:
        for k in lag_list:
            if k >= 0:
                tag = f'{lag_suffix}{k}'
            else:
                tag = f'{lead_suffix}{-k}'
            out[f'{name}{tag}'] = lag(out[name], k, fast)
    return out

Because this makes a copy of `df`, when `df` is a large dataset, this should be used as `df = add_lags(df)`.

In [None]:
add_lags(df['A'], vars='A')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A_lag1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2010-01,0.063473,
1,2010-02,0.452011,0.063473
1,2010-04,0.946336,
2,2010-01,0.856406,
2,2010-02,0.225275,0.856406
2,2010-04,0.66256,


In [None]:
add_lags(df, vars=['A','B'], lags=[3,-1])

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,A,B,A_lag3,A_lead1,B_lag3,B_lead1
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2010-01,2010-01,2010-01-01,0.063473,0.878561,,0.452011,,0.531252
1,2010-02,2010-02,2010-02-01,0.452011,0.531252,,,,
1,2010-04,2010-04,2010-04-01,0.946336,0.514326,,,,
2,2010-01,2010-01,2010-01-01,0.856406,0.020527,,0.225275,,0.945342
2,2010-02,2010-02,2010-02-01,0.225275,0.945342,,,,
2,2010-04,2010-04,2010-04-01,0.66256,0.0411,,,,


In [None]:
add_lags(df,vars=['A','B'],lags=[2,-2], lag_suffix='_lg', lead_suffix='_ld')

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,A,B,A_lg2,A_ld2,B_lg2,B_ld2
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2010-01,2010-01,2010-01-01,0.063473,0.878561,,,,
1,2010-02,2010-02,2010-02-01,0.452011,0.531252,,,,
1,2010-04,2010-04,2010-04-01,0.946336,0.514326,,,,
2,2010-01,2010-01,2010-01-01,0.856406,0.020527,,,,
2,2010-02,2010-02,2010-02-01,0.225275,0.945342,,,,
2,2010-04,2010-04,2010-04-01,0.66256,0.0411,,,,


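And remember that by default, `lag` uses `fast=True`, which is not robust to duplicate dates, unsorted dates or, for lags and leads of more than one period, gaps in the dates. This is why the two-period lags and leads above are all missing, even though, for example, the `2010-04` rows do have an observation two months earlier (`2010-02`). Use `fast=False` in those cases.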

### Utilities using robust lagging

In [None]:
#|export
def rpct_change(df: pd.Series, n: int=1, fast=True):
    """Percentage change of `df` relative to its `n`-period lag, computed with the `lag` function"""
    # `n` and `fast` are passed straight through to `lag`
    lagged = lag(df, n, fast)
    ratio = df / lagged
    return ratio - 1

In [None]:
rpct_change(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    6.121259
        2010-04         NaN
2       2010-01         NaN
        2010-02   -0.736953
        2010-04         NaN
dtype: float64

In [None]:
#|export
def rdiff(df: pd.Series, n: int=1, fast=True):
    """Difference between `df` and its `n`-period lag, computed with the `lag` function"""
    # `n` and `fast` are passed straight through to `lag`
    lagged = lag(df, n, fast)
    return df - lagged

In [None]:
rdiff(df['A'])

permno  Mdate  
1       2010-01         NaN
        2010-02    0.388537
        2010-04         NaN
2       2010-01         NaN
        2010-02   -0.631131
        2010-04         NaN
dtype: float64

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()