In [1]:
import pandas as pd
import numpy as np
from functions import *
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

**Table of contents**<a id='toc0_'></a>    
- [Combining data sources](#toc1_)    
    - [Add industry data for DK](#toc1_1_1_)    
    - [Add financial data](#toc1_1_2_)    
    - [Add beta](#toc1_1_3_)    
  - [Add risk-free rate](#toc1_2_)    
- [Feature engineering](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Combining data sources](#toc0_)

In [2]:
# Raw inputs, all read from the local data/ directory.
# trade.csv has a two-row column header and keeps its first column as the index.
trade = pd.read_csv(
    'data/trade.csv',
    index_col=0,
    header=[0, 1],
)
stocks = pd.read_csv('data/stocks.csv')            # per-ticker info (ticker, shares, NACE are used below)
dk_industry = pd.read_csv('data/dk_industry.csv')  # industry series keyed by NACE and timestamp
financials = pd.read_csv('data/financials.csv')    # financial statement data per ticker and timestamp
beta = pd.read_csv('data/beta.csv')                # beta per ticker and timestamp
rf_rate = pd.read_csv('data/rf_rate.csv')          # diskonto / folio rates per timestamp

In [3]:
# Publication-lag offsets, in months.
# quarterly_offset is added on top of a fixed 2-month lag when the industry
# timestamps are shifted (months=2+quarterly_offset), i.e. 3 months in total.
quarterly_offset = 1 # extra month(s) added to the fixed 2-month lag of the quarterly industry data
annual_offset = 6 # months added to the timestamps of the annual financial data

In [4]:
# The index holds the observation dates; parse it as datetimes.
trade.index = pd.to_datetime(trade.index)

# Wide -> long: stack column level 0 into the rows, give that level a proper
# name ('ticker'), then clean and order the result.
#  - exact duplicate rows are removed
#  - rows with any missing value are removed; this happens when a ticker has
#    only some fields on a date (e.g. trade values but no ask or bid)
#  - rows are ordered by ticker and date with a fresh 0..n-1 index
trade = (
    trade
    .stack(level=0)
    .reset_index()
    .rename(columns={'level_1': 'ticker'})
    .drop_duplicates()
    .dropna()
    .sort_values(['ticker', 'timestamp'])
    .reset_index(drop=True)
)


  trade = trade.stack(level=0).reset_index()


In [5]:
# Attach the share count and NACE code of each ticker to every trade row
# (left join: all trade rows are kept).
df = trade.merge(
    stocks[['ticker', 'shares', 'NACE']],
    on=['ticker'],
    how='left',
)


### <a id='toc1_1_1_'></a>[Add industry data for DK](#toc0_)

In [6]:
dk_industry['timestamp'] = pd.to_datetime(dk_industry['timestamp'])

# Give every ticker the industry series of its NACE code, then move the
# timestamps forward by the publication lag (2 + quarterly_offset months) and
# normalise them through a monthly period so they line up with the monthly
# merge keys (presumably month-end -- TODO confirm). NACE is only needed for
# the join and is dropped afterwards.
industry = (
    stocks[['ticker', 'NACE']]
    .merge(dk_industry, how='left', on='NACE')
    .assign(
        timestamp=lambda d: (
            (d['timestamp'] + pd.DateOffset(months=2 + quarterly_offset))
            .dt.to_period('M')
            .dt.to_timestamp('M')
        )
    )
    .drop(columns=['NACE'])
)

In [7]:
# Left-join the (lagged) industry series onto the monthly observations.
df = pd.merge(df, industry, how='left', on=['timestamp', 'ticker'])

# Forward-fill the industry values within each ticker so months without a new
# industry observation carry the latest published one.
# The value columns are selected by name rather than by position, so the fill
# does not depend on 'ticker' and 'timestamp' being the first two columns.
industry_cols = [col for col in industry.columns if col not in ('ticker', 'timestamp')]
for col in industry_cols:
    df[col] = df.groupby('ticker')[col].ffill()


### <a id='toc1_1_2_'></a>[Add financial data](#toc0_)

In [8]:
# Parse the statement dates, push them forward by the annual publication lag
# and normalise them through a monthly period (same normalisation as the
# other monthly merge keys).
financials['timestamp'] = (
    (pd.to_datetime(financials['timestamp']) + pd.DateOffset(months=annual_offset))
    .dt.to_period('M')
    .dt.to_timestamp('M')
)

# Expand each statement to monthly rows.
# For every statement the validity window ends at whichever is closer to its
# own timestamp: the ticker's next statement or timestamp + 12 months. When
# there is no next statement the comparison is False and +12 months is used.
financials = (
    financials
      .sort_values(['ticker', 'timestamp'])
      .assign(
        next_fye   = lambda f: f.groupby('ticker')['timestamp'].shift(-1),
        plus_12m   = lambda f: f['timestamp'] + pd.DateOffset(months=12),
        period_end = lambda f: pd.to_datetime(np.where(
                          (f['next_fye'] - f['timestamp']).abs()
                            < (f['plus_12m'] - f['timestamp']).abs(),
                          f['next_fye'],
                          f['plus_12m'],
                        )),
      )
      # expand_monthly comes from the star-imported `functions` module; it is
      # applied per row and presumably returns the list of monthly timestamps
      # covered by the row (TODO confirm) -- explode() turns that list into rows.
      .assign(timestamp = lambda f: f.apply(expand_monthly, axis=1))
      .explode('timestamp')
      .drop(columns=['next_fye', 'plus_12m', 'period_end'])
      .reset_index(drop=True)
)

In [9]:
# Left-join the monthly-expanded financial statements onto the observations.
df = df.merge(
    financials,
    on=['timestamp', 'ticker'],
    how='left',
)

### <a id='toc1_1_3_'></a>[Add beta](#toc0_)

In [None]:
beta['timestamp'] = pd.to_datetime(beta['timestamp'])

# Collapse beta to one value per ticker and calendar month: the mean of all
# observations in the month, labelled with the month-end ('ME') timestamp.
month_end = pd.Grouper(key='timestamp', freq='ME')
monthly_beta = (
    beta
    .groupby(['ticker', month_end], as_index=False)['beta']
    .agg('mean')
)

In [11]:
# Left-join the monthly mean beta onto the observations.
df = df.merge(
    monthly_beta,
    on=['timestamp', 'ticker'],
    how='left',
)

## <a id='toc1_2_'></a>[Add risk-free rate](#toc0_)

In [12]:
rf_rate['timestamp'] = pd.to_datetime(rf_rate['timestamp'])

# Monthly risk-free rate from the folio rate:
#   1. average the folio rate within each calendar month (month-end label),
#   2. convert the rate to a monthly rate by compounding: (1 + r)^(1/12) - 1,
#   3. expose it as the 'risk_free' column next to 'timestamp'.
# The diskonto rate is not part of the result, so it is no longer converted
# only to be dropped again.
monthly_avg = (
    rf_rate
    .set_index('timestamp')['folio']
    .resample('ME')
    .mean()
    .pipe(lambda rate: (1 + rate) ** (1/12) - 1)
    .rename('risk_free')
    .reset_index()
)

In [13]:
# Left-join the monthly risk-free rate; it is the same for all tickers, so
# the join key is the timestamp only.
df = df.merge(
    monthly_avg,
    on=['timestamp'],
    how='left',
)

In [14]:
# Target: next month's simple return of adjclose, per ticker.
# The one-step lead is taken inside each ticker group, so the last row of a
# ticker is NaN by construction rather than by relying on the row order and
# on the next ticker's first return being NaN.
# The risk-free rate is currently NOT subtracted (raw return, not excess return).
df['target'] = df.groupby('ticker')['adjclose'].transform(
    lambda x: x.pct_change(periods=1, fill_method=None).shift(-1)
)

# Relative bid-ask spread: (ask - bid) divided by the bid/ask midpoint.
df['baspread_m'] = (df['ask'] - df['bid']) / ((df['ask'] + df['bid']) / 2)

# Momentum: percentage change of adjclose over the given number of rows
# (months) for each ticker separately. fill_method=None matches the target
# computation and avoids pandas' deprecated implicit forward-fill.
momentum_periods = {'mom1m': 1, 'mom3m': 3, 'mom6m': 6, 'mom12m': 12}
for feature_name, period in momentum_periods.items():
    df[feature_name] = df.groupby('ticker')['adjclose'].transform(
        lambda x: x.pct_change(periods=period, fill_method=None)
    )

# Inspection copy without the rows where mom12m or target is NaN
# (first 12 months and last month of each ticker).
df2 = df.dropna(subset=['mom12m', 'target']).copy()

In [15]:
# Split the inspection frame by completeness and show both parts:
# first the rows with at least one missing value, then the complete rows.
has_missing = df2.isnull().any(axis=1)
display(df2[has_missing])
display(df2[~has_missing])

Unnamed: 0,timestamp,ticker,ask,bid,adjclose,high,low,open,volume,turnover,...,debt,currentassets,beta,risk_free,target,baspread_m,mom1m,mom3m,mom6m,mom12m
425,2012-04-30,AARHUS.CO^D12,6.45,5.9,6.0,7.0,5.9,6.2,101940.0,651610.0,...,143374000.0,0.0,,0.000208,0.0,0.089069,0.0,-0.090909,0.153846,-0.620253
426,2012-05-31,AARHUS.CO^D12,6.45,5.9,6.0,7.0,5.9,6.2,101940.0,651610.0,...,143374000.0,0.0,,0.000191,0.0,0.089069,0.0,-0.016393,0.263158,-0.504132
427,2012-06-30,AARHUS.CO^D12,6.45,5.9,6.0,7.0,5.9,6.2,101940.0,651610.0,...,33220000.0,0.0,,0.000000,0.0,0.089069,0.0,0.000000,0.263158,-0.393939
428,2012-07-31,AARHUS.CO^D12,6.45,5.9,6.0,7.0,5.9,6.2,101940.0,651610.0,...,33220000.0,0.0,,0.000000,0.0,0.089069,0.0,0.000000,-0.090909,-0.400000
429,2012-08-31,AARHUS.CO^D12,6.45,5.9,6.0,7.0,5.9,6.2,101940.0,651610.0,...,33220000.0,0.0,,0.000000,0.0,0.089069,0.0,0.000000,-0.016393,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81104,2024-11-30,WEWER.CO^L05,1160.00,850.0,1140.0,1140.0,1140.0,1140.0,60.0,68.0,...,,,,0.002345,0.0,0.308458,0.0,0.000000,0.000000,0.000000
81105,2024-12-31,WEWER.CO^L05,1160.00,850.0,1140.0,1140.0,1140.0,1140.0,60.0,68.0,...,,,,0.002243,0.0,0.308458,0.0,0.000000,0.000000,0.000000
81106,2025-01-31,WEWER.CO^L05,1160.00,850.0,1140.0,1140.0,1140.0,1140.0,60.0,68.0,...,,,,0.002132,0.0,0.308458,0.0,0.000000,0.000000,0.000000
81107,2025-02-28,WEWER.CO^L05,1160.00,850.0,1140.0,1140.0,1140.0,1140.0,60.0,68.0,...,,,,0.001938,0.0,0.308458,0.0,0.000000,0.000000,0.000000


Unnamed: 0,timestamp,ticker,ask,bid,adjclose,high,low,open,volume,turnover,...,debt,currentassets,beta,risk_free,target,baspread_m,mom1m,mom3m,mom6m,mom12m
12,2002-02-28,AAB.CO,2928.445121,2876.151458,2928.445121,2928.445121,2562.389480,2719.270469,1.944595e+02,5.280000e+05,...,2855000.0,5.657500e+07,0.117701,0.002669,0.125000,0.018018,0.120000,0.076923,-0.081967,-0.222222
13,2002-03-31,AAB.CO,3294.500761,3189.913435,3294.500761,3399.088086,2928.445121,2928.445121,1.694852e+02,5.560000e+05,...,2855000.0,5.657500e+07,0.093670,0.002669,-0.063492,0.032258,0.125000,0.260000,0.125000,-0.059701
14,2002-04-30,AAB.CO,3085.326109,2928.445121,3085.326109,3189.913435,2928.445121,3189.913435,1.992593e+02,6.280000e+05,...,2855000.0,5.657500e+07,0.111161,0.002669,-0.169492,0.052174,-0.063492,0.180000,0.092593,-0.032787
15,2002-05-31,AAB.CO,2614.683143,2457.802155,2562.389480,2928.445121,2510.095818,2928.445121,6.842129e+01,1.960000e+05,...,2855000.0,5.657500e+07,0.101388,0.002669,-0.040816,0.061856,-0.169492,-0.125000,-0.057692,-0.109091
16,2002-06-30,AAB.CO,2457.802155,2353.214829,2457.802155,2510.095818,2353.214829,2510.095818,9.922808e+01,2.440000e+05,...,15962000.0,6.795400e+07,0.085370,0.002669,0.042553,0.043478,-0.040816,-0.253968,-0.060000,-0.160714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81278,2024-11-30,ZELA.CO,730.000000,729.000000,730.000000,893.000000,672.500000,789.500000,4.650485e+06,3.587543e+09,...,119230000.0,1.779788e+09,1.155276,0.002345,-0.019863,0.001371,-0.076534,-0.176537,0.141517,1.172619
81279,2024-12-31,ZELA.CO,716.500000,715.500000,715.500000,819.000000,587.000000,728.000000,4.138727e+06,2.995576e+09,...,119230000.0,1.779788e+09,0.923565,0.002243,0.027952,0.001397,-0.019863,-0.119926,-0.198319,0.917203
81280,2025-01-31,ZELA.CO,738.000000,737.000000,735.500000,808.000000,682.500000,718.500000,3.538390e+06,2.572094e+09,...,119230000.0,1.779788e+09,0.966188,0.002132,-0.098572,0.001356,0.027952,-0.069576,-0.209140,0.557603
81281,2025-02-28,ZELA.CO,665.000000,664.000000,663.000000,788.000000,657.000000,714.500000,3.876841e+06,2.801496e+09,...,119230000.0,1.779788e+09,0.929932,0.001938,-0.220211,0.001505,-0.098572,-0.091781,-0.252115,-0.001506


# <a id='toc2_'></a>[Feature engineering](#toc0_)

In [None]:
# Target: next month's simple return of adjclose, per ticker.
# The one-step lead is taken inside each ticker group, so the last row of a
# ticker is NaN by construction rather than by relying on the row order and
# on the next ticker's first return being NaN.
# The risk-free rate is currently NOT subtracted (raw return, not excess return).
df['target'] = df.groupby('ticker')['adjclose'].transform(
    lambda x: x.pct_change(periods=1, fill_method=None).shift(-1)
)

# Relative bid-ask spread: (ask - bid) divided by the bid/ask midpoint.
df['baspread_m'] = (df['ask'] - df['bid']) / ((df['ask'] + df['bid']) / 2)

# Momentum windows as (lag_start, lag_end) in rows (months) per ticker:
# the feature is adjclose[t - lag_start] / adjclose[t - lag_end] - 1.
momentum_windows = {
    'mom1m' : (0, 1),    # current vs 1 month ago
    'mom3m' : (0, 3),    # current vs 3 months ago
    'mom6m' : (0, 6),    # current vs 6 months ago
    'mom12m': (0, 12),   # current vs 12 months ago
    'mom7-12m': (7, 12), # 7 months ago vs 12 months ago (helper for chmom)
}

for name, (lag_start, lag_end) in momentum_windows.items():
    df[name] = (
        df.groupby('ticker')['adjclose']
          .transform(lambda x: x.shift(lag_start) / x.shift(lag_end) - 1)
    )

# Firm characteristics. Features that cannot be built from the available data
# are created as all-None placeholder columns so the full feature list stays
# visible in one place.
#
# NOTE: this cell overwrites the raw 'cash' and 'dy' columns with ratios, so
# it must be run exactly once on a freshly merged df.


def _change_12m(column):
    """12-row (12-month) percentage change of `column` within each ticker, without forward-filling NaNs."""
    return df.groupby('ticker')[column].transform(
        lambda x: x.pct_change(periods=12, fill_method=None)
    )


# mom36m is intentionally not computed: only ~24 years of data in total.

df['mcap'] = df['adjclose'] * df['shares'] # market cap

df['acc'] = (df['netinc'] - df['cashflow'])/df['assets'] # accruals

df['absacc'] = np.abs(df['acc']) # absolute accruals

df['aeavol'] = None # abnormal earnings announcement volume

df['age'] = (df['timestamp'] - df.groupby('ticker')['timestamp'].transform('min')).dt.days / 365.25 # years since the ticker's first observation

df['agr'] = _change_12m('assets').fillna(0) # yearly asset growth

df['betasq'] = df['beta'] ** 2 # beta squared

df['bm'] = (df['assets'] - df['debt']) / df['mcap'] # book to market ratio (TODO: check that debt includes both current and long-term liabilities)

df['bm_ia'] = None # industry adjusted book to market ratio

# Keep the raw cash level: 'cash' is replaced by the cash-to-assets ratio on
# the next line, while cashpr and salecash need the cash level itself as the
# denominator (not the ratio).
_raw_cash = df['cash']

df['cash'] = (_raw_cash / df['assets']).replace([np.inf, -np.inf], 0) # cash to assets ratio

df['cashdebt'] = (df['cashflow'] / df['debt']).fillna(0).replace([np.inf, -np.inf], 0) # cashflow to debt ratio

df['cashpr'] = (df['mcap'] + df['debt'] - df['assets']) / _raw_cash # cash productivity (denominator is the cash level)

df['cfp'] = df['cashflow'] / df['mcap'] # cashflow to market cap ratio

df['cfp_ia'] = None # industry adjusted cashflow to market cap ratio

df['chatoia'] = None # industry adjusted change in total assets

df['chcsho'] = None # change in common shares outstanding

df['chempia'] = None # industry adjusted change in employees

df['chinv'] = _change_12m('currentassets').fillna(0) # change in inventory (computed from currentassets)

df['chmom'] = df['mom6m'] - df['mom7-12m'] # change in momentum
df.drop(columns=['mom7-12m'], inplace=True) # only used for this calculation

df['chpmia'] = None # industry adjusted profit margin

df['chtx'] = None # change in tax rate

df['cinvest'] = None # corporate investment

df['convdebt'] = None # convertible debt

df['depr'] = None # depreciation/PP&E

df['divi'] = df['divi'].fillna(0) # dividend initiation

df['divo'] = df['divo'].fillna(0) # dividend omission

df['dkkvol'] = df['adjclose'] * df['volume'] # DKK volume (dollar volume in Gu et al.)

df['dy'] = df['dy']/df['adjclose'] # dividend to price (annualized)

df['ear'] = None # earnings announcement return

df['egr'] = None # growth in common shareholders equity

# Earnings to price uses net income; the revenue-based ratio is 'sp' below.
df['ep'] = df['netinc'] / df['mcap'] # earnings to market cap ratio

df['gma'] = df['grossprofit'] / df['assets'] # gross profitability

df['grCAPX'] = None # growth in capital expenditures

df['grlnoa'] = None # growth in long-term net operating assets

df['herf'] = None # herfindahl index

df['hire'] = None # hiring

df['idiovol'] = None # idiosyncratic volatility

df['ill'] = None # illiquidity (to be calculated on the daily data)

df['indmom'] = df.groupby(['NACE', 'timestamp'])['mom12m'].transform('mean') # industry momentum: mean mom12m per NACE code and month

df['invest'] = None # capital expenditures and investment

df['lev'] = df['debt'] / df['assets'] # leverage

df['lgr'] = None # growth in long-term debt

df['maxret'] = None # maximum daily return

df['ms'] = None # financial statement score

df['mvel1'] = np.log(df['mcap']) # size (log of market cap); non-positive mcap yields -inf/NaN and a RuntimeWarning

df['mve_ia'] = None # industry adjusted size

df['nincr'] = None # number of earnings increases

df['operprof'] = df['opinc'] / df['assets'] # operating profitability

df['orgcap'] = None # organizational capital

df['pchcapx_ia'] = None # industry adjusted % change in capital expenditures

df['pchcurrat'] = _change_12m('currentassets').fillna(0) # % change in current ratio (computed from currentassets only)

df['pchdpr'] = None # % change in depreciation

df['pchgm_pchsale'] = (_change_12m('grossprofit') - _change_12m('revenue')).fillna(0) # % change in gross margin - % change in sales

df['pchquick'] = None # % change in quick ratio

df['pchsale_pchinvt'] = (_change_12m('revenue') - _change_12m('currentassets')).fillna(0) # % change in sales - % change in inventory (currentassets)

df['pchsale_pchrect'] = None # % change in sales - % change in A/R

df['pchsale_pchxsga'] = None # % change in sales - % change in SG&A

df['saleinv'] = (df['revenue'] / df['currentassets']).fillna(0).replace([np.inf, -np.inf], 0) # sales to inventory ratio (currentassets)

df['pchsaleinv'] = _change_12m('saleinv').fillna(0) # % change in sales to inventory ratio

df['pctacc'] = (df['netinc'] - df['cashflow'])/np.abs(df['netinc']) # % accruals

df['pricedelay'] = None # price delay

df['ps'] = None # financial statements score

df['quick'] = None # quick ratio

df['rd'] = _change_12m('rnd').fillna(0) # R&D increase

df['rd_mve'] = df['rnd'].fillna(0) / df['mcap'] # R&D to market cap ratio

df['rd_sale'] = (df['rnd'].fillna(0) / df['revenue']).fillna(0).replace([np.inf, -np.inf], 0) # R&D to sales ratio

df['realestate'] = None # real estate holdings

df['retvol'] = None # return volatility (to be calculated on the daily data)

df['roaq'] = df['netinc'] / df['assets'] # return on assets

df['roavol'] = None # earning volatility

df['roeq'] = None # df['netinc'] / df['equity'] # return on equity

df['roic'] = None # df['netinc'] / (df['debt'] + df['equity']) # return on invested capital

df['rsup'] = None # revenue surprise

df['salecash'] = (df['revenue'] / _raw_cash).fillna(0).replace([np.inf, -np.inf], 0) # sales to cash ratio (denominator is the cash level)

df['salerec'] = None # df['revenue'] / df['receivables'] # sales to receivables ratio

df['secured'] = None # secured debt

df['securedind'] = None # secured debt indicator

df['sgr'] = _change_12m('revenue').fillna(0) # sales growth

df['sin'] = None # sin stock indicator

df['sp'] = df['revenue'].fillna(0) / df['mcap'] # sales to price ratio

df['std_dkkvol'] = None # standard deviation of dkk volume

df['std_turn'] = None # standard deviation of turnover

df['stdacc'] = None # standard deviation of accruals

df['stdcf'] = None # standard deviation of cashflow

df['tang'] = None # debt capacity/firm tangibility

df['tb'] = None # tax income to book income

df['turn'] = df['turnover'] # turnover

df['zerotrade'] = None # zero trading days

  result = getattr(ufunc, method)(*inputs, **kwargs)
  df['turn'] = df['turnover'] # turnover
  df['zerotrade'] = None # zero trading days


In [17]:
# Final clean-up, in order:
#  1. drop columns that are entirely null (the None placeholders above),
#  2. drop rows where mom12m or target is NaN (first 12 months and last
#     month of each ticker),
#  3. drop every remaining row with any NaN,
#  4. renumber the rows 0..n-1.
# NOTE(review): dropna() does not remove +/-inf values.
df = (
    df
    .dropna(axis=1, how='all')
    .dropna(subset=['mom12m', 'target'])
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Persist the final feature table as CSV; the row index is not written.
df.to_csv('data/data.csv', index=False)