In [1]:
import pandas as pd
import numpy as np
from functions import *
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

**Table of contents**<a id='toc0_'></a>    
- [Combining data sources](#toc1_)    
    - [Add industry data for DK](#toc1_1_1_)    
    - [Add financial data](#toc1_1_2_)    
    - [Add beta](#toc1_1_3_)    
  - [Add risk-free rate](#toc1_2_)    
- [Create variables](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Combining data sources](#toc0_)

In [2]:
trade = pd.read_csv('data/trade.csv', header=[0,1], index_col=0)
stocks = pd.read_csv('data/stocks.csv')
dk_industry = pd.read_csv('data/dk_industry.csv')
financials = pd.read_csv('data/financials.csv')
# beta = pd.read_csv('data/beta.csv')
rf_rate = pd.read_csv('data/rf_rate.csv')
trade_daily = pd.read_csv('data/trade_daily.csv')

In [3]:
# offsets
quarterly_offset = 3 # quarterly data is 2 months behind end of quarter
annual_offset = 6 # annual data is 6 months behind publication

In [4]:
# convert the index to datetime (the index holds the dates)
trade.index = pd.to_datetime(trade.index)
trade = trade.stack(level=0).reset_index()


# rename columns to have a proper
trade.rename(columns={'level_1': 'ticker'}, inplace=True)

# Data Cleaning and Sorting
trade.drop_duplicates(inplace=True)
trade.dropna(inplace=True) # happens if there was one data point the first day of a given ticker but not the rest of the values (e.g. trade values but no ask or bid)
trade.sort_values(['ticker', 'timestamp'], inplace=True)
trade.reset_index(drop=True, inplace=True)


  trade = trade.stack(level=0).reset_index()


In [5]:
trade

Unnamed: 0,timestamp,ticker,ask,bid,adjclose,high,low,open,volume,turnover,div,divi,divo,dy
0,2001-02-28,AAB.CO,3765.143726,3660.556401,3765.143726,4235.786692,3399.088086,4131.199367,5.061034e+02,2.047656e+06,0.0,0.0,0.0,0.0
1,2001-03-31,AAB.CO,3660.556401,3529.822244,3503.675412,3765.143726,3451.381749,3765.143726,1.172800e+02,4.385660e+05,0.0,0.0,0.0,0.0
2,2001-04-30,AAB.CO,3346.794423,3189.913435,3189.913435,4183.493029,3189.913435,4183.493029,1.263059e+02,4.467180e+05,0.0,0.0,0.0,0.0
3,2001-05-31,AAB.CO,3137.619772,3006.885615,2876.151458,3399.088086,2876.151458,3137.619772,1.653355e+02,5.048560e+05,0.0,0.0,0.0,0.0
4,2001-06-30,AAB.CO,2928.445121,2823.857795,2928.445121,3033.032446,2771.564132,3033.032446,1.220033e+02,3.534210e+05,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81279,2024-12-31,ZELA.CO,716.500000,715.500000,715.500000,819.000000,587.000000,728.000000,4.138727e+06,2.995576e+09,0.0,0.0,0.0,0.0
81280,2025-01-31,ZELA.CO,738.000000,737.000000,735.500000,808.000000,682.500000,718.500000,3.538390e+06,2.572094e+09,0.0,0.0,0.0,0.0
81281,2025-02-28,ZELA.CO,665.000000,664.000000,663.000000,788.000000,657.000000,714.500000,3.876841e+06,2.801496e+09,0.0,0.0,0.0,0.0
81282,2025-03-31,ZELA.CO,520.500000,519.500000,517.000000,725.000000,486.400000,664.500000,1.085072e+07,6.399725e+09,0.0,0.0,0.0,0.0


In [6]:
df = pd.merge(trade, stocks[['ticker','shares','NACE']], how='left', on=['ticker'])
# display(df)


### <a id='toc1_1_1_'></a>[Add industry data for DK](#toc0_)

In [7]:
dk_industry['timestamp'] = pd.to_datetime(dk_industry['timestamp'])
# join stocks and dk_industry on 'NACE industry' 
industry = stocks[['ticker','NACE']].merge(dk_industry, how='left', on='NACE')

# adjust the timestamp to be 2 months behind
industry['timestamp'] = industry['timestamp'] + pd.DateOffset(months=quarterly_offset)
industry['timestamp'] = (
    industry['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

industry = industry.drop(columns=['NACE'])

In [8]:
df = pd.merge(df, industry, how='left', on=['timestamp', 'ticker'])

#ffil the industry values
for col in industry.columns[2:]:
    df[col] = df[col].groupby(df['ticker']).ffill()


### <a id='toc1_1_2_'></a>[Add financial data](#toc0_)

In [9]:
financials['timestamp'] = pd.to_datetime(financials['timestamp'])

financials['timestamp'] = financials['timestamp'] + pd.DateOffset(months=annual_offset)
financials['timestamp'] = (
    financials['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

# expand the dataset
financials = (
    financials
        # sort data and find the next timestamp
      .sort_values(['ticker','timestamp'])
      .assign(
        next_fye   = lambda df: df.groupby('ticker')['timestamp'].shift(-1),
        plus_12m   = lambda df: df['timestamp'] + pd.DateOffset(months=12),
        period_end = lambda df: pd.to_datetime(np.where(
                          (df.next_fye - df.timestamp).abs()
                            < 
                          (df.plus_12m   - df.timestamp).abs(),
                          df.next_fye,
                          df.plus_12m
                        ))
      )
      # expand the data
      .assign(timestamp = lambda df: df.apply(expand_monthly, axis=1))
      .explode('timestamp')
      .drop(columns=['next_fye','plus_12m','period_end'])
      .reset_index(drop=True)
)

In [10]:
# join df and financials
df = pd.merge(df, financials, how='left', on=['timestamp', 'ticker'])

### <a id='toc1_1_3_'></a>[Add variables calculated on daily data](#toc0_)

In [11]:
trade_daily['timestamp'] = pd.to_datetime(trade_daily['timestamp'])

In [12]:
df.drop(columns=['volume'], inplace=True)

In [13]:
# join df and beta
df = pd.merge(df, trade_daily, how='left', on=['timestamp', 'ticker'])

## <a id='toc1_2_'></a>[Add risk-free rate](#toc0_)

In [14]:
rf_rate['timestamp'] = pd.to_datetime(rf_rate['timestamp'])
# set date as index and resample by month, taking the mean of diskonto & folio
monthly_avg = (
    rf_rate
    .set_index('timestamp')
    .resample('ME')[['diskonto', 'folio']]
    .mean()
    .rename_axis('timestamp')
    .reset_index()
)
# convert to monthly rates
monthly_avg['diskonto'] = monthly_avg['diskonto'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg['folio'] = monthly_avg['folio'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg
# drop diskonto and rename folio to risk-free
monthly_avg.drop(columns=['diskonto'], inplace=True)
monthly_avg.rename(columns={'folio': 'risk_free'}, inplace=True)

In [15]:
# join df and beta
df = pd.merge(df, monthly_avg, how='left', on=['timestamp'])

# display(df)

# <a id='toc2_'></a>[Feature engineering](#toc0_)

In [16]:
df['adjclose_div'] = df['adjclose'] + df['div'] # adjcose incl monthly dividend
df['target'] = df.groupby('ticker')['adjclose_div'].transform(lambda x: x.pct_change(periods=1, fill_method=None))

# subtract the risk-free rate from the target
df['target'] = df['target'] #- df['risk_free']
df['target'] = df['target'].shift(-1) # shift the target by 1 month

df['mcap'] = df['adjclose'] * df['shares'] # market cap

# other features
df['acc'] = (df['netinc'] - df['cashflow'])/df['assets'] # accruals

df['absacc'] = np.abs(df['acc']) # absolute accruals

df['aeavol'] = None # abnormal earnings announcement volume

df['age'] = (df['timestamp'] - df.groupby('ticker')['timestamp'].transform('min')).dt.days / 365.25 # age of the stock

df['agr'] = (df.groupby('ticker')['assets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # yearly asset growth

df['betasq'] = df['betasq'] # beta squared

df['bm'] = (df['assets'] - df['debt']) / df['mcap'] # book to market ratio #nb to check if debt includes both current and long term liabilities (pretty sure it does)

df['bm_ia'] = None # industry adjusted book to market ratio

df['cash'] = (df['cash'] / df['assets']).replace([np.inf, -np.inf], 0) # cash to assets ratio

df['cashdebt'] = (df['cashflow'] / df['debt']).fillna(0).replace([np.inf, -np.inf], 0) # cashflow to debt ratio

df['cashpr'] = (df['mcap'] + df['debt'] - df['assets']) / df['cash']

df['cfp'] = df['cashflow'] / df['mcap'] # cashflow to market cap ratio

df['cfp_ia'] = None # industry adjusted cashflow to market cap ratio

df['chatoia'] = None # industry adjusted change in total assets

df['chcsho'] = None # change in common shares outstanding

df['chempia'] = None # industry adjusted change in employees

df['chinv'] = df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0)

df['chmom'] = df['mom6m'] - df['mom7_12m'] # change in momentum
df.drop(columns=['mom7_12m'], inplace=True) # only used for this calculation

df['chpmia'] = None # industry adjusted profit margin

df['chtx'] = None # change in tax rate

df['cinvest'] = None # corporate investment

df['convdebt'] = None # convertible debt

df['depr'] = None # depreciation/PP&E

df['divi'] = df['divi'].fillna(0) # dividens initiation

df['divo'] = df['divo'].fillna(0) # dividens omission

df['dkkvol'] = df['dkkvol'] # dkk volume (dollar volume in Gu et. al.

df['dy'] = df['dy']/df['adjclose'] # dividend to price (annualized)

df['ear'] = None # earnings announcement return

df['egr'] = None # growth in common shareholders equity

df['ep'] = df['revenue'] / df['mcap'] # earnings to market cap ratio

df['gma'] = df['grossprofit'] / df['assets'] # gross profitability

df['grCAPX'] = None # growth in capital expenditures

df['grlnoa'] = None # growth in long-term net operating assets

df['herf'] = None # herfindahl index

df['hire'] = None # hiring

df['idiovol'] = df['idiovol'] # idiosyncratic volatility

df['ill'] = df['ill'] # illiquidity (to be calculated on the daily data)

df['indmom'] = df.groupby(['NACE', 'timestamp'])['mom12m'].transform('mean') # industry momentum

df['invest'] = None # capital expenditures and investment

df['lev'] = df['debt'] / df['assets'] # leverage

df['lgr'] = None # growth in long-term debt

df['maxret'] = df['maxret'] # maximum daily return

df['ms'] = None # financial statement score

df['mvel1'] = np.log(df['mcap']) # Size (log of market cap)

df['mve_ia'] = None # industry adjusted size

df['nincr'] = None # number of earnings increases

df['operprof'] = df['opinc'] / df['assets'] # operating profitability

df['orgcap'] = None # organizational capital

df['pchcapx_ia'] = None # industry adjusted % change in capital expenditures

df['pchcurrat'] = (df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in current ratio

df['pchdpr'] = None # % change in depreciation

df['pchgm_pchsale'] = (df.groupby('ticker')['grossprofit'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in gross margin - % change in sales

df['pchquick'] = None # % change in quick ratio

df['pchsale_pchinvt'] = (df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in sales - % change in inventory

df['pchsale_pchrect'] = None # % change in sales - % change in A/R

df['pchsale_pchxsga'] = None # % change in sales - % change in SG&A

df['saleinv'] = (df['revenue'] / df['currentassets']).fillna(0).replace([np.inf, -np.inf], 0) # sales to inventory ratio

df['pchsaleinv'] = (df.groupby('ticker')['saleinv'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in sales to inventory ratio
df['pchsaleinv'] = df['pchsaleinv'].replace([np.inf, -np.inf], 0)

df['pctacc'] = (df['netinc'] - df['cashflow'])/np.abs(df['netinc']) # % accruals

df['pricedelay'] = df['pricedelay']  # price delay

df['ps'] = None # financial statements score

df['quick'] = None # quick ratio

df['rd'] = (df.groupby('ticker')['rnd'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # R&D increase

df['rd_mve'] = df['rnd'].fillna(0) / df['mcap'] # R&D to market cap ratio

df['rd_sale'] = (df['rnd'].fillna(0) / df['revenue']).fillna(0).replace([np.inf, -np.inf], 0) # R&D to sales ratio

df['realestate'] = None # real estate holdings

df['retvol'] = df['retvol'] # return volatility (to be calculated on the daily data)

df['roaq'] = df['netinc'] / df['assets'] # return on assets

df['roavol'] = None # earning volatility

df['roeq'] = None # df['netinc'] / df['equity'] # return on equity

df['roic'] = None # df['netinc'] / (df['debt'] + df['equity']) # return on invested capital

df['rsup'] = None # revnue surprise

df['salecash'] = (df['revenue'] / df['cash']).fillna(0).replace([np.inf, -np.inf], 0) # sales to cash ratio 

df['salerec'] = None # df['revenue'] / df['receivables'] # sales to receivables ratio

df['secured'] = None # secured debt

df['securedind'] = None # secured debt indicator

df['sgr'] = (df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # sales growth

df['sin'] = None # sin stock indicator

df['sp'] = df['revenue'].fillna(0) / df['mcap'] # sales to price ratio

df['std_dkkvol'] = df['std_dkkvol']  # standard deviation of dkk volume

df['std_turn'] = df['std_turn']  # standard deviation of turnover

df['stdacc'] = None # standard deviation of accruals

df['stdcf'] = None # standard deviation of cashflow

df['tang'] = None # debt capacity/firm tangibility

df['tb'] = None # tax income to book income

df['turn'] = df['turn']  # turnover

df['zerotrade'] = df['zerotrade'] # zero trading days


  result = getattr(ufunc, method)(*inputs, **kwargs)
  df['tb'] = None # tax income to book income


In [17]:
# drop columns that are not needed
df.drop(columns=['adjclose', 'high', 'low', 'open', 'div', 'ask', 'bid','turnover','shares',
                 'grossoutput', 'intercons', 'prodind', 'valaddoutput',
                 'assets', 'debt', 'opinc', 'netinc', 'rnd', 'cashflow', 'revenue', 'grossprofit', 'currentassets', 'volume' 
                 ], inplace=True)

# remove empty columns (None from above)
for col in df.columns:
    if df[col].isnull().all():
        df.drop(columns=[col], inplace=True)

# drop if mom12m or target is nan (first 12 months and last month for each ticker)
df.dropna(subset=['mom12m','target'], inplace=True)

# drop if any nan
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

cols = df.columns
cols = cols.sort_values()
for col in cols:
    print(f'{col}')

NACE
absacc
acc
adjclose_div
age
agr
baspread
beta
betasq
bm
cash
cashdebt
cashpr
cfp
chinv
chmom
divi
divo
dkkvol
dy
ep
gma
idiovol
ill
indmom
lev
maxret
mcap
mom12m
mom1m
mom6m
mvel1
operprof
pchcurrat
pchgm_pchsale
pchsale_pchinvt
pchsaleinv
pctacc
pricedelay
rd
rd_mve
rd_sale
retvol
risk_free
roaq
salecash
saleinv
sgr
sp
std_dkkvol
std_turn
target
ticker
timestamp
turn
zerotrade


In [18]:
df

Unnamed: 0,timestamp,ticker,divi,divo,dy,NACE,cash,maxret,retvol,turn,...,saleinv,pchsaleinv,pctacc,rd,rd_mve,rd_sale,roaq,salecash,sgr,sp
0,2002-01-31,AAB.CO,0.0,0.0,0.0,93,0.317120,0.040816,0.018440,2.590909e+04,...,0.934635,0.000000,-3.630328,0.000000,0.000000,0.000000,-0.046779,1.667411e+08,0.000000,0.007500
1,2002-02-28,AAB.CO,0.0,0.0,0.0,93,0.317120,0.058824,0.025166,8.690000e+04,...,0.934635,0.000000,-3.630328,0.000000,0.000000,0.000000,-0.046779,1.667411e+08,0.000000,0.006697
2,2002-03-31,AAB.CO,0.0,0.0,0.0,93,0.317120,0.125000,0.040068,3.347368e+04,...,0.934635,0.000000,-3.630328,0.000000,0.000000,0.000000,-0.046779,1.667411e+08,0.000000,0.005953
3,2002-04-30,AAB.CO,0.0,0.0,0.0,93,0.317120,0.035088,0.019353,3.860000e+04,...,0.934635,0.000000,-3.630328,0.000000,0.000000,0.000000,-0.046779,1.667411e+08,0.000000,0.006356
4,2002-05-31,AAB.CO,0.0,0.0,0.0,93,0.317120,0.020833,0.021445,6.504762e+04,...,0.934635,0.000000,-3.630328,0.000000,0.000000,0.000000,-0.046779,1.667411e+08,0.000000,0.007653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45715,2024-11-30,ZELA.CO,0.0,0.0,0.0,72,0.824779,0.061986,0.041247,1.708354e+08,...,0.192600,1.438117,-0.035327,0.129009,0.012847,1.943863,-0.355425,4.156118e+08,2.296482,0.006609
45716,2024-12-31,ZELA.CO,0.0,0.0,0.0,72,0.824779,0.052288,0.026187,1.664209e+08,...,0.192600,1.438117,-0.035327,0.129009,0.013107,1.943863,-0.355425,4.156118e+08,2.296482,0.006743
45717,2025-01-31,ZELA.CO,0.0,0.0,0.0,72,0.824779,0.049498,0.023436,1.169134e+08,...,0.192600,1.438117,-0.035327,0.129009,0.012751,1.943863,-0.355425,4.156118e+08,2.296482,0.006560
45718,2025-02-28,ZELA.CO,0.0,0.0,0.0,72,0.824779,0.070494,0.030878,1.400748e+08,...,0.192600,1.438117,-0.035327,0.129009,0.014145,1.943863,-0.355425,4.156118e+08,2.296482,0.007277


In [19]:
# create dummy variables for the industry
industry_dummies = pd.get_dummies(df['NACE'], prefix='NACE')
df = pd.concat([df, industry_dummies], axis=1)
df.drop(columns=['NACE'], inplace=True)

# create dummies for the month
month_dummies = pd.get_dummies(df['timestamp'].dt.month, prefix='month')
df = pd.concat([df, month_dummies], axis=1)

# save the data
df.to_csv('data/data.csv', index=False)

In [20]:
df

Unnamed: 0,timestamp,ticker,divi,divo,dy,cash,maxret,retvol,turn,std_turn,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2002-01-31,AAB.CO,0.0,0.0,0.0,0.317120,0.040816,0.018440,2.590909e+04,2.276152e+04,...,False,False,False,False,False,False,False,False,False,False
1,2002-02-28,AAB.CO,0.0,0.0,0.0,0.317120,0.058824,0.025166,8.690000e+04,1.045395e+05,...,False,False,False,False,False,False,False,False,False,False
2,2002-03-31,AAB.CO,0.0,0.0,0.0,0.317120,0.125000,0.040068,3.347368e+04,4.223396e+04,...,True,False,False,False,False,False,False,False,False,False
3,2002-04-30,AAB.CO,0.0,0.0,0.0,0.317120,0.035088,0.019353,3.860000e+04,8.499746e+04,...,False,True,False,False,False,False,False,False,False,False
4,2002-05-31,AAB.CO,0.0,0.0,0.0,0.317120,0.020833,0.021445,6.504762e+04,3.449852e+04,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45715,2024-11-30,ZELA.CO,0.0,0.0,0.0,0.824779,0.061986,0.041247,1.708354e+08,6.465919e+07,...,False,False,False,False,False,False,False,False,True,False
45716,2024-12-31,ZELA.CO,0.0,0.0,0.0,0.824779,0.052288,0.026187,1.664209e+08,2.382756e+08,...,False,False,False,False,False,False,False,False,False,True
45717,2025-01-31,ZELA.CO,0.0,0.0,0.0,0.824779,0.049498,0.023436,1.169134e+08,4.341962e+07,...,False,False,False,False,False,False,False,False,False,False
45718,2025-02-28,ZELA.CO,0.0,0.0,0.0,0.824779,0.070494,0.030878,1.400748e+08,6.534724e+07,...,False,False,False,False,False,False,False,False,False,False
