In [1]:
import pandas as pd
import numpy as np
from functions import *
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

**Table of contents**<a id='toc0_'></a>    
- [Combining data sources](#toc1_)    
    - [Add industry data for DK](#toc1_1_1_)    
    - [Add financial data](#toc1_1_2_)    
    - [Add beta](#toc1_1_3_)    
  - [Add risk-free rate](#toc1_2_)    
- [Create variables](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Combining data sources](#toc0_)

In [2]:
trade = pd.read_csv('data/trade.csv', header=[0,1], index_col=0)
stocks = pd.read_csv('data/stocks.csv')
dk_industry = pd.read_csv('data/dk_industry.csv')
financials = pd.read_csv('data/financials.csv')
beta = pd.read_csv('data/beta.csv')
rf_rate = pd.read_csv('data/rf_rate.csv')

In [3]:
# offsets
quarterly_offset = 1 # quarterly data is 2 months behind end of quarter
annual_offset = 6 # annual data is 6 months behind publication

In [4]:
# convert the index to datetime (the index holds the dates)
trade.index = pd.to_datetime(trade.index)
trade = trade.stack(level=0).reset_index()


# rename columns to have a proper
trade.rename(columns={'level_1': 'ticker'}, inplace=True)

# Data Cleaning and Sorting
trade.drop_duplicates(inplace=True)
trade.dropna(inplace=True) # happens if there was one data point the first day of a given ticker but not the rest of the values (e.g. trade values but no ask or bid)
trade.sort_values(['ticker', 'timestamp'], inplace=True)
trade.reset_index(drop=True, inplace=True)


  trade = trade.stack(level=0).reset_index()


In [5]:
df = pd.merge(trade, stocks[['ticker','shares','NACE']], how='left', on=['ticker'])
# display(df)


### <a id='toc1_1_1_'></a>[Add industry data for DK](#toc0_)

In [6]:
dk_industry['timestamp'] = pd.to_datetime(dk_industry['timestamp'])
# join stocks and dk_industry on 'NACE industry' 
industry = stocks[['ticker','NACE']].merge(dk_industry, how='left', on='NACE')

# adjust the timestamp to be 2 months behind
industry['timestamp'] = industry['timestamp'] + pd.DateOffset(months=2+quarterly_offset)
industry['timestamp'] = (
    industry['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

industry = industry.drop(columns=['NACE'])

In [7]:
df = pd.merge(df, industry, how='left', on=['timestamp', 'ticker'])

#ffil the industry values
for col in industry.columns[2:]:
    df[col] = df[col].groupby(df['ticker']).ffill()


### <a id='toc1_1_2_'></a>[Add financial data](#toc0_)

In [8]:
financials['timestamp'] = pd.to_datetime(financials['timestamp'])

financials['timestamp'] = financials['timestamp'] + pd.DateOffset(months=annual_offset)
financials['timestamp'] = (
    financials['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

# expand the dataset
financials = (
    financials
        # sort data and find the next timestamp
      .sort_values(['ticker','timestamp'])
      .assign(
        next_fye   = lambda df: df.groupby('ticker')['timestamp'].shift(-1),
        plus_12m   = lambda df: df['timestamp'] + pd.DateOffset(months=12),
        period_end = lambda df: pd.to_datetime(np.where(
                          (df.next_fye - df.timestamp).abs()
                            < 
                          (df.plus_12m   - df.timestamp).abs(),
                          df.next_fye,
                          df.plus_12m
                        ))
      )
      # expand the data
      .assign(timestamp = lambda df: df.apply(expand_monthly, axis=1))
      .explode('timestamp')
      .drop(columns=['next_fye','plus_12m','period_end'])
      .reset_index(drop=True)
)

In [9]:
# join df and financials
df = pd.merge(df, financials, how='left', on=['timestamp', 'ticker'])

# display(df)

### <a id='toc1_1_3_'></a>[Add beta](#toc0_)

In [10]:
beta['timestamp'] = pd.to_datetime(beta['timestamp'])

# adjust to be last trading day of the month only and set to end of month
monthly_beta = (
    beta
    .groupby(
        ['ticker', pd.Grouper(key='timestamp', freq='ME')],
         as_index=False
    )['beta']
    .last()
)

In [11]:
# join df and beta
df = pd.merge(df, monthly_beta, how='left', on=['timestamp', 'ticker'])

## <a id='toc1_2_'></a>[Add risk-free rate](#toc0_)

In [12]:
rf_rate['timestamp'] = pd.to_datetime(rf_rate['timestamp'])
# set date as index and resample by month, taking the mean of diskonto & folio
monthly_avg = (
    rf_rate
    .set_index('timestamp')
    .resample('ME')[['diskonto', 'folio']]
    .mean()
    .rename_axis('timestamp')
    .reset_index()
)
# convert to monthly rates
monthly_avg['diskonto'] = monthly_avg['diskonto'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg['folio'] = monthly_avg['folio'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg
# drop diskonto and rename folio to risk-free
monthly_avg.drop(columns=['diskonto'], inplace=True)
monthly_avg.rename(columns={'folio': 'risk_free'}, inplace=True)

In [13]:
rf_rate

Unnamed: 0,timestamp,diskonto,folio
0,2000-01-03,0.030,0.030
1,2000-01-04,0.030,0.030
2,2000-01-05,0.030,0.030
3,2000-01-06,0.030,0.030
4,2000-01-07,0.030,0.030
...,...,...,...
6326,2025-04-04,0.021,0.021
6327,2025-04-07,0.021,0.021
6328,2025-04-08,0.021,0.021
6329,2025-04-09,0.021,0.021


In [14]:
# join df and beta
df = pd.merge(df, monthly_avg, how='left', on=['timestamp'])

# display(df)

In [15]:
print(df.columns)

Index(['timestamp', 'ticker', 'ask', 'bid', 'adjclose', 'high', 'low', 'open',
       'volume', 'turnover', 'shares', 'NACE', 'grossoutput', 'intercons',
       'invest', 'prodind', 'valaddoutput', 'assets', 'netinc', 'revenue',
       'rnd', 'cash', 'grossprofit', 'opinc', 'cashflow', 'debt',
       'currentassets', 'beta', 'risk_free'],
      dtype='object')


# <a id='toc2_'></a>[Feature engineering](#toc0_)

In [None]:
df['target'] = df.groupby('ticker')['adjclose'].transform(lambda x: x.pct_change(periods=1, fill_method=None))

# subtract the risk-free rate from the target
df['target'] = df['target'] - df['risk_free']
# df['target'] = df['target'].shift(-1) # shift the target by 1 month

df['mcap'] = df['adjclose'] * df['shares'] # market cap

# momentum
momentum_periods = {'mom1m': 1, 'mom3m': 3, 'mom6m': 6, 'mom12m': 12}

# compute percentage change over each period for each ticker separately.
for feature_name, period in momentum_periods.items():
    df[feature_name] = df.groupby('ticker')['adjclose'].transform(lambda x: x.pct_change(periods=period))

# df['mom36'] = df.groupby('ticker')['adjclose'].transform(lambda x: x.pct_change(periods=36)) # dropped as we only have ~24 years of data in total

# other features
df['absacc'] = None # absolute accruals

df['acc'] = None # accruals

df['aeavol'] = None # abnormal earnings announcement volume

df['age'] = (df['timestamp'] - df.groupby('ticker')['timestamp'].transform('min')).dt.days / 365.25 # age of the stock

df['agr'] = df.groupby('ticker')['assets'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # yearly asset growth

df['baspread_close'] = (df['ask']-df['bid'])/((df['ask']+df['bid'])/2) # bid-ask spread

df['betasq'] = df['beta'] ** 2 # beta squared

df['bm'] = (df['assets'] - df['debt']) / df['mcap'] # book to market ratio #nb to check if debt includes both current and long term liabilities (pretty sure it does)

df['bm_ia'] = None # industry adjusted book to market ratio

df['cash'] = df['cash'] / df['assets'] # cash to assets ratio

df['cashdebt'] = df['cashflow'] / df['debt'] # cashflow to debt ratio

df['cashpr'] = None # cash productivity

df['cfp'] = df['cashflow'] / df['mcap'] # cashflow to market cap ratio

df['cfp_ia'] = None # industry adjusted cashflow to market cap ratio

df['chatoia'] = None # industry adjusted change in total assets

df['chcsho'] = None # change in common shares outstanding

df['chempia'] = None # industry adjusted change in employees

df['chinv'] = df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))

df['chmom'] = df.groupby('ticker')['mom6m'].transform(lambda x: x - x.shift(7)) # change in momentum

df['chpmia'] = None # industry adjusted profit margin

df['chtx'] = None # change in tax rate

df['cinvest'] = None # corporate investment

df['convdebt'] = None # convertible debt

df['depr'] = None # depreciation/PP&E

df['divi'] = None # dividens initiation

df['divo'] = None # dividens omission

df['dkkvol'] = df['adjclose'] * df['volume'] # dkk volume (dollar volume in Gu et. al.

df['dy'] = None # dividend to price (annualized)

df['ear'] = None # earnings announcement return

df['egr'] = None # growth in common shareholders equity

df['ep'] = df['revenue'] / df['mcap'] # earnings to market cap ratio

df['gma'] = df['grossprofit'] / df['assets'] # gross profitability

df['grCAPX'] = None # growth in capital expenditures

df['grlnoa'] = None # growth in long-term net operating assets

df['herf'] = None # herfindahl index

df['hire'] = None # hiring

df['idiovol'] = None # idiosyncratic volatility

df['ill'] = None # illiquidity (to be calculated on the daily data)

df['indmom'] = df.groupby(['NACE', 'timestamp'])['mom12m'].transform('mean') # industry momentum

df['invest'] = None # capital expenditures and investment

df['lev'] = df['debt'] / df['assets'] # leverage

df['lgr'] = None # growth in long-term debt

df['maxret'] = None # maximum daily return

df['ms'] = None # financial statement score

df['mvel1'] = np.log(df['mcap']) # Size (log of market cap)

df['mve_ia'] = None # industry adjusted size

df['nincr'] = None # number of earnings increases

df['operprof'] = df['opinc'] / df['assets'] # operating profitability

df['orgcap'] = None # organizational capital

df['pchcapx_ia'] = None # industry adjusted % change in capital expenditures

df['pchcurrat'] = df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # % change in current ratio

df['pchdpr'] = None # % change in depreciation

df['pchgm_pchsale'] = df.groupby('ticker')['grossprofit'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # % change in gross margin - % change in sales

df['pchquick'] = None # % change in quick ratio

df['pchsale_pchinvt'] = df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # % change in sales - % change in inventory

df['pchsale_pchrect'] = None # % change in sales - % change in A/R

df['pchsale_pchxsga'] = None # % change in sales - % change in SG&A

df['saleinv'] = df['revenue'] / df['currentassets'] # sales to inventory ratio

df['pchsaleinv'] = df.groupby('ticker')['saleinv'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # % change in sales to inventory ratio

df['pctacc'] = None # % accruals

df['pricedelay'] = None # price delay

df['ps'] = None # financial statements score

df['quick'] = None # quick ratio

df['rd'] = df.groupby('ticker')['rnd'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # R&D increase

df['rd_mve'] = df['rnd'] / df['mcap'] # R&D to market cap ratio

df['rd_sale'] = df['rnd'] / df['revenue'] # R&D to sales ratio

df['realestate'] = None # real estate holdings

df['retvol'] = None # return volatility (to be calculated on the daily data)

df['roaq'] = df['netinc'] / df.groupby('ticker')['assets'].shift(-12) # return on assets

df['roavol'] = None # earning volatility

df['roeq'] = None # df['netinc'] / df.groupby('ticker')['equity'].shift(-12) # return on equity

df['roic'] = None # df['netinc'] / (df['debt'] + df['equity']) # return on invested capital

df['rsup'] = None # revnue surprise

df['salecash'] = df['revenue'] / df['cash'] # sales to cash ratio 

df['saleinv'] = df['revenue'] / df['currentassets'] # sales to inventory ratio

df['salerec'] = None # df['revenue'] / df['receivables'] # sales to receivables ratio

df['secured'] = None # secured debt

df['securedind'] = None # secured debt indicator

df['sgr'] = df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) # sales growth

df['sin'] = None # sin stock indicator

df['sp'] = df['revenue'] / df['mcap'] # sales to price ratio

df['std_dkkvol'] = None # standard deviation of dkk volume

df['std_turn'] = None # standard deviation of turnover

df['stdacc'] = None # standard deviation of accruals

df['stdcf'] = None # standard deviation of cashflow

df['tang'] = None # debt capacity/firm tangibility

df['tb'] = None # tax income to book income

df['turn'] = df['turnover'] # turnover

df['zerotrade'] = None # zero trading days

# remove empty columns (None from above)
for col in df.columns:
    if df[col].isnull().all():
        df.drop(columns=[col], inplace=True)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [17]:
print(df.columns)

Index(['timestamp', 'ticker', 'ask', 'bid', 'adjclose', 'high', 'low', 'open',
       'volume', 'turnover', 'shares', 'NACE', 'grossoutput', 'intercons',
       'prodind', 'valaddoutput', 'assets', 'netinc', 'revenue', 'rnd', 'cash',
       'grossprofit', 'opinc', 'cashflow', 'debt', 'currentassets', 'beta',
       'risk_free', 'target', 'mcap', 'mom1m', 'mom3m', 'mom6m', 'mom12m',
       'age', 'agr', 'baspread_close', 'betasq', 'bm', 'cashdebt', 'cfp',
       'chinv', 'chmom', 'dkkvol', 'ep', 'gma', 'indmom', 'lev', 'mvel1',
       'operprof', 'pchcurrat', 'pchgm_pchsale', 'pchsale_pchinvt', 'saleinv',
       'pchsaleinv', 'rd', 'rd_mve', 'rd_sale', 'roaq', 'salecash', 'sgr',
       'sp', 'turn'],
      dtype='object')


In [18]:
# drop if mom12m or target is nan (first 12 months and last month for each ticker)
df.dropna(subset=['mom12m','target'], inplace=True)

# drop if any nan
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# save the data
df.to_csv('data/data.csv', index=False)