In [1]:
import pandas as pd
import numpy as np
from functions import *
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

**Table of contents**<a id='toc0_'></a>    
- [Combining data sources](#toc1_)    
    - [Add industry data for DK](#toc1_1_1_)    
    - [Add financial data](#toc1_1_2_)    
    - [Add beta](#toc1_1_3_)    
  - [Add risk-free rate](#toc1_2_)    
- [Create variables](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Combining data sources](#toc0_)

In [2]:
trade = pd.read_csv('data/trade.csv', header=[0,1], index_col=0)
stocks = pd.read_csv('data/stocks.csv')
dk_industry = pd.read_csv('data/dk_industry.csv')
financials = pd.read_csv('data/financials.csv')
# beta = pd.read_csv('data/beta.csv')
rf_rate = pd.read_csv('data/rf_rate.csv')
trade_daily = pd.read_csv('data/trade_daily.csv')

In [3]:
# offsets
quarterly_offset = 3 # quarterly data is 2 months behind end of quarter
annual_offset = 6 # annual data is 6 months behind publication

In [4]:
# convert the index to datetime (the index holds the dates)
trade.index = pd.to_datetime(trade.index)
trade = trade.stack(level=0).reset_index()


# rename columns to have a proper
trade.rename(columns={'level_1': 'ticker'}, inplace=True)

# Data Cleaning and Sorting
trade.drop_duplicates(inplace=True)
trade.dropna(inplace=True) # happens if there was one data point the first day of a given ticker but not the rest of the values (e.g. trade values but no ask or bid)
trade.sort_values(['ticker', 'timestamp'], inplace=True)
trade.reset_index(drop=True, inplace=True)


  trade = trade.stack(level=0).reset_index()


In [5]:
df = pd.merge(trade, stocks[['ticker','shares','NACE']], how='left', on=['ticker'])
# display(df)


### <a id='toc1_1_1_'></a>[Add industry data for DK](#toc0_)

In [6]:
dk_industry['timestamp'] = pd.to_datetime(dk_industry['timestamp'])
# join stocks and dk_industry on 'NACE industry' 
industry = stocks[['ticker','NACE']].merge(dk_industry, how='left', on='NACE')

# adjust the timestamp to be 2 months behind
industry['timestamp'] = industry['timestamp'] + pd.DateOffset(months=quarterly_offset)
industry['timestamp'] = (
    industry['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

industry = industry.drop(columns=['NACE'])

In [7]:
df = pd.merge(df, industry, how='left', on=['timestamp', 'ticker'])

#ffil the industry values
for col in industry.columns[2:]:
    df[col] = df[col].groupby(df['ticker']).ffill()


### <a id='toc1_1_2_'></a>[Add financial data](#toc0_)

In [8]:
financials['timestamp'] = pd.to_datetime(financials['timestamp'])

financials['timestamp'] = financials['timestamp'] + pd.DateOffset(months=annual_offset)
financials['timestamp'] = (
    financials['timestamp']
      .dt.to_period('M')
      .dt.to_timestamp('M')
)

# expand the dataset
financials = (
    financials
        # sort data and find the next timestamp
      .sort_values(['ticker','timestamp'])
      .assign(
        next_fye   = lambda df: df.groupby('ticker')['timestamp'].shift(-1),
        plus_12m   = lambda df: df['timestamp'] + pd.DateOffset(months=12),
        period_end = lambda df: pd.to_datetime(np.where(
                          (df.next_fye - df.timestamp).abs()
                            < 
                          (df.plus_12m   - df.timestamp).abs(),
                          df.next_fye,
                          df.plus_12m
                        ))
      )
      # expand the data
      .assign(timestamp = lambda df: df.apply(expand_monthly, axis=1))
      .explode('timestamp')
      .drop(columns=['next_fye','plus_12m','period_end'])
      .reset_index(drop=True)
)

# display(financials)

In [9]:
# join df and financials
df = pd.merge(df, financials, how='left', on=['timestamp', 'ticker'])

### <a id='toc1_1_3_'></a>[Add variables calculated on daily data](#toc0_)

In [10]:
trade_daily['timestamp'] = pd.to_datetime(trade_daily['timestamp'])

In [11]:
df.drop(columns=['volume'], inplace=True)

In [12]:
# join df and beta
df = pd.merge(df, trade_daily, how='left', on=['timestamp', 'ticker'])

## <a id='toc1_2_'></a>[Add risk-free rate](#toc0_)

In [13]:
rf_rate['timestamp'] = pd.to_datetime(rf_rate['timestamp'])
# set date as index and resample by month, taking the mean of diskonto & folio
monthly_avg = (
    rf_rate
    .set_index('timestamp')
    .resample('ME')[['diskonto', 'folio']]
    .mean()
    .rename_axis('timestamp')
    .reset_index()
)
# convert to monthly rates
monthly_avg['diskonto'] = monthly_avg['diskonto'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg['folio'] = monthly_avg['folio'].apply(lambda x: (1 + x) ** (1/12) - 1)
monthly_avg
# drop diskonto and rename folio to risk-free
monthly_avg.drop(columns=['diskonto'], inplace=True)
monthly_avg.rename(columns={'folio': 'risk_free'}, inplace=True)

In [14]:
# join df and beta
df = pd.merge(df, monthly_avg, how='left', on=['timestamp'])

# display(df)

# <a id='toc2_'></a>[Feature engineering](#toc0_)

In [15]:
df['adjclose_div'] = df['adjclose'] + df['div'] # adjcose incl monthly dividend
df['target'] = df.groupby('ticker')['adjclose_div'].transform(lambda x: x.pct_change(periods=1, fill_method=None))

# subtract the risk-free rate from the target
# df['target'] = df['target'] - df['risk_free']
df['target'] = df['target'].shift(-1) # shift the target by 1 month

df['mcap'] = df['adjclose'] * df['shares'] # market cap

# other features
df['acc'] = (df['netinc'] - df['cashflow'])/df['assets'] # accruals

df['absacc'] = np.abs(df['acc']) # absolute accruals

df['aeavol'] = None # abnormal earnings announcement volume

df['age'] = (df['timestamp'] - df.groupby('ticker')['timestamp'].transform('min')).dt.days / 365.25 # age of the stock

df['agr'] = (df.groupby('ticker')['assets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # yearly asset growth

df['betasq'] = df['betasq'] # beta squared

df['bm'] = (df['assets'] - df['debt']) / df['mcap'] # book to market ratio #nb to check if debt includes both current and long term liabilities (pretty sure it does)

df['bm_ia'] = None # industry adjusted book to market ratio

df['cash'] = (df['cash'] / df['assets']).replace([np.inf, -np.inf], 0) # cash to assets ratio

df['cashdebt'] = (df['cashflow'] / df['debt']).fillna(0).replace([np.inf, -np.inf], 0) # cashflow to debt ratio

df['cashpr'] = (df['mcap'] + df['debt'] - df['assets']) / df['cash']

df['cfp'] = df['cashflow'] / df['mcap'] # cashflow to market cap ratio

df['cfp_ia'] = df['cfp'] - df.groupby(['NACE','timestamp'])['cfp'].transform('mean') # industry adjusted cashflow to market cap ratio

df['chat'] = (df['revenue']/ df['assets']).transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # change in total assets
df['chatoia'] = df['chat'] - df.groupby(['NACE','timestamp'])['chat'].transform('mean') # industry adjusted change in total assets
# df.drop(columns=['chat'], inplace=True) # only used for this calculation

df['chcsho'] = None # change in common shares outstanding

df['chempia'] = None # industry adjusted change in employees

df['chinv'] = df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0)

df['chmom'] = df['mom6m'] - df['mom7_12m'] # change in momentum
df.drop(columns=['mom7_12m'], inplace=True) # only used for this calculation

df['chpm'] = (df['opinc'] / df['revenue']).fillna(0).replace([np.inf, -np.inf], 0) # change in profit margin
df['chpmia'] = df['chpm'] - df.groupby(['NACE','timestamp'])['chpm'].transform('mean') # industry adjusted change in profit margin
# df.drop(columns=['chpm'], inplace=True) # only used for this calculation

df['chtx'] = None # change in tax rate

df['cinvest'] = None # corporate investment

df['convdebt'] = None # convertible debt

df['depr'] = None # depreciation/PP&E

df['divi'] = df['divi'].fillna(0) # dividens initiation

df['divo'] = df['divo'].fillna(0) # dividens omission

df['dkkvol'] = df['dkkvol'] # dkk volume (dollar volume in Gu et. al.)

df['dy'] = df['dy']/df['adjclose'] # dividend to price (annualized)

df['ear'] = None # earnings announcement return

df['egr'] = df.groupby('ticker')['equity'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # growth in common shareholders equity

df['ep'] = df['revenue'] / df['mcap'] # earnings to market cap ratio

df['gma'] = df['grossprofit'] / df['assets'] # gross profitability

df['grCAPX'] = None # growth in capital expenditures

df['grltnoa'] = (df['assets'] - df['currentassets']).transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # growth in long-term net operating assets

df['herf'] = None # herfindahl index

df['hire'] = None # hiring

df['idiovol'] = df['idiovol'] # idiosyncratic volatility

df['ill'] = df['ill'] # illiquidity (to be calculated on the daily data)

df['indmom'] = df.groupby(['NACE', 'timestamp'])['mom12m'].transform('mean') # industry momentum

df['invest'] = None # capital expenditures and investment

df['lev'] = df['debt'] / df['assets'] # leverage

df['lgr'] = df.groupby('ticker')['longdebt'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # long-term debt growth

df['maxret'] = df['maxret'] # maximum daily return

df['ms'] = None # financial statement score

df['mvel1'] = np.log(df['mcap']) # Size (log of market cap)

df['mve_ia'] = df['mvel1'] - df.groupby(['NACE','timestamp'])['mvel1'].transform('mean') # industry adjusted size

df['nincr'] = None # number of earnings increases

df['operprof'] = df['opinc'] / df['assets'] # operating profitability

df['orgcap'] = None # organizational capital

df['pchcapx_ia'] = None # industry adjusted % change in capital expenditures

df['pchcurrat'] = (df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in current ratio

# df['pchdpr'] = df.groupby('ticker')['depreciation'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # % change in depreciation

df['pchgm_pchsale'] = (df.groupby('ticker')['grossprofit'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in gross margin - % change in sales

df['pchquick'] = (df.groupby('ticker')['quick'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in quick ratio

df['pchsale_pchinvt'] = (df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)) - df.groupby('ticker')['currentassets'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in sales - % change in inventory

# df['pchsale_pchrect'] = df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) - df.groupby('ticker')['receivables'].transform(lambda x: x.pct_change(periods=12, fill_method=None)).fillna(0) # % change in sales - % change in receivables

df['pchsale_pchxsga'] = None # % change in sales - % change in SG&A

df['saleinv'] = (df['revenue'] / df['currentassets']).fillna(0).replace([np.inf, -np.inf], 0) # sales to inventory ratio

df['pchsaleinv'] = (df.groupby('ticker')['saleinv'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # % change in sales to inventory ratio
df['pchsaleinv'] = df['pchsaleinv'].replace([np.inf, -np.inf], 0)

df['pctacc'] = (df['netinc'] - df['cashflow'])/np.abs(df['netinc'].replace(0,0.01)) # % accruals

df['pricedelay'] = df['pricedelay']  # price delay

df['ps'] = None # financial statements score

df['quick'] = df['quick'] # quick ratio

df['rd'] = (df.groupby('ticker')['rnd'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # R&D increase

df['rd_mve'] = df['rnd'].fillna(0) / df['mcap'] # R&D to market cap ratio

df['rd_sale'] = (df['rnd'].fillna(0) / df['revenue']).fillna(0).replace([np.inf, -np.inf], 0) # R&D to sales ratio

df['realestate'] = None # real estate holdings

df['retvol'] = df['retvol'] # return volatility

df['roaq'] = df['netinc'] / df['assets'] # return on assets

df['roavol'] = None # earning volatility

df['roeq'] = df['netinc'] / df['equity'] # return on equity

df['roic'] = df['netinc'] / (df['liabilities_equity']) # return on invested capital

df['rsup'] = None # revenue surprise

df['salecash'] = (df['revenue'] / df['cash']).fillna(0).replace([np.inf, -np.inf], 0) # sales to cash ratio 

# df['salerec'] =  df['revenue'] / df['receivables'] # sales to receivables ratio

df['secured'] = None # secured debt

df['securedind'] = None # secured debt indicator

df['sgr'] = (df.groupby('ticker')['revenue'].transform(lambda x: x.pct_change(periods=12, fill_method=None))).fillna(0) # sales growth

df['sin'] = None # sin stock indicator

df['sp'] = df['revenue'].fillna(0) / df['mcap'] # sales to price ratio

df['std_dkkvol'] = df['std_dkkvol']  # standard deviation of dkk volume

df['std_turn'] = df['std_turn']  # standard deviation of turnover

df['stdacc'] = None # standard deviation of accruals

df['stdcf'] = None # standard deviation of cashflow

df['tang'] = None # debt capacity/firm tangibility

df['tb'] = None # tax income to book income

df['turn'] = df['turn']  # turnover

df['zerotrade'] = df['zerotrade'] # zero trading days


  result = getattr(ufunc, method)(*inputs, **kwargs)
  df['tb'] = None # tax income to book income


In [16]:
fin_df = df.copy()

fin_df['adjclose_div'] = fin_df['adjclose'] + fin_df['div'] # adjcose incl monthly dividend
fin_df['target'] = fin_df.groupby('ticker')['adjclose_div'].transform(lambda x: x.pct_change(periods=1, fill_method=None))
fin_df['target'] = fin_df['target'].shift(-1) # shift the target by 1 month

# drop rows where mom12m is NaN
fin_df.dropna(subset=['mom12m','target'], inplace=True)

for col in fin_df.columns:
    if fin_df[col].isnull().all():
        fin_df.drop(columns=[col], inplace=True)


fin_df['year'] = fin_df['timestamp'].dt.year

# 3. Group, count NaNs, then filter out all-zero rows
nan_summary = (
    fin_df
    .groupby(['ticker','year'])
    .apply(lambda grp: grp.isna().sum())                # count NaNs column-wise
    .drop(columns=['timestamp','ticker','year'])         # drop grouping cols
    .reset_index()
)

# 4. Keep only rows where there’s at least one missing value
#    (i.e. the row-sum of NaNs across data columns > 0)
data_cols    = nan_summary.columns.difference(['ticker','year'])
nan_summary  = nan_summary[nan_summary[data_cols].sum(axis=1) > 0]
# nan_summary.drop(columns=['ask','bid','adjclose','high','low','open','turnover','div','beta','betasq','idiovol','pricedelay','mom1m','mom6m','mom12m','mom7_12m','adjclose_div','target','divi','divo','dy', 'shares','NACE','grossoutput','intercons'], inplace=True)

display(nan_summary)

  .apply(lambda grp: grp.isna().sum())                # count NaNs column-wise


Unnamed: 0,ticker,year,ask,bid,adjclose,high,low,open,turnover,div,...,pctacc,rd,rd_mve,rd_sale,roaq,roeq,roic,salecash,sgr,sp
24,AARHUS.CO^D12,2002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,AARHUS.CO^D12,2003,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,AARHUS.CO^D12,2004,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27,AARHUS.CO^D12,2005,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,AARHUS.CO^D12,2006,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4001,VORD.CO^A14,2012,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4002,VORD.CO^A14,2013,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4035,WALLS.CO^I10,2007,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4036,WALLS.CO^I10,2008,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# assume you already have:
#   nan_summary with columns ['ticker','year', <all your data-columns>]

# 1. Identify just the “data” columns (i.e. drop the grouping cols)
data_cols = nan_summary.columns.difference(['ticker','year'])

# 2. Compute total missing per ticker
#    - group by ticker, sum each data column over all years
#    - then sum across those columns to get one number per ticker
ticker_missing = (
    nan_summary
    .groupby('ticker')[data_cols]    # group & select data cols
    .sum()                           # sum each col over years
    .sum(axis=1)                     # sum across cols → one total per ticker
)

# 3. Compute total missing per column
#    - just sum each column over all rows (ticker-years)
column_missing = nan_summary[data_cols].sum()

# 4. Print them out
print("Missing values by ticker:")
print(ticker_missing)

# Filter to only cols with >0 missing
nonzero_column_missing = column_missing[column_missing > 0]

print("Missing values by column (only non-zero):")
print(nonzero_column_missing)


Missing values by ticker:
ticker
AARHUS.CO^D12    123
AFFI.CO^J12       12
ALBCb.CO^F02     155
ALMB.CO          365
ALMBFb.CO^I14    236
                ... 
VIND.CO^K12      209
VJBA.CO          457
VORD.CO^A14      144
WALLS.CO^I10      12
WEWER.CO^L05       2
Length: 151, dtype: int64
Missing values by column (only non-zero):
absacc                 198
acc                    198
assets                 198
bm                     198
cash                   210
cashflow               198
cashpr                 210
cfp                    198
cfp_ia                 198
chatoia                368
currentassets          194
currentliabilities     194
debt                   198
depreciation          7067
ep                     194
equity                 195
gma                    198
grossprofit            194
lev                    198
liabilities_equity     198
longdebt               198
netinc                 198
operprof               198
opinc                  197
pctacc              

In [18]:
# drop columns that are not needed
df.drop(columns=['adjclose', 'high', 'low', 'open', 'div', 'ask', 'bid','turnover','shares',
                 'grossoutput', 'intercons', 'prodind', 'valaddoutput',
                 'assets', 'debt', 'opinc', 'netinc', 'rnd', 'cashflow', 'revenue', 'grossprofit', 'currentassets', 'volume',
                 'currentliabilities', 'liabilities_equity', 'assets', 'longdebt', 'receivables', 'depreciation', 'equity' 
                 ], inplace=True)

# remove empty columns (None from above)
for col in df.columns:
    if df[col].isnull().all():
        df.drop(columns=[col], inplace=True)

# drop if mom12m or target is nan (first 12 months and last month for each ticker)
df.dropna(subset=['mom12m','target'], inplace=True)

# drop if any nan
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

cols = df.columns
cols = cols.sort_values()
for col in cols:
    if col in ['NACE']:
        continue
    print(f'{col}')

absacc
acc
adjclose_div
age
agr
baspread
beta
betasq
bm
cash
cashdebt
cashpr
cfp
cfp_ia
chat
chatoia
chinv
chmom
chpm
chpmia
divi
divo
dkkvol
dy
egr
ep
gma
grltnoa
idiovol
ill
indmom
lev
lgr
maxret
mcap
mom12m
mom1m
mom6m
mve_ia
mvel1
operprof
pchcurrat
pchgm_pchsale
pchquick
pchsale_pchinvt
pchsaleinv
pctacc
pricedelay
quick
rd
rd_mve
rd_sale
retvol
risk_free
roaq
roeq
roic
salecash
saleinv
sgr
sp
std_dkkvol
std_turn
target
ticker
timestamp
turn
zerotrade


In [19]:
df

Unnamed: 0,timestamp,ticker,divi,divo,dy,NACE,cash,quick,maxret,retvol,...,pctacc,rd,rd_mve,rd_sale,roaq,roeq,roic,salecash,sgr,sp
0,2002-01-31,AAB.CO,0.0,0.0,0.0,93,0.317120,3.16768,0.040816,0.018440,...,-3.630328,0.000000,0.000000,0.000000,-0.046779,-0.059482,-0.046779,1.667411e+08,0.000000,0.007500
1,2002-02-28,AAB.CO,0.0,0.0,0.0,93,0.317120,3.16768,0.058824,0.025166,...,-3.630328,0.000000,0.000000,0.000000,-0.046779,-0.059482,-0.046779,1.667411e+08,0.000000,0.006697
2,2002-03-31,AAB.CO,0.0,0.0,0.0,93,0.317120,3.16768,0.125000,0.040068,...,-3.630328,0.000000,0.000000,0.000000,-0.046779,-0.059482,-0.046779,1.667411e+08,0.000000,0.005953
3,2002-04-30,AAB.CO,0.0,0.0,0.0,93,0.317120,3.16768,0.035088,0.019353,...,-3.630328,0.000000,0.000000,0.000000,-0.046779,-0.059482,-0.046779,1.667411e+08,0.000000,0.006356
4,2002-05-31,AAB.CO,0.0,0.0,0.0,93,0.317120,3.16768,0.020833,0.021445,...,-3.630328,0.000000,0.000000,0.000000,-0.046779,-0.059482,-0.046779,1.667411e+08,0.000000,0.007653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45388,2024-11-30,ZELA.CO,0.0,0.0,0.0,72,0.824779,6.22623,0.061986,0.041247,...,-0.035327,0.129009,0.012847,1.943863,-0.355425,-0.441814,-0.355425,4.156118e+08,2.296482,0.006609
45389,2024-12-31,ZELA.CO,0.0,0.0,0.0,72,0.824779,6.22623,0.052288,0.026187,...,-0.035327,0.129009,0.013107,1.943863,-0.355425,-0.441814,-0.355425,4.156118e+08,2.296482,0.006743
45390,2025-01-31,ZELA.CO,0.0,0.0,0.0,72,0.824779,6.22623,0.049498,0.023436,...,-0.035327,0.129009,0.012751,1.943863,-0.355425,-0.441814,-0.355425,4.156118e+08,2.296482,0.006560
45391,2025-02-28,ZELA.CO,0.0,0.0,0.0,72,0.824779,6.22623,0.070494,0.030878,...,-0.035327,0.129009,0.014145,1.943863,-0.355425,-0.441814,-0.355425,4.156118e+08,2.296482,0.007277


In [20]:
# create dummy variables for the industry
industry_dummies = pd.get_dummies(df['NACE'], prefix='NACE')
df = pd.concat([df, industry_dummies], axis=1)
df.drop(columns=['NACE'], inplace=True)

# create dummies for the month
month_dummies = pd.get_dummies(df['timestamp'].dt.month, prefix='month')
df = pd.concat([df, month_dummies], axis=1)

# save the data
df.to_csv('data/data.csv', index=False)