In [1]:
import pandas as pd
import numpy as np
from functions import *
import re

%load_ext autoreload
%autoreload 2

pd.set_option('future.no_silent_downcasting', True)

# Import data

In [2]:
# Folder holding the Eikon Excel exports.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
path = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data'

# Daily workbook: sheet_name=None loads every sheet into a dict keyed by sheet name.
daily = 'eikon_daily.xlsx'
eikon_dfs = pd.read_excel(f'{path}/{daily}', sheet_name=None)
eikon_keys = eikon_dfs.keys()

# Monthly workbook: only the 'Dividend' sheet is read, with row 1 (0-based) as the header row.
monthly = 'eikon_monthly.xlsx'
eikon_divi = pd.read_excel(f'{path}/{monthly}', sheet_name='Dividend', header=1)

# Load in trading data and dividends

In [3]:
# Build the dividend table: one row per (pay date, ticker) with a single 'dividend' amount.

# Drop the sheet's first column and give the remaining columns short working names.
dividend_colnames = {
    'Unnamed: 1': 'ticker',
    'Date': 'announcement timestamp',
    'Dividend Pay Date': 'timestamp',
    'Adjusted Gross Dividend Amount': 'adjdivi gross',
    'Adjusted Net Dividend Amount': 'adjdivi net',
}
divi_raw = eikon_divi.iloc[:, 1:].rename(columns=dividend_colnames)

# Prefer the net amount; fall back to the gross amount where net is missing.
net_or_gross = divi_raw['adjdivi net'].fillna(divi_raw['adjdivi gross'])
# Prefer the pay date; fall back to the announcement date where the pay date is missing.
pay_or_announce = divi_raw['timestamp'].fillna(divi_raw['announcement timestamp'])

# Keep only ticker, date and amount; dates that cannot be parsed become NaT (errors='coerce').
divi = pd.DataFrame({
    'ticker': divi_raw['ticker'],
    'timestamp': pd.to_datetime(pay_or_announce, format='%d-%b-%Y', errors='coerce'),
    'dividend': net_or_gross,
})

# Remove rows with any missing field, then index the table by date.
divi = divi.dropna().reset_index(drop=True).set_index('timestamp')


In [4]:
# Reshape the 'Trade Values' sheet into a long table indexed by timestamp with
# columns ticker / adjclose / volume.

# load trade data, dropping the sheet's first column
trade_values_df = eikon_dfs['Trade Values'].iloc[:,1:]
# the first two rows hold the column labels: use them as a two-level column index
trade_values_df.columns = pd.MultiIndex.from_arrays(trade_values_df.iloc[:2].values)
# drop the first two rows as they are now headers
trade_values_df = trade_values_df.iloc[2:].reset_index(drop=True)
# set the first column as index
trade_values_df.set_index(trade_values_df.columns[0], inplace=True)
trade_values_df.index.name = "timestamp"
trade_values_df = trade_values_df.sort_index(axis=1, level=0)
# keep only the trade close and trade volume fields of every ticker
trade_values_df = trade_values_df.loc[:, (slice(None), ['Trade Close','Trade Volume'])]
# set 0 values to NaN
trade_values_df = trade_values_df.replace(0, np.nan)

# fill gaps per ticker, but only between its first and last valid observation
idx = pd.IndexSlice

# loop over the tickers that are actually in the df
for ticker in trade_values_df.columns.get_level_values(0).unique():
    # extract the sub-dataframe for this ticker using .loc with IndexSlice
    subdf = trade_values_df.loc[:, idx[ticker, :]]

    # rows where the ticker has at least one non-missing field
    valid_idx = subdf.dropna(how='all').index

    # a ticker without any data has no range to fill; skipping it avoids slicing
    # on the NaN/NaT that max()/min() return for an empty index
    if valid_idx.empty:
        continue

    newest, oldest = valid_idx.max(), valid_idx.min()

    # The newest -> oldest label slice only selects rows when the index is sorted
    # newest-first; in that ordering bfill copies each value from the row below,
    # i.e. from the closest earlier date.
    # NOTE(review): confirm the sheet is sorted newest-first, otherwise nothing is filled.
    window = trade_values_df.loc[newest:oldest, idx[ticker, :]]
    trade_values_df.loc[newest:oldest, idx[ticker, :]] = window.bfill()

# stack first level of columns to rows
trade_values_df = trade_values_df.stack(level=0,future_stack=True).reset_index()
# drop every row that still has a missing value in any column
trade_values_df = trade_values_df.dropna()
# rename columns
trade_values_df.columns = ['timestamp', 'ticker', 'adjclose', 'volume']
# set first column as index
trade_values_df.set_index('timestamp', inplace=True)


  return Index(sequences[0], name=names)


In [5]:
# Attach dividends to the daily prices and compute the daily total return per ticker.

# Sum dividends that share the same (ticker, date) so the left merge below cannot
# duplicate a price row when several dividend records fall on one day.
divi_by_day = (
    divi.groupby(['ticker', 'timestamp'])['dividend']
        .sum()
        .reset_index('ticker')      # keep 'timestamp' as the index, like `divi`
)

# add dividend data
trade_values_df = trade_values_df.merge(divi_by_day, how='left', on=['ticker', 'timestamp'])
# days without a dividend get 0
trade_values_df['dividend'] = trade_values_df['dividend'].fillna(0)

# cum-dividend value of the position at the end of the day
trade_values_df['adjclose_divi'] = trade_values_df['adjclose'] + trade_values_df['dividend']

# calculate the daily returns
trade_values_df = trade_values_df.sort_values(by=['ticker', 'timestamp'], ascending=[True, True])

# Total return = (P_t + D_t) / P_{t-1} - 1. The denominator must be the previous
# close WITHOUT the previous day's dividend: pct_change on `adjclose_divi` would
# divide by P_{t-1} + D_{t-1} and reverse the dividend gain on the following day.
prev_close = trade_values_df.groupby('ticker')['adjclose'].shift(1)
trade_values_df['stkre'] = (
    pd.to_numeric(trade_values_df['adjclose_divi'], errors='coerce')
    / pd.to_numeric(prev_close, errors='coerce')
    - 1
)
# NOTE(review): dividends are dated by pay date (announcement date as fallback),
# not ex-date; confirm this is the date on which the price drop is expected.

# Load index data

In [6]:
# Build the market index table: closing level ('OMXCPI') and its daily return ('mktre').

# Take the 'OMXCPI' sheet without its first column; its first row carries the column labels.
index_sheet = eikon_dfs['OMXCPI'].iloc[:, 1:]
index_sheet.columns = index_sheet.iloc[0]
index_sheet = index_sheet.iloc[1:].reset_index(drop=True)

# The first remaining column becomes the index, named "timestamp".
index_sheet = index_sheet.set_index(index_sheet.columns[0])
index_sheet.index.name = "timestamp"

# Keep only the closing values, as a one-column frame named 'OMXCPI'.
closing = index_sheet.sort_index(axis=1).loc[:, 'Trade Close']
omxcpi = pd.DataFrame(closing)
omxcpi.columns = ['OMXCPI']

# Order oldest-first so pct_change compares each day with the previous one.
omxcpi = omxcpi.sort_index(ascending=True)
omxcpi['mktre'] = omxcpi['OMXCPI'].pct_change()

  return Index(sequences[0], name=names)


# Calculate beta

In [7]:
# # calculate the rolling beta
# # join the two dataframes on index
# beta = trade_values_df.join(omxcpi, how='left')
# # drop Trade Close and OMXCPI columns and calculate beta
# beta = beta.drop(columns=['adjclose', 'volume', 'OMXCPI', 'adjclose_divi', 'dividend']).dropna()
# beta = beta.groupby('ticker').apply(rolling_beta, include_groups=False)

# # create df
# beta = beta.reset_index()
# beta.columns = ['ticker', 'timestamp', 'beta']
# # set the index to timestamp
# beta.set_index('timestamp', inplace=True)

# # save the beta to csv
# beta.to_csv('data/beta.csv')

# Load additional trading data

In [8]:
def _sheet_to_long(sheet, drop_col, value_name):
    """Reshape one wide Eikon sheet into a long frame indexed by timestamp.

    The sheet's first column is dropped and its first two rows are used as a
    two-level column header. The first remaining column becomes the index
    (named "timestamp"), level 0 of the header is stacked into rows, the
    column `drop_col` is removed, and the two columns that are left are
    renamed to 'ticker' and `value_name`.

    Parameters
    ----------
    sheet : pd.DataFrame
        Raw sheet as returned by `pd.read_excel`.
    drop_col : str
        Name of the stacked column to discard.
    value_name : str
        Name given to the value column of the result.

    Returns
    -------
    pd.DataFrame
        Frame indexed by "timestamp" with columns ['ticker', value_name].
        Missing values are kept (no dropna is applied).
    """
    wide = sheet.iloc[:, 1:]
    wide.columns = pd.MultiIndex.from_arrays(wide.iloc[:2].values)
    wide = wide.iloc[2:].reset_index(drop=True)
    wide = wide.set_index(wide.columns[0])
    wide.index.name = "timestamp"

    long = wide.stack(level=0, future_stack=True).reset_index().set_index('timestamp')
    long = long.drop(columns=[drop_col])
    long.columns = ['ticker', value_name]
    return long


# one long table per additional daily field
turnover_df = _sheet_to_long(eikon_dfs['Turnover'], 'TRNOVR_UNS', 'turnover')
ask_df = _sheet_to_long(eikon_dfs['Ask'], 'ASK', 'ask')
bid_df = _sheet_to_long(eikon_dfs['Bid'], 'BID', 'bid')

# merge w. trade values on timestamp and ticker (left joins: price rows are the base)
df = trade_values_df.copy()
for extra in (turnover_df, ask_df, bid_df):
    df = df.merge(extra, on=['timestamp', 'ticker'], how='left')

# prerequisites -------------------------------------------------------------
df = df.sort_values(['ticker', 'timestamp'])         # the fill below relies on this order
cols_to_ffill = ['turnover', 'ask', 'bid']           # columns to fill

# rows that carry at least one real value in the fill columns
has_val = df[cols_to_ffill].notna().any(axis=1)      # boolean Series, same length as df

# Inside every ticker, mark rows at/after the first real data point AND at/before the last one.
g = df['ticker']                                     # grouping key

# cummax() of True/False gives a running "ever seen True so far?"
left_ok  = has_val.groupby(g).cummax()               # after (or at) 1st real value
right_ok = has_val.iloc[::-1].groupby(g.iloc[::-1]) \
                        .cummax().iloc[::-1]         # before (or at) last real value

mask = left_ok & right_ok                            # True only inside the window

# forward fill inside each ticker, computed once for all rows
filled = df.groupby(g, group_keys=False)[cols_to_ffill].ffill()

# write the filled numbers back only inside the window, so nothing is carried past
# a ticker's last real observation
df.loc[mask, cols_to_ffill] = filled.loc[mask]

# drop rows where turnover, ask and bid are all still missing
df.dropna(subset=cols_to_ffill, how='all', inplace=True)

  return Index(sequences[0], name=names)
  return Index(sequences[0], name=names)
  return Index(sequences[0], name=names)


# Calculate metrics

In [9]:
# Daily liquidity/volatility measures, then aggregation to one row per ticker and month-end.

# coerce the working columns to numeric; values that cannot be parsed become NaN
for col in ['adjclose', 'volume', 'stkre', 'turnover', 'ask', 'bid']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# move the index into columns and parse 'timestamp' as datetime
df.reset_index(inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

# Relative bid-ask spread: (ask - bid) / midpoint, with midpoint = (ask + bid) / 2.
# The midpoint is computed first on purpose: writing `(ask - bid) / (ask + bid) / 2`
# divides by 2 * (ask + bid) and yields a spread four times too small.
mid = (df['ask'] + df['bid']) / 2
df["baspread"] = ((df['ask'] - df['bid']) / mid).where(mid != 0)

# traded value per day
df['dkk_vol'] = df['adjclose'] * df['volume']

# 1 on days with zero traded value, else 0
# NOTE(review): the trade-values cell replaces 0 with NaN and drops rows with NaN,
# so `dkk_vol == 0` may never be true here and this flag may be constant 0; verify.
df['zerotrade'] = np.where(df['dkk_vol'] == 0, 1, 0)

# absolute return per unit of traded value; +inf (division by zero) is set to 0
df['ill'] = (df['stkre'].abs() / df['dkk_vol']).replace(np.inf, 0)

# group by ticker & month-end and aggregate the daily series
monthly = (
    df
    .groupby(
        ['ticker', pd.Grouper(key='timestamp', freq='ME')]
    )
    .agg(
        volume      =('volume',   'mean'),
        maxret      =('stkre',    'max'),
        retvol      =('stkre',    'std'),
        turn        =('turnover','mean'),
        std_turn    =('turnover','std'),
        baspread    =('baspread','mean'),
        dkkvol      =('dkk_vol',  'mean'),
        std_dkkvol  =('dkk_vol',  'std'),
        zerotrade   =('zerotrade','mean'),
        ill         =('ill',      'mean'),
    )
    .reset_index()
)


In [10]:
# Market variance: rolling population variance (ddof=0) of the daily index return,
# sampled at each month-end and merged onto the monthly panel as 'svar'.

# WINDOW is a number of rows (observations), not calendar days.
# NOTE(review): on a trading-day series 365 rows cover more than one calendar year;
# confirm the intended horizon.
WINDOW = 365
MINP = 1
svar = omxcpi['mktre'].rolling(WINDOW, min_periods=MINP).var(ddof=0)

# last available value in each calendar month, labelled with the month-end date;
# 'ME' replaces the deprecated 'M' alias and matches the month-end grouping of `monthly`
svar = svar.resample('ME').last()

# one-column frame named 'svar', indexed by timestamp
svar_df = pd.DataFrame(svar)
svar_df.columns = ['svar']

monthly = monthly.merge(svar_df, on=["timestamp"], how="left")

  svar = svar.resample('M').last()


## Metrics based on weekly series

In [11]:
# Weekly return series and rolling weekly measures per ticker:
# market beta, idiosyncratic volatility and price delay.

# weekly label for every daily row, and the compounded weekly market return
df["week"] = df["timestamp"].dt.to_period("W").dt.to_timestamp("W-SAT")
mkt_ret_w = (1 + omxcpi['mktre']).resample("W-SAT").prod() - 1

# stock-level compounded weekly return
wkret = (
    df.groupby(["ticker", "week"])["stkre"]
      .apply(lambda x: (1 + x).prod() - 1)
      .unstack("ticker")              # rows=week, cols=ticker
      .reindex(mkt_ret_w.index)       # align with market
)

WINDOW = 52  # reducing to 1 year from 3 years due to data availability
MINP   = 1   # minimum number of periods for rolling calculations (first 12 months will be dropped later)

# rolling market variance: identical for every ticker, so computed once
mkt_var = mkt_ret_w.rolling(WINDOW, min_periods=MINP).var(ddof=0)

betas      = {}
idiovols   = {}
pricedelay = {}

# market return lagged by 0..4 weeks, computed once and reused for every ticker
market_lag = {j: mkt_ret_w.shift(j) for j in range(5)}

for tic in wkret.columns:
    r = wkret[tic]

    # beta = rolling cov(stock, market) / rolling var(market)
    cov  = r.rolling(WINDOW, min_periods=MINP).cov(mkt_ret_w, ddof=0)
    beta = cov / mkt_var
    betas[tic] = beta

    # idiosyncratic volatility: rolling std of the return left after removing beta * market
    idiovols[tic] = (r - beta * mkt_ret_w).rolling(WINDOW, min_periods=MINP).std(ddof=0)

    # price delay: squared rolling correlation of r with the market at lags 0..4
    r2 = [r.rolling(WINDOW, min_periods=MINP)
            .corr(market_lag[j])
            .pow(2)
          for j in range(5)]

    # The sum of the five squared correlations stands in for the R² of a joint
    # regression on all lags; the two coincide only if the lagged market
    # returns are mutually uncorrelated.
    r2_sum = sum(r2)
    pd_ser = 1 - r2[0] / r2_sum      # share of explained variation not due to lag 0

    # leave the measure missing where the denominator is exactly 0
    pricedelay[tic] = pd_ser.where(r2_sum != 0)

In [12]:
# Turn the per-ticker dicts of weekly series into long Series (one value per week and ticker).
beta_df, idiovol_df, pricedelay_df = (
    pd.concat(per_ticker, axis=1).stack().rename(label)
    for per_ticker, label in (
        (betas, "beta"),
        (idiovols, "idiovol"),
        (pricedelay, "pricedelay"),
    )
)

# Side-by-side weekly panel, plus squared beta.
weekly_panel = pd.concat([beta_df, idiovol_df, pricedelay_df], axis=1)
weekly_panel["betasq"] = weekly_panel["beta"].pow(2)
weekly_panel.index = weekly_panel.index.set_names(["week", "ticker"])

# Collapse to one row per ticker and calendar month: .last() keeps, for every column,
# the last non-missing weekly value in the month; rows are labelled with the month-end date.
month_end = pd.Grouper(level="week", freq="ME")
week_to_month = (
    weekly_panel
    .groupby(["ticker", month_end])
    .last()
    .reset_index()
    .rename(columns={"week": "timestamp"})
)

# Attach the weekly-based measures to the monthly panel.
monthly = monthly.merge(week_to_month, how="left", on=["ticker", "timestamp"])


## Momentum metrics

In [13]:
# Monthly momentum measures per ticker, merged onto the monthly panel.

# Month label for every daily row.
df["month"] = df["timestamp"].dt.to_period("M").dt.to_timestamp("M")


def _compound(daily_returns):
    """Compound a series of simple returns into one simple return."""
    return (1 + daily_returns).prod() - 1


# Gross monthly return (1 + compounded daily return) per ticker and month.
mret = 1 + df.groupby(["ticker", "month"])["stkre"].apply(_compound)
mret.index = mret.index.set_names(["ticker", "month"])

by_ticker = mret.groupby(level=0, group_keys=False)   # one group per ticker

# Arguments passed to `cumret` for each multi-month measure.
# NOTE(review): `cumret` comes from the `functions` module; the month ranges below are
# taken from the original comments (t-2..t-6, t-2..t-12, t-7..t-12); verify against it.
cumret_args = {
    "mom6m": (6, 2),
    "mom12m": (12, 2),
    "mom7_12m": (12, 7),
}

# NOTE(review): mom1m is the gross return (1 + r) of the month itself; confirm this is
# on the same scale as what `cumret` returns.
signals = [mret.rename("mom1m")]
for signal_name, (upper, lower) in cumret_args.items():
    signals.append(by_ticker.apply(cumret, upper, lower).rename(signal_name))

# One frame with ticker / timestamp / momentum columns for the merge.
momentum = (
    pd.concat(signals, axis=1)
      .reset_index()
      .rename(columns={"month": "timestamp"})
)

monthly = monthly.merge(momentum, how="left", on=["ticker", "timestamp"])


# Save

In [14]:
# Write the monthly panel to disk as CSV, without the row index.
monthly.to_csv(path_or_buf='data/trade_daily.csv', index=False)