In [1]:
import pandas as pd
import numpy as np
from functions import *
import re

%load_ext autoreload
%autoreload 2

pd.set_option('future.no_silent_downcasting', True)

# Import data

In [2]:
# set paths to data
path = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data'

# read daily market data from eikon
daily = 'eikon_daily.xlsx'
eikon_dfs = pd.read_excel(path + '/' + daily, sheet_name=None)
eikon_keys = eikon_dfs.keys()

# Load in trading data

In [3]:
# load trade data
trade_values_df = eikon_dfs['Trade Values'].iloc[:,1:]
# set up multi-index for the columns
trade_values_df.columns = pd.MultiIndex.from_arrays(trade_values_df.iloc[:2].values)
# drop the first two rows as they are now headers
trade_values_df = trade_values_df.iloc[2:].reset_index(drop=True)
# set the first column as index
trade_values_df.set_index(trade_values_df.columns[0], inplace=True)
trade_values_df.index.name = "timestamp"
trade_values_df = trade_values_df.sort_index(axis=1, level=0)
# keep only trade close values
trade_values_df = trade_values_df.loc[:, (slice(None), ['Trade Close','Trade Volume'])]
# set 0 values to NaN
trade_values_df = trade_values_df.replace(0, np.nan)

# backward fill the data for each ticker
idx = pd.IndexSlice

# loop over the tickers that are actually in the df
for ticker in trade_values_df.columns.get_level_values(0).unique():
    # extract the sub-dataframe for this ticker using .loc with IndexSlice
    subdf = trade_values_df.loc[:, idx[ticker, :]]
    
    # find the index range where the ticker has any valid data
    valid_idx = subdf.dropna(how='all').index

    # use backward fill in the date range
    trade_values_df.loc[valid_idx.max():valid_idx.min(), idx[ticker, :]] = trade_values_df.loc[valid_idx.max():valid_idx.min(), idx[ticker, :]].bfill()

# stack first level of columns to rows
trade_values_df = trade_values_df.stack(level=0,future_stack=True).reset_index()
trade_values_df = trade_values_df.dropna()
# rename columns
trade_values_df.columns = ['timestamp', 'ticker', 'adjclose', 'volume']
# set first column as index
trade_values_df.set_index('timestamp', inplace=True)

# calculate the daily returns
trade_values_df = trade_values_df.sort_values(by=['ticker', 'timestamp'], ascending=[True, True])
trade_values_df['stkre'] = trade_values_df.groupby('ticker', group_keys=False)['adjclose'].pct_change()


  return Index(sequences[0], name=names)


# Load index data

In [4]:
# load index data
omxcpi = eikon_dfs['OMXCPI'].iloc[:,1:]
# set first row as header
omxcpi.columns = omxcpi.iloc[0]
# drop the first row as it is now header
omxcpi = omxcpi.iloc[1:].reset_index(drop=True)
# set the first column as index
omxcpi.set_index(omxcpi.columns[0], inplace=True)
omxcpi.index.name = "timestamp"
omxcpi = omxcpi.sort_index(axis=1)
# keep only closing values
omxcpi = omxcpi.loc[:,'Trade Close']
# convert to dataframe
omxcpi = pd.DataFrame(omxcpi)
# rename columns
omxcpi.columns = ['OMXCPI']

omxcpi = omxcpi.sort_index(ascending=True)
omxcpi['mktre'] = omxcpi['OMXCPI'].pct_change()

  return Index(sequences[0], name=names)


# Calculate beta

In [5]:
# calculate the rolling beta
# join the two dataframes on index
beta = trade_values_df.join(omxcpi, how='left')
# drop Trade Close and OMXCPI columns and calculate beta
beta = beta.drop(columns=['adjclose', 'volume', 'OMXCPI']).dropna()
beta = beta.groupby('ticker').apply(rolling_beta, include_groups=False)

# create df
beta = beta.reset_index()
beta.columns = ['ticker', 'timestamp', 'beta']
# set the index to timestamp
beta.set_index('timestamp', inplace=True)

# save the beta to csv
beta.to_csv('data/beta.csv')

# Load additional trading data

In [6]:
turnover_df = eikon_dfs['Turnover'].iloc[:,1:]
ask_df = eikon_dfs['Ask'].iloc[:,1:]
bid_df = eikon_dfs['Bid'].iloc[:,1:]


turnover_df.columns = pd.MultiIndex.from_arrays(turnover_df.iloc[:2].values)
turnover_df = turnover_df.iloc[2:].reset_index(drop=True)
turnover_df.set_index(turnover_df.columns[0], inplace=True)
turnover_df.index.name = "timestamp"  

ask_df.columns = pd.MultiIndex.from_arrays(ask_df.iloc[:2].values)
ask_df = ask_df.iloc[2:].reset_index(drop=True)
ask_df.set_index(ask_df.columns[0], inplace=True)
ask_df.index.name = "timestamp"

bid_df.columns = pd.MultiIndex.from_arrays(bid_df.iloc[:2].values)
bid_df = bid_df.iloc[2:].reset_index(drop=True)
bid_df.set_index(bid_df.columns[0], inplace=True)
bid_df.index.name = "timestamp"


turnover_df = turnover_df.stack(level=0, future_stack=True).reset_index().set_index('timestamp')
turnover_df.drop(columns=['TRNOVR_UNS'], inplace=True)
# turnover_df.dropna(inplace=True)
turnover_df.columns = ['ticker', 'turnover']

ask_df = ask_df.stack(level=0, future_stack=True).reset_index().set_index('timestamp')
ask_df.drop(columns=['ASK'], inplace=True)
# ask_df.dropna(inplace=True)
ask_df.columns = [ 'ticker', 'ask']

bid_df = bid_df.stack(level=0,future_stack=True).reset_index().set_index('timestamp')
bid_df.drop(columns=['BID'], inplace=True)
# bid_df.dropna(inplace=True)
bid_df.columns = ['ticker', 'bid']

# merge w. trade values on index and ticker
# df = trade_values_df.reset_index()
df = trade_values_df.copy()
df = df.merge(turnover_df, on=['timestamp', 'ticker'], how='left')
df = df.merge(ask_df, on=['timestamp', 'ticker'], how='left')
df = df.merge(bid_df, on=['timestamp', 'ticker'], how='left')
df = df.merge(omxcpi, on=['timestamp'], how='left')

# prerequisites -------------------------------------------------------------
df = df.sort_values(['ticker', 'timestamp'])         # already have the right order
cols_to_ffill = ['turnover', 'ask', 'bid']           # numeric columns to fill

# Identify where row have *any* real data?
has_val = df[cols_to_ffill].notna().any(axis=1)      # boolean Series, same length as df

#     Inside every ticker, mark rows that lie
#     (a) *after* the first real data point, AND
#     (b) *before* the last real data point.
g = df['ticker']                                     # short alias

# cummax() of True/False gives a running “ever seen True so far?”
left_ok  = has_val.groupby(g).cummax()               # after (or at) 1st real value
right_ok = has_val.iloc[::-1].groupby(g.iloc[::-1]) \
                        .cummax().iloc[::-1]         # before (or at) last real value

mask = left_ok & right_ok                            # True only inside the window

# compute a forward fill *inside each ticker* once
filled = df.groupby(g, group_keys=False)[cols_to_ffill].ffill()

# put the filled numbers back, but **only** where `mask` is True
df.loc[mask, cols_to_ffill] = filled.loc[mask]

# optionally get rid of rows that are still all-NaN
df.dropna(subset=cols_to_ffill, how='all', inplace=True)

  return Index(sequences[0], name=names)
  return Index(sequences[0], name=names)
  return Index(sequences[0], name=names)


In [21]:
import pandas as pd
df = df.reset_index()

# 0) make sure timestamp is a datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# 1) build weekly stock returns (long form)
weekly = (
    df
    .set_index('timestamp')
    .groupby('ticker')['adjclose']
    .resample('W-FRI')
    .last()                          # Friday close per ticker
    .to_frame('weekly_price')
    .reset_index()
)
weekly['r_stock'] = (
    weekly
    .groupby('ticker')['weekly_price']
    .pct_change()
)

# 2) compute equal-weight market return
weekly['r_mkt'] = (
    weekly
    .groupby('timestamp')['r_stock']
    .transform('mean')
)

# 3) switch to a MultiIndex for vectorized rolling
weekly = (
    weekly
    .set_index(['ticker','timestamp'])
    .sort_index()
)

# 4) rolling cov & var over a 3-year window (≈1095 days)
#    require at least 52 obs
cov = (
    weekly['r_stock']
    .groupby(level='ticker')
    .rolling('1095D', min_periods=52, level='timestamp')
    .cov(weekly['r_mkt'])
)
var = (
    weekly['r_mkt']
    .groupby(level='ticker')
    .rolling('1095D', min_periods=52, level='timestamp')
    .var()
)

# 5) β = Cov / Var  (this lines up by MultiIndex)
weekly['beta'] = cov.div(var)

# 6) grab each month-end’s β
beta_monthly = (
    weekly['beta']
    .groupby(level='ticker')
    .resample('M', level='timestamp')
    .last()
    .reset_index()    # columns: ticker, timestamp (month-end), beta
)

# result
beta_monthly.head()


TypeError: BaseWindow.__init__() got an unexpected keyword argument 'level'

In [22]:
weekly

Unnamed: 0_level_0,Unnamed: 1_level_0,weekly_price,r_stock,r_mkt
ticker,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAB.CO,2001-01-05,2719.270469,,
AAB.CO,2001-01-12,2823.857795,0.038462,0.010016
AAB.CO,2001-01-19,2823.857795,0.000000,0.019115
AAB.CO,2001-01-26,3695.593155,0.308704,0.008796
AAB.CO,2001-02-02,4026.612041,0.089571,0.002202
...,...,...,...,...
ZELA.CO,2025-03-21,564.000000,-0.036721,-0.002667
ZELA.CO,2025-03-28,553.000000,-0.019504,-0.019766
ZELA.CO,2025-04-04,426.200000,-0.229295,-0.072581
ZELA.CO,2025-04-11,427.700000,0.003519,0.002835


In [8]:
df

Unnamed: 0_level_0,ticker,adjclose,volume,stkre,turnover,ask,bid,OMXCPI,mktre
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-01-02,AAB.CO,2719.270469,106.131407,,290510,2771.564132,2666.976806,249.7,
2001-01-03,AAB.CO,2771.564132,18.013655,0.019231,49530,2771.564132,2719.270469,250.5,0.003204
2001-01-04,AAB.CO,2717.178723,15.585062,-0.019623,42184,2719.270469,2666.976806,253.45,0.011776
2001-01-05,AAB.CO,2719.270469,3.824555,0.00077,10250,2771.564132,2719.270469,254.66,0.004774
2001-01-08,AAB.CO,2666.976806,11.836998,-0.019231,51456,2771.564132,2719.270469,255.52,0.003377
...,...,...,...,...,...,...,...,...,...
2025-04-10,ZELA.CO,428.9,463688,0.050196,207283895.9,433.3,432.7,1209.34,0.033562
2025-04-11,ZELA.CO,427.7,310947,-0.002798,132637246.65,428.8,428.4,1226.65,0.014314
2025-04-14,ZELA.CO,451.8,224516,0.056348,100948224.85,452.9,452.3,1260.73,0.027783
2025-04-15,ZELA.CO,445,261342,-0.015051,117300125.35,445,444.6,1267.97,0.005743


# Calculate additional metrics

In [9]:
# set variables to numeric
df['adjclose'] = pd.to_numeric(df['adjclose'], errors='coerce')
df['volume'] = pd.to_numeric(df['volume'], errors='coerce')
df['stkre'] = pd.to_numeric(df['stkre'], errors='coerce')
df['turnover'] = pd.to_numeric(df['turnover'], errors='coerce')
df['ask'] = pd.to_numeric(df['ask'], errors='coerce')
df['bid'] = pd.to_numeric(df['bid'], errors='coerce')

# set index to datetime
df.reset_index(inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
# set index to timestamp
# df.set_index('timestamp', inplace=True)


# calc addl metrics
df["baspread"] = ((df.ask - df.bid) / (df.ask + df.bid) / 2).where((df.ask + df.bid) / 2 != 0)
df['dkk_vol'] = df['adjclose'] * df['volume']
df['zerotrade'] = np.where(df['dkk_vol'] == 0, 1, 0)
df['ill'] = (df['stkre'].abs() / df['dkk_vol']).replace(np.inf, 0)

# group by ticker & month-end, aggregating stkre with max and everything else with mean
monthly = (
    df
    .groupby(
        ['ticker', pd.Grouper(key='timestamp', freq='ME')]
    )
    .agg(
        volume      =('volume',   'mean'),
        maxret      =('stkre',    'max'),
        retvol      =('stkre',    'std'),
        turn        =('turnover','mean'),
        std_turn    =('turnover','std'),
        # ask         =('ask',      'mean'),
        # bid         =('bid',      'mean'),
        baspread    =('baspread','mean'),
        dkk_vol     =('dkk_vol',  'mean'),
        std_dkk_vol =('dkk_vol',  'std'),
        zerotrade   =('zerotrade','mean'),
        ill         =('ill',      'mean'),
    )
    .reset_index()
)


In [10]:
monthly

Unnamed: 0,ticker,timestamp,volume,maxret,retvol,turn,std_turn,baspread,dkk_vol,std_dkk_vol,zerotrade,ill
0,AAB.CO,2001-01-31,31.589957,0.141591,0.042131,1.031435e+05,9.668302e+04,0.010146,1.007836e+05,9.845082e+04,0.0,4.050091e-07
1,AAB.CO,2001-02-28,25.625476,0.107692,0.039264,1.040104e+05,1.466508e+05,0.012716,1.024128e+05,1.461388e+05,0.0,1.352057e-06
2,AAB.CO,2001-03-31,10.727008,0.048340,0.023578,4.074295e+04,3.059270e+04,0.012423,3.894207e+04,2.905703e+04,0.0,1.542239e-06
3,AAB.CO,2001-04-30,11.219758,0.062500,0.033441,3.931894e+04,4.641319e+04,0.010110,3.881989e+04,4.634822e+04,0.0,1.616075e-06
4,AAB.CO,2001-05-31,8.805583,0.054545,0.032558,2.695000e+04,2.470331e+04,0.010964,2.671999e+04,2.435900e+04,0.0,1.637343e-06
...,...,...,...,...,...,...,...,...,...,...,...,...
50263,ZELA.CO,2024-12-31,229929.277778,0.052288,0.026187,1.664209e+08,2.382756e+08,0.000376,1.716557e+08,2.616763e+08,0.0,1.615536e-10
50264,ZELA.CO,2025-01-31,160835.909091,0.049498,0.023436,1.169134e+08,4.341962e+07,0.000400,1.168313e+08,4.319272e+07,0.0,1.502253e-10
50265,ZELA.CO,2025-02-28,193842.050000,0.070494,0.030878,1.400748e+08,6.534724e+07,0.000426,1.400271e+08,6.522766e+07,0.0,1.806692e-10
50266,ZELA.CO,2025-03-31,516700.809524,0.378887,0.095034,3.047488e+08,3.478142e+08,0.000461,3.041278e+08,3.571356e+08,0.0,1.573758e-10


In [None]:
# green_appendix_factors.py - v0.2  (07-May-2025)
"""
Daily → month-end implementation of the price/liquidity characteristics from
Green, Hand & Zhang (2017) that can be computed **without** accounting data.

This revision fixes
• cumret() - now rolled *within* each ticker group (bug #1).
• rel_spread - now guards against 0/0 (bug #2).

-------------------------------------------------------------------------------
INPUT
    df : pandas.DataFrame with a daily DatetimeIndex named ``timestamp``
         columns: ticker, adjclose, volume, [turnover], [ask], [bid]
    market_returns : optional pandas Series of daily market returns (same index)

OUTPUT
    factors : MultiIndex (timestamp, ticker) DataFrame at month-end containing
        mom1m, mom6m, mom12m, mom36m, maxret, retvol, ill, dolvol, std_dolvol,
        std_turn, zerotrade, baspread, turn, beta, betasq, idiovol, pricedelay
"""
from __future__ import annotations

import numpy as np
import pandas as pd
from typing import Optional


def  cumret(mret: pd.Series, upper: int, lower: int) -> pd.Series:
    """Grouped cumulative return from month -upper to -lower (inclusive).

    * ``mret``  - Series with MultiIndex (ticker, timestamp) of *monthly* returns
    * ``upper`` - oldest lag  (e.g. 6 for months t-2 … t-6)
    * ``lower`` - most recent lag *excluded* (1 skips the last month)
    """
    if upper <= lower:
        raise ValueError("upper lag must be > lower lag")

    def _one(g: pd.Series) -> pd.Series:
        shifted = 1 + g.shift(lower + 1)  # +1 because we work with *returns*, not price
        window = upper - lower
        return (
            shifted.rolling(window, min_periods=window)
            .apply(np.prod, raw=True)
            .sub(1.0)
        )

    return mret.groupby(level=0, group_keys=False).apply(_one)


def compute_green_price_based_factors(
    df: pd.DataFrame,
    market_returns: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Compute Green et al. (2017) price/liquidity characteristics."""

    # ------------------------------------------------------------------
    # House-keeping & daily helpers
    # ------------------------------------------------------------------
    df = df.copy().sort_index()
    df.index.name = "timestamp"

    df["ret"] = df.groupby("ticker")["adjclose"].pct_change()
    df["dollar_vol"] = df["adjclose"] * df["volume"]

    if "turnover" not in df.columns:
        df["turnover"] = (
            df.groupby("ticker")["volume"].transform(lambda x: x / x.rolling(21, min_periods=1).mean())
        )

    if {"ask", "bid"}.issubset(df.columns):
        spread_den = (df["ask"] + df["bid"]) / 2
        df["rel_spread"] = np.where(spread_den != 0, (df["ask"] - df["bid"]) / spread_den, np.nan)

    g = df.groupby("ticker", group_keys=False)

    # ------------------------------------------------------------------
    # Month-end panel
    # ------------------------------------------------------------------
    price_m = g["adjclose"].resample("M", level="timestamp").last()
    mret = price_m.groupby(level=0).pct_change()

    # mom1m = mret
    # mom6m =  cumret(mret, 6, 1)      # months t-2 … t-6
    # mom12m =  cumret(mret, 12, 1)    # months t-2 … t-12
    # mom36m =  cumret(mret, 36, 13)   # months t-14 … t-36

    maxret = g["ret"].resample("M", level="timestamp").max().shift(1)
    retvol = g["ret"].resample("M", level="timestamp").std().shift(1)
    ill = g.apply(lambda x: (x["ret"].abs() / x["dollar_vol"]).resample("M").mean())
    dolvol = g["dollar_vol"].resample("M", level="timestamp").mean().shift(2).apply(np.log)
    std_dolvol = g["dollar_vol"].resample("M", level="timestamp").std()
    std_turn = g["turnover"].resample("M", level="timestamp").std()
    zerotrade = g["volume"].apply(lambda x: (x == 0).resample("M").mean())
    baspread = (
        g["rel_spread"].resample("M", level="timestamp").mean() if "rel_spread" in df.columns else None
    )
    turn = g["turnover"].resample("M", level="timestamp").mean().rolling(3, min_periods=3).mean()

    # ------------------------------------------------------------------
    # Weekly window for β, idioσ, price-delay (optional)
    # ------------------------------------------------------------------
    addl = []
    if market_returns is not None:
        weekly_ret = (1 + df["ret"]).groupby(
            [df["ticker"], df.index.to_period("W").to_timestamp("W-SAT")]
        ).prod() - 1
        weekly_ret = weekly_ret.unstack(level=0)

        weekly_mkt = market_returns.resample("W-SAT").apply(lambda x: (1 + x).prod() - 1)
        weekly_ret = weekly_ret.loc[weekly_mkt.index]
        window = 156  # ≈3 years

        betas, idiovols, pricedelays = {}, {}, {}
        for tic in weekly_ret.columns:
            wr = weekly_ret[tic]
            cov = wr.rolling(window).cov(weekly_mkt)
            var = weekly_mkt.rolling(window).var()
            beta = cov / var
            betas[tic] = beta
            idiovols[tic] = (wr - beta * weekly_mkt).rolling(window).std()

            # price-delay: Hou & Moskowitz (2005)
            r_full = wr
            lagged = pd.concat([weekly_mkt.shift(i) for i in range(5)], axis=1)
            with np.errstate(invalid="ignore"):
                r2_full = r_full.rolling(window).apply(lambda y: np.corrcoef(y, lagged.loc[y.index, 0])[0, 1] ** 2, raw=False)
                r2_lag0 = r2_full.copy()
                for i in range(1, 5):
                    r2_full += r_full.rolling(window).apply(lambda y: np.corrcoef(y, lagged.loc[y.index, i])[0, 1] ** 2, raw=False)
                pricedelays[tic] = 1 - r2_lag0 / r2_full

        beta_df = pd.concat(betas, axis=1).stack().rename("beta")
        idiovol_df = pd.concat(idiovols, axis=1).stack().rename("idiovol")
        pricedelay_df = pd.concat(pricedelays, axis=1).stack().rename("pricedelay")
        betasq_df = (beta_df ** 2).rename("betasq")
        addl = [beta_df, betasq_df, idiovol_df, pricedelay_df]

    # ------------------------------------------------------------------
    # Assemble & return
    # ------------------------------------------------------------------
    factors = (
        pd.concat(
            [
                mom1m.rename("mom1m"),
                mom6m.rename("mom6m"),
                mom12m.rename("mom12m"),
                mom36m.rename("mom36m"),
                maxret.rename("maxret"),
                retvol.rename("retvol"),
                ill.rename("ill"),
                dolvol.rename("dolvol"),
                std_dolvol.rename("std_dolvol"),
                std_turn.rename("std_turn"),
                zerotrade.rename("zerotrade"),
                baspread.rename("baspread") if baspread is not None else None,
                turn.rename("turn"),
                *addl,
            ],
            axis=1,
        )
        .replace([np.inf, -np.inf], np.nan)
    )

    return factors.dropna(how="all")


# --------------------------------------------------------------------
# Minimal example (commented-out):
# --------------------------------------------------------------------
# import yfinance as yf
# raw = yf.download("AAPL MSFT", start="1990-01-01", auto_adjust=True, progress=False)
# raw = raw.stack(level=1).rename_axis(["timestamp", "ticker"]).reset_index()
# raw.rename(columns={"Close": "adjclose", "Volume": "volume"}, inplace=True)
# raw.set_index("timestamp", inplace=True)
f = compute_green_price_based_factors(df)
print(f.head())


ZeroDivisionError: float division by zero

In [None]:
# set timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# calculate the bid-ask spread
df['baspread'] = (df['ask'] - df['bid'])/ ((df['ask'] + df['bid'])/2)


df['turn'] = 


