# <a id='toc1_'></a>[Cleaning and creation of a C25 dataset for trading](#toc0_)
To restrict ourselves to stocks with enough liquidity, we use the constituents of the C25 index as our trading universe, though training is done on the full dataset.

**Table of contents**<a id='toc0_'></a>    
- [Cleaning and creation of a C25 dataset for trading](#toc1_)    
- [Import packages](#toc2_)    
- [C25](#toc3_)    
  - [Import data](#toc3_1_)    
  - [Constituents of the C25 index since 2016](#toc3_2_)    
  - [Index price for trading months](#toc3_3_)    
- [Copenhagen Benchmark Index](#toc4_)    
  - [Constituents of the Copenhagen Benchmark Index since 2016](#toc4_1_)    
  - [Index price for trading months](#toc4_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc2_'></a>[Import packages](#toc0_)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the rate file and derive a monthly rate from the 'discount' column by
# geometric de-compounding over 12 periods (presumably an annual risk-free
# rate, going by the file name -- confirm against the data source).
# Only the timestamp and the monthly rate are kept.
rf = pd.read_csv('../data/rf_rate.csv').assign(
    timestamp=lambda df: pd.to_datetime(df['timestamp']),
    discount_m=lambda df: (1 + df['discount']) ** (1 / 12) - 1,
)[['timestamp', 'discount_m']]

# <a id='toc3_'></a>[C25](#toc0_)

## <a id='toc3_1_'></a>[Import data](#toc0_)

In [3]:
# Both raw C25 sheets live in the same workbook: the leaver/joiner event log
# and the index price history.
path = '../rawdata/omxc25.xlsx'
constituents, price = (
    pd.read_excel(path, sheet_name=sheet)
    for sheet in ('leaverjoiner', 'pricehistory')
)

## <a id='toc3_2_'></a>[Constituents of the C25 index since 2016](#toc0_)

In [4]:
# --- tidy the raw leaver/joiner sheet ---------------------------------------
# Row 2 of the sheet holds the real header; the event rows start at row 3.
constituents.columns = constituents.iloc[2]
constituents = (
    constituents.iloc[3:]
    .reset_index(drop=True)
    # the '2' key only has an effect if a column is literally labelled '2'
    .rename(columns={'2':None,'Status':'status', 'Issuer': 'name', 'Code': 'ticker', 'Date': 'timestamp'})
    # drop the columns-axis name inherited from the header row
    .rename_axis(columns=None)
)
constituents['timestamp'] = pd.to_datetime(constituents['timestamp'], format='%Y-%m-%d')

# keep only the part of the code before the first '^' and strip surrounding
# whitespace
constituents['ticker'] = constituents['ticker'].str.split('^').str[0].str.strip()

# month-end calendar spanning the event log up to today
# (not used further in this cell)
min_date = constituents['timestamp'].min()
max_date = pd.Timestamp.today()
date_range = pd.date_range(start=min_date, end=max_date, freq='ME')

# --- snap event dates to month-ends -----------------------------------------
# Joiners roll forward to the end of their month (dates already on a
# month-end stay put); all other rows (the leavers) roll back to the
# previous month-end.
is_joiner = constituents['status'] == 'Joiner'
rolled_forward = constituents['timestamp'] + pd.offsets.MonthEnd(0)
rolled_back = constituents['timestamp'] - pd.offsets.MonthEnd(1)
constituents['timestamp'] = np.where(is_joiner, rolled_forward, rolled_back)

# manual adjust for HLUNa.CO and HLUNb.CO as the split only existed for
# 10 days in the index: their joiner dates move back one more month-end
is_hlun = constituents['ticker'].isin(['HLUNa.CO', 'HLUNb.CO'])
constituents['timestamp'] = np.where(
    is_joiner & is_hlun,
    constituents['timestamp'] - pd.offsets.MonthEnd(1),
    constituents['timestamp'],
)

# dropping HLUNa.CO and HLUNb.CO as they are not in the index anymore
constituents = constituents[~is_hlun]


In [5]:
# Order the events chronologically per ticker. On identical timestamps
# 'Joiner' sorts before 'Leaver', so a join and a leave that were snapped to
# the same month-end still produce a (one month) membership interval.
constituents = constituents.sort_values(['ticker', 'timestamp', 'status'])

# --- joiner/leaver events into live intervals -------------------------------

as_of = pd.Timestamp.today().floor('D')  # end date for memberships still open

records = []
for ticker, g in constituents.groupby('ticker', sort=False):
    # groupby keeps the row order of the frame sorted above, so the group is
    # NOT re-sorted here: a re-sort on 'timestamp' alone with the default
    # (non-stable) algorithm could swap a Joiner/Leaver pair sharing a
    # timestamp, in which case the Leaver would be ignored and the
    # membership would wrongly stay open until `as_of`.
    start = None
    name = None
    for status, timestamp, issuer in zip(g['status'], g['timestamp'], g['name']):
        if status == 'Joiner':
            start = timestamp
            name = issuer
        elif status == 'Leaver' and start is not None:
            # a Leaver without a preceding Joiner is skipped
            records.append(
                dict(ticker=ticker, name=name, start=start, end=timestamp)
            )
            start = None
            name = None
    # still in the index (no later Leaver)
    if start is not None:
        records.append(
            dict(ticker=ticker, name=name, start=start, end=as_of)
        )

# explicit columns keep the frame well-formed even when there are no records
intervals = pd.DataFrame(records, columns=['ticker', 'name', 'start', 'end'])

# --- explode each interval to one row per month-end -------------------------
rows = []
for ticker, name, start, end in zip(
    intervals['ticker'], intervals['name'], intervals['start'], intervals['end']
):
    rows.extend(
        {'timestamp': d, 'ticker': ticker, 'name': name}
        for d in pd.date_range(start, end, freq='ME')
    )

constit_rolling = (
    pd.DataFrame(rows, columns=['timestamp', 'ticker', 'name'])
      .sort_values(['timestamp', 'ticker'])
      .reset_index(drop=True)
)

# optional restriction to a fixed window (disabled):
# constit_rolling = constit_rolling[constit_rolling['timestamp'] >= '2020-12-31']
# constit_rolling = constit_rolling[constit_rolling['timestamp'] < '2025-01-31']
# constit_rolling.reset_index(drop=True, inplace=True)

# save the rolling constituents to a csv file
constit_rolling.to_csv('../data/omxc25_constit.csv', index=False)
# independent copy kept under its own name for later cells
c25 = constit_rolling.copy(deep=True)

## <a id='toc3_3_'></a>[Index price for trading months](#toc0_)

In [6]:
# --- daily index closes -----------------------------------------------------
# Row 27 of the raw sheet carries the column header; the data starts at row 28.
price.columns = price.iloc[27]
price = price.iloc[28:].rename(
    columns={'27':None, 'Exchange Date': 'timestamp', 'Close': 'close'}
)
price['timestamp'] = pd.to_datetime(price['timestamp'], format='%Y-%m-%d')
price = (
    price[['timestamp', 'close']]
    .rename_axis(columns=None)
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

# earliest daily close, used as the starting level of the monthly series
initial_price = price['close'].iloc[0]

# --- month-end closes -------------------------------------------------------
# last observation of every calendar month
price = price.set_index('timestamp').resample('ME').last().reset_index()

# put a starting row (index -1 sorts ahead of 0) in front of the month-ends
price.loc[-1] = [pd.Timestamp('2021-01-01'), initial_price]
price = price.sort_index().reset_index(drop=True)
price['close'] = pd.to_numeric(price['close'], errors='raise')

# --- returns net of the monthly rate from `rf` ------------------------------
price['return'] = price['close'].pct_change()
price = price.merge(rf, on='timestamp', how='left')
# timestamps without a matching rate take the next available one
price['return'] -= price['discount_m'].bfill()
price = price.drop(columns='discount_m')

# cumulative series rebased to 100 on the first row
price['cumulative'] = price['return'].add(1).cumprod().mul(100)
price.loc[0, 'cumulative'] = 100.0  # first return is NaN, so pin the base value

# save to csv
price.to_csv('../data/omxc25_price.csv', index=False)

# <a id='toc4_'></a>[Copenhagen Benchmark Index](#toc0_)
The OMX Copenhagen Benchmark Index consists of the 50 to 80 largest and most traded stocks, representing the majority of sectors (though currently only 41 stocks are included).


In [7]:
# Raw workbook of the benchmark index: leaver/joiner event log and index
# price history.
path = '../rawdata/omxcb.xlsx'

constituents, price = (
    pd.read_excel(path, sheet_name=sheet)
    for sheet in ('leaverjoiner', 'pricehistory')
)

## <a id='toc4_1_'></a>[Constituents of the Copenhagen Benchmark Index since 2016](#toc0_)

In [8]:
# --- tidy the raw leaver/joiner sheet ---------------------------------------
# Re-read the sheet so this cell always starts from the raw data.
constituents = pd.read_excel(path, sheet_name='leaverjoiner')
# Row 2 of the sheet holds the real header; the event rows start at row 3.
constituents.columns = constituents.iloc[2]
constituents = (
    constituents.iloc[3:]
    .reset_index(drop=True)
    # the '2' key only has an effect if a column is literally labelled '2'
    .rename(columns={'2':None,'Status':'status', 'Issuer': 'name', 'Code': 'ticker', 'Date': 'timestamp'})
    # drop the columns-axis name inherited from the header row
    .rename_axis(columns=None)
)
constituents['timestamp'] = pd.to_datetime(constituents['timestamp'], format='%Y-%m-%d')

# keep only the part of the code before the first '^' and strip surrounding
# whitespace
constituents['ticker'] = constituents['ticker'].str.split('^').str[0].str.strip()

# month-end calendar spanning the event log up to today
# (not used further in this cell)
min_date = constituents['timestamp'].min()
max_date = pd.Timestamp.today()
date_range = pd.date_range(start=min_date, end=max_date, freq='ME')

# --- snap event dates to month-ends -----------------------------------------
# Joiners roll forward to the end of their month (dates already on a
# month-end stay put); all other rows (the leavers) roll back to the
# previous month-end.
is_joiner = constituents['status'] == 'Joiner'
rolled_forward = constituents['timestamp'] + pd.offsets.MonthEnd(0)
rolled_back = constituents['timestamp'] - pd.offsets.MonthEnd(1)
constituents['timestamp'] = np.where(is_joiner, rolled_forward, rolled_back)

# manual adjust for HLUNa.CO and HLUNb.CO as the split only existed for
# 10 days in the index: their joiner dates move back one more month-end
is_hlun = constituents['ticker'].isin(['HLUNa.CO', 'HLUNb.CO'])
constituents['timestamp'] = np.where(
    is_joiner & is_hlun,
    constituents['timestamp'] - pd.offsets.MonthEnd(1),
    constituents['timestamp'],
)

In [9]:
# Order the events chronologically per ticker. On identical timestamps
# 'Joiner' sorts before 'Leaver', so a join and a leave that were snapped to
# the same month-end still produce a (one month) membership interval.
constituents = constituents.sort_values(['ticker', 'timestamp', 'status'])

# --- joiner/leaver events into live intervals -------------------------------

as_of = pd.Timestamp.today().floor('D')  # end date for memberships still open

records = []
for ticker, g in constituents.groupby('ticker', sort=False):
    # groupby keeps the row order of the frame sorted above, so the group is
    # NOT re-sorted here: a re-sort on 'timestamp' alone with the default
    # (non-stable) algorithm could swap a Joiner/Leaver pair sharing a
    # timestamp, in which case the Leaver would be ignored and the
    # membership would wrongly stay open until `as_of`.
    start = None
    name = None
    for status, timestamp, issuer in zip(g['status'], g['timestamp'], g['name']):
        if status == 'Joiner':
            start = timestamp
            name = issuer
        elif status == 'Leaver' and start is not None:
            # a Leaver without a preceding Joiner is skipped
            records.append(
                dict(ticker=ticker, name=name, start=start, end=timestamp)
            )
            start = None
            name = None
    # still in the index (no later Leaver)
    if start is not None:
        records.append(
            dict(ticker=ticker, name=name, start=start, end=as_of)
        )

# explicit columns keep the frame well-formed even when there are no records
intervals = pd.DataFrame(records, columns=['ticker', 'name', 'start', 'end'])

# --- explode each interval to one row per month-end -------------------------
rows = []
for ticker, name, start, end in zip(
    intervals['ticker'], intervals['name'], intervals['start'], intervals['end']
):
    rows.extend(
        {'timestamp': d, 'ticker': ticker, 'name': name}
        for d in pd.date_range(start, end, freq='ME')
    )

constit_rolling = (
    pd.DataFrame(rows, columns=['timestamp', 'ticker', 'name'])
      .sort_values(['timestamp', 'ticker'])
      .reset_index(drop=True)
)

# ensure at least the c25 constituents are included: an outer merge on all
# three columns is the union of both tables. `c25` is the rolling C25
# constituent table built in an earlier cell.
constit_rolling = constit_rolling.merge(c25[['timestamp', 'name', 'ticker']], on=['timestamp', 'name', 'ticker'], how='outer')

# save the rolling constituents to a csv file
constit_rolling.to_csv('../data/omxcb_constit.csv', index=False)

## <a id='toc4_2_'></a>[Index price for trading months](#toc0_)

In [10]:
# --- daily index closes -----------------------------------------------------
# Row 27 of the raw sheet carries the column header; the data starts at row 28.
price.columns = price.iloc[27]
price = price.iloc[28:].rename(
    columns={'27':None, 'Exchange Date': 'timestamp', 'Close': 'close'}
)
price['timestamp'] = pd.to_datetime(price['timestamp'], format='%Y-%m-%d')
price = (
    price[['timestamp', 'close']]
    .rename_axis(columns=None)
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

# earliest daily close, used as the starting level of the monthly series
initial_price = price['close'].iloc[0]

# --- month-end closes -------------------------------------------------------
# last observation of every calendar month
price = price.set_index('timestamp').resample('ME').last().reset_index()

# put a starting row (index -1 sorts ahead of 0) in front of the month-ends
price.loc[-1] = [pd.Timestamp('2021-01-01'), initial_price]
price = price.sort_index().reset_index(drop=True)
price['close'] = pd.to_numeric(price['close'], errors='raise')

# --- returns net of the monthly rate from `rf` ------------------------------
price['return'] = price['close'].pct_change()
price = price.merge(rf, on='timestamp', how='left')
# timestamps without a matching rate take the next available one
price['return'] -= price['discount_m'].bfill()
price = price.drop(columns='discount_m')

# cumulative series rebased to 100 on the first row
price['cumulative'] = price['return'].add(1).cumprod().mul(100)
price.loc[0, 'cumulative'] = 100.0  # first return is NaN, so pin the base value

# save to csv
price.to_csv('../data/omxcb_price.csv', index=False)