In [1]:
import pandas as pd
import numpy as np
from functions import *
import re

%load_ext autoreload
%autoreload 2

In [2]:
# --- data locations -------------------------------------------------------
# folder holding the market data workbooks
path_market = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data'
# (currently unused) path_finacials = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data/Financials'

# workbook file names
monthly = 'eikon_monthly.xlsx'
# (currently unused) daily = 'eikon_daily.xlsx'
oxford = 'oxford_economics.xlsx'

# --- load -----------------------------------------------------------------
# sheet_name=None loads every sheet of the monthly workbook into a dict keyed by sheet name
eikon_dfs = pd.read_excel(f'{path_market}/{monthly}', sheet_name=None)
oxford_df = pd.read_excel(f'{path_market}/{oxford}')

# list the available sheets so later cells can pick them by name
eikon_keys = eikon_dfs.keys()
print(eikon_keys)

dict_keys(['OMX Copenhagen_PI | Leavers and', 'Unique Stocks', 'OMXCPI', 'NACE', 'Outstanding Shares', 'Trade Values', 'PE Ratio', 'Turnover', 'Ask', 'Bid'])


### Clean trade data

In [3]:
# Trade Values sheet without its first column
trade_raw = eikon_dfs['Trade Values'].iloc[:, 1:]

# the first two rows of the sheet carry the two header levels
trade_header = pd.MultiIndex.from_arrays(trade_raw.iloc[:2].values)

# everything below the two header rows is data
trade_values_df = trade_raw.iloc[2:].reset_index(drop=True)
trade_values_df.columns = trade_header

# the first remaining column becomes the row index, labelled "Timestamp"
trade_values_df = trade_values_df.set_index(trade_values_df.columns[0])
trade_values_df.index.name = "Timestamp"

# order the columns by their first header level
trade_values_df = trade_values_df.sort_index(axis=1, level=0)

  return Index(sequences[0], name=names)


In [4]:
# fields a ticker must provide to be kept
required_columns = {"Trade Close", "Trade High", "Trade Low", "Trade Open", "Trade Volume"}

# candidate tickers: the labels of the first column level
tickers = trade_values_df.columns.levels[0]

valid_tickers = []
for ticker in tickers:
    # all second-level columns belonging to this ticker
    ticker_frame = trade_values_df[ticker]

    # skip tickers that lack any of the required fields
    if not required_columns <= set(ticker_frame.columns):
        continue

    # number of rows with a value in *every* required field
    complete_rows = len(ticker_frame[list(required_columns)].dropna(how="any"))

    # fewer than 3 months w. complete data is too little history (removes ~80 tickers)
    if complete_rows < 3:
        continue

    valid_tickers.append(ticker)

# keep every second-level column of the tickers that passed the filter
trade_df = trade_values_df.loc[:, (valid_tickers, slice(None))]

# # display or continue working with the cleaned df
# display(trade_df)

### Extract valid stocks and information on them

In [5]:
# Take the first three columns of the 'Unique Stocks' sheet and rename
# 'Code' to 'Ticker'. The rename is deliberately NOT in-place: renaming a
# column slice in place is what triggered pandas' SettingWithCopyWarning,
# whereas the non-inplace form returns a new, independent DataFrame.
stocks_df = (
    eikon_dfs['Unique Stocks']
    .iloc[:, 0:3]
    .rename(columns={'Code': 'Ticker'})
)

# keep only the stocks whose ticker passed the trade-data filter (valid_tickers)
stocks_df = stocks_df[stocks_df['Ticker'].isin(valid_tickers)].reset_index(drop=True)
# display(stocks_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stocks_df.rename(columns={'Code': 'Ticker'}, inplace=True)


In [6]:
# Take columns 1-2 of the 'NACE' sheet, skipping its first row. The explicit
# copy makes nace_df an independent frame, so the column assignments below do
# not operate on a slice of the sheet stored in eikon_dfs.
nace_df = eikon_dfs['NACE'].iloc[1:, 1:3].copy()

# rename columns
nace_df.columns = ['Ticker', 'NACE']

# keep only the numeric code inside the trailing parentheses of each entry;
# expand=False returns a Series, which is what a single-column assignment needs
nace_df['NACE'] = nace_df['NACE'].str.extract(r'\((\d+(?:\.\d+)?)\)$', expand=False)

# manually map remaining NACE codes to companies
manual_nace = {'CEMAT.CO':'68.20',
               'CICC.CO^L01':'70.10',
               'DAI.CO^A02':'70.10',
               'GR4.CO^A05':'80.10',
               'GR4n1.CO^J04':'80.10',
               'GR4n2.CO^J04':'80.10',
               'IFAC.CO^D03':'64.30',
               'INVb.CO^F05':'64.30',
               'IPFCa.CO^G02':'70.10',
               'IPFCb.CO^G02':'70.10',
               'OBJCa.CO^D02':'62.01',
               'OBJCb.CO^D02':'62.01',
               'ORSTED.CO':'35.11',
               'POFLSb.CO^H06':'64.30',
               'POKAP.CO^B06':'64.30',
               'RADIb.CO^C04':'32.50',
               'TRMC.CO^H02':'64.19',
               'VEND.CO^C02':'64.19'}

# the set of tickers does not change inside the loop, so build it once
known_tickers = set(nace_df['Ticker'])

for ticker, nace_code in manual_nace.items():
    if ticker in known_tickers:
        # overwrite the code on every row belonging to this ticker
        nace_df.loc[nace_df['Ticker'] == ticker, 'NACE'] = nace_code
    else:
        # report manual entries that match no row instead of failing silently
        print(f"Ticker {ticker} not found in NACE DataFrame.")

# split "<industry>.<sub-industry>" once and reuse both parts
nace_parts = nace_df['NACE'].str.split('.', expand=True)
nace_df['NACE Industry'] = nace_parts[0]
nace_df['NACE Sub-industry'] = nace_parts[1]

In [8]:
# 'Outstanding Shares' sheet without its first column
shares_df = eikon_dfs['Outstanding Shares'].iloc[:, 1:]

# make first row the header
shares_df.columns = shares_df.iloc[0]

# Drop the header row and rename the first column to 'Ticker'. The rename is
# non-inplace on purpose: it returns a new frame, so the assignments below
# work on independent data instead of a slice of the sheet held in eikon_dfs
# (an in-place rename on such a slice raises SettingWithCopyWarning).
shares_df = shares_df[1:].rename(columns={shares_df.columns[0]: 'Ticker'})

# set the value columns to numeric; entries that cannot be parsed become NaN
shares_df.iloc[:, 1:] = shares_df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# pick one share count per ticker from the value columns.
# NOTE(review): first_valid comes from the project's `functions` module;
# presumably it returns the first non-missing value among the given columns -
# confirm against its definition.
value_columns = shares_df.columns[1:]  # fixed before 'Shares' is added
shares_df['Shares'] = shares_df.apply(lambda row: first_valid(row, value_columns), axis=1)

# drop all columns except 'Ticker' and 'Shares'
shares_df = shares_df[['Ticker', 'Shares']]

display(shares_df)

Unnamed: 0,Ticker,Shares
1,AAB.CO,2.696297e+06
2,AABn.CO^F06,2.696297e+06
3,AABn.CO^J03,2.696297e+06
4,AARHUS.CO^D12,1.620366e+07
5,AARUTD.CO^K05,4.000000e+06
...,...,...
419,VWSn.CO^F04,1.009867e+09
420,WALLS.CO^I10,2.145066e+08
421,WEVE.CO^C05,3.010864e+06
422,WEWER.CO^L05,5.525400e+04


In [None]:
# attach the share counts and the NACE classification to every stock
# (left joins on 'Ticker', so all rows of stocks_df are kept)
# NOTE: stocks_df is rebound to the merged result, so re-run the cell that
# builds stocks_df before running this one again
stocks_df = (
    stocks_df
    .merge(shares_df, how='left', on='Ticker')
    .merge(nace_df, how='left', on='Ticker')
)

# display(stocks_df)

# write the combined stock information to disk, without the row index
stocks_df.to_csv('data/stocks.csv', index=False)

Unnamed: 0,Name,Code incl. Expiration,Ticker,Shares,NACE,NACE Industry,NACE Sub-industry
0,Aalborg Boldspil,AAB.CO,AAB.CO,2.696297e+06,93.12,93,12
1,Aarhus Lokalbank,AARHUS.CO^D12 (expired),AARHUS.CO^D12,1.620366e+07,64.19,64,19
2,Aak Denmark Hldg,AARUTD.CO^K05 (expired),AARUTD.CO^K05,4.000000e+06,64.20,64,20
3,Affitech,AFFI.CO^J12 (expired),AFFI.CO^J12,4.877215e+08,72.19,72,19
4,Agat Ejendomme,AGATE.CO,AGATE.CO,1.177833e+08,68.10,68,10
...,...,...,...,...,...,...,...
336,Vestas Wind,VWS.CO,VWS.CO,1.009867e+09,28.11,28,11
337,Selskab 1979,WALLS.CO^I10 (expired),WALLS.CO^I10,2.145066e+08,68.20,68,20
338,Magasin Du Nord,WEVE.CO^C05 (expired),WEVE.CO^C05,3.010864e+06,47.71,47,71
339,Wewers,WEWER.CO^L05 (expired),WEWER.CO^L05,5.525400e+04,23.32,23,32


### Clean P/E, Turnover, Bid, and Ask data

In [10]:
# 'PE Ratio' sheet without its first column
pe_raw = eikon_dfs['PE Ratio'].iloc[:, 1:]

# the first two rows of the sheet form the two-level column header
pe_header = pd.MultiIndex.from_arrays(pe_raw.iloc[:2].values)

# the remaining rows are the data
pe_ratio_df = pe_raw.iloc[2:].reset_index(drop=True)
pe_ratio_df.columns = pe_header

# the first column becomes the row index, labelled "Timestamp"
pe_ratio_df = pe_ratio_df.set_index(pe_ratio_df.columns[0])
pe_ratio_df.index.name = "Timestamp"

# restrict to the tickers that passed the trade-data filter
pe_ratio_df = pe_ratio_df.loc[:, (valid_tickers, slice(None))]

# relabel the second-level name 'PERATIO' as 'PE Ratio'; other labels are kept
pe_ratio_df.columns = [
    (ticker, 'PE Ratio' if col == 'PERATIO' else col)
    for ticker, col in pe_ratio_df.columns
]

# number of columns that contain no value at all
count_no_pe = pe_ratio_df.isna().all(axis=0).sum()

print(f"Companies with no PE ratio in the entire period: {count_no_pe}")

# # Get the names of columns (tickers) where all values are NaN
# no_pe_columns = pe_ratio_df.columns[pe_ratio_df.isna().all(axis=0)]

# # Print the names of these columns
# print("Companies with no PE ratio in the entire period:")
# for col in no_pe_columns:
#     print(col[0])

Companies with no PE ratio in the entire period: 96


  return Index(sequences[0], name=names)


In [11]:
# 'Turnover' sheet without its first column
turnover_raw = eikon_dfs['Turnover'].iloc[:, 1:]

# rows 0-1 hold the two header levels; build the column MultiIndex from them
turnover_header = pd.MultiIndex.from_arrays(turnover_raw.iloc[:2].values)

# data rows start below the header rows
turnover_df = turnover_raw.iloc[2:].reset_index(drop=True)
turnover_df.columns = turnover_header

# use the first column as the row index and call it "Timestamp"
turnover_df = turnover_df.set_index(turnover_df.columns[0])
turnover_df.index.name = "Timestamp"

# keep only the tickers listed in valid_tickers
turnover_df = turnover_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


In [12]:
# 'Ask' sheet without its first column
ask_raw = eikon_dfs['Ask'].iloc[:, 1:]

# two-level column header taken from the sheet's first two rows
ask_header = pd.MultiIndex.from_arrays(ask_raw.iloc[:2].values)

# strip the header rows from the data and apply the new column labels
ask_df = ask_raw.iloc[2:].reset_index(drop=True)
ask_df.columns = ask_header

# first column -> row index named "Timestamp"
ask_df = ask_df.set_index(ask_df.columns[0])
ask_df.index.name = "Timestamp"

# keep only the tickers listed in valid_tickers
ask_df = ask_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


In [13]:
# 'Bid' sheet without its first column
bid_raw = eikon_dfs['Bid'].iloc[:, 1:]

# two-level column header taken from the sheet's first two rows
bid_header = pd.MultiIndex.from_arrays(bid_raw.iloc[:2].values)

# strip the header rows from the data and apply the new column labels
bid_df = bid_raw.iloc[2:].reset_index(drop=True)
bid_df.columns = bid_header

# first column -> row index named "Timestamp"
bid_df = bid_df.set_index(bid_df.columns[0])
bid_df.index.name = "Timestamp"

# keep only the tickers listed in valid_tickers
bid_df = bid_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


### Join all dataframes

In [14]:
# frames to attach to the trade data; all share the ticker/field column layout
extra_frames = [pe_ratio_df, turnover_df, ask_df, bid_df]

# outer join keeps every timestamp present in any frame, then the columns are
# ordered by the first level of the multi-index
df = trade_df.join(extra_frames, how='outer').sort_index(axis=1, level=0)

# display(df)

In [None]:
# Fill gaps inside each ticker's own date range with a backward fill.
pd.set_option('future.no_silent_downcasting', True)

# IndexSlice gives readable selectors for the two-level columns
idx = pd.IndexSlice

# iterate over the tickers actually present in df
for ticker in df.columns.get_level_values(0).unique():
    # selector for every field of this ticker
    ticker_cols = idx[ticker, :]

    # timestamps on which the ticker has at least one non-null field
    valid_idx = df.loc[:, ticker_cols].dropna(how='all').index

    # rows between the latest and the earliest valid timestamp
    # NOTE(review): slicing from max to min presumes the index runs from
    # newest to oldest - confirm the row order of df
    active_rows = slice(valid_idx.max(), valid_idx.min())

    # backward fill only inside that range
    df.loc[active_rows, ticker_cols] = df.loc[active_rows, ticker_cols].bfill()

# for now, drop all PE Ratios until we get for the remaining ~90 tickers
pe_columns = [(ticker, 'PE Ratio') for ticker in df.columns.get_level_values(0).unique()]
df = df.drop(columns=pe_columns)

# show the updated frame
display(df)

# write the result to disk
df.to_csv('data/trade.csv')