In [1]:
import pandas as pd
import numpy as np
from functions import *
import re

%load_ext autoreload
%autoreload 2

In [None]:
# set paths to data
path_market = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data'
# path_finacials = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data/Financials'

# read monthly market data from eikon
monthly = 'eikon_monthly.xlsx'
# daily = 'eikon_daily.xlsx'
oxford = 'oxford_economics.xlsx'
eikon_dfs = pd.read_excel(path_market + '/' + monthly, sheet_name=None)
eikon_keys = eikon_dfs.keys()
oxford_df = pd.read_excel(path_market + '/' + oxford)

print(eikon_keys)

dict_keys(['OMX Copenhagen_PI | Leavers and', 'Unique Stocks', 'OMXCPI', 'NACE', 'Outstanding Shares', 'Trade Values', 'PE Ratio', 'Turnover', 'Ask', 'Bid'])


### Clean trade data

In [3]:
# clean trade data
trade_values_df = eikon_dfs['Trade Values'].iloc[:,1:]

# set up multi-index for the columns
trade_values_df.columns = pd.MultiIndex.from_arrays(trade_values_df.iloc[:2].values)

# drop the first two rows as they are now headers
trade_values_df = trade_values_df.iloc[2:].reset_index(drop=True)

# set the first column as index
trade_values_df.set_index(trade_values_df.columns[0], inplace=True)
trade_values_df.index.name = "Timestamp"
trade_values_df = trade_values_df.sort_index(axis=1, level=0)

  return Index(sequences[0], name=names)


In [4]:
# required columns:
required_columns = {"Trade Close", "Trade High", "Trade Low", "Trade Open", "Trade Volume"}

# extract all tickers from the first level of the columns
tickers = trade_values_df.columns.levels[0]

valid_tickers = []

for ticker in tickers:
    # the sub-columns (second-level) for this particular ticker
    subcols = set(trade_values_df[ticker].columns)
    
    # check if all required columns are present
    if required_columns.issubset(subcols):
        
        # now check how many valid rows the ticker has.
        subdf = trade_values_df[ticker][list(required_columns)]
        
        # count rows that are non-null in *all* required columns:
        non_null_rows = subdf.dropna(how="any").shape[0]
        
        if non_null_rows >= 3: # at least 3 months w. data (removes ~80 tickers)
            valid_tickers.append(ticker)

# filter the original df to keep only valid tickers and all their second-level columns:
trade_df = trade_values_df.loc[:, (valid_tickers, slice(None))]

# # display or continue working with the cleaned df
# display(trade_df)

### Extract valid stocks and informtion on them

In [5]:
# clean stock names
stocks_df = eikon_dfs['Unique Stocks'].iloc[:,0:3]
# rename Code to Ticker
stocks_df.rename(columns={'Code': 'Ticker'}, inplace=True)

# use valid_tickers to filter the stocks_df
stocks_df = stocks_df[stocks_df['Ticker'].isin(valid_tickers)].reset_index(drop=True)
# display(stocks_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stocks_df.rename(columns={'Code': 'Ticker'}, inplace=True)


In [6]:
nace_df = eikon_dfs['NACE'].iloc[1:,1:3]

# rename columns
nace_df.columns = ['Ticker', 'NACE']

# identify the NACE codes
nace_df['NACE'] = nace_df['NACE'].str.extract(r'\((\d+(?:\.\d+)?)\)$')

# manually map remaining NACE codes to companies
manual_nace = {'CEMAT.CO':'68.20',
               'CICC.CO^L01':'70.10',
               'DAI.CO^A02':'70.10',
               'GR4.CO^A05':'80.10',
               'GR4n1.CO^J04':'80.10',
               'GR4n2.CO^J04':'80.10',
               'IFAC.CO^D03':'64.30',
               'INVb.CO^F05':'64.30',
               'IPFCa.CO^G02':'70.10',
               'IPFCb.CO^G02':'70.10',
               'OBJCa.CO^D02':'62.01',
               'OBJCb.CO^D02':'62.01',
               'ORSTED.CO':'35.11',
               'POFLSb.CO^H06':'64.30',
               'POKAP.CO^B06':'64.30',
               'RADIb.CO^C04':'32.50',
               'TRMC.CO^H02':'64.19',
               'VEND.CO^C02':'64.19'}

for ticker, nace_code in manual_nace.items():
    if ticker in nace_df['Ticker'].values:
        nace_df.loc[nace_df['Ticker'] == ticker, 'NACE'] = nace_code
    else:   
        print(f"Ticker {ticker} not found in NACE DataFrame.")

# split the NACE codes into separate columns
nace_df['NACE Industry'] = nace_df['NACE'].str.split('.', expand=True)[0]
nace_df['NACE Sub-industry'] = nace_df['NACE'].str.split('.', expand=True)[1]

In [7]:
# print tickers w. missing NACE codes
missing_nace = nace_df[nace_df['NACE'].isna()]
print("Missing NACE codes:")
print(missing_nace[['Ticker', 'NACE']])

Missing NACE codes:
           Ticker NACE
393  TPSLn.CO^F04  NaN
394  TPSLn.CO^J06  NaN


In [8]:
shares_df = eikon_dfs['Outstanding Shares'].iloc[:,1:]

# make first row the header
shares_df.columns = shares_df.iloc[0]
shares_df = shares_df[1:]

# rename the first column to 'Ticker'
shares_df.rename(columns={shares_df.columns[0]: 'Ticker'}, inplace=True)

# set columns to type numeric and findf the valid first occurrence
shares_df.iloc[:, 1:] = shares_df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')
shares_df['Shares'] = shares_df.apply(lambda row: first_valid(row, shares_df.columns[1:]),axis=1)

# drop all columns except 'Ticker' and 'Shares'
shares_df = shares_df[['Ticker', 'Shares']]

display(shares_df)

Unnamed: 0,Ticker,Shares
1,AAB.CO,2.696297e+06
2,AABn.CO^F06,2.696297e+06
3,AABn.CO^J03,2.696297e+06
4,AARHUS.CO^D12,1.620366e+07
5,AARUTD.CO^K05,4.000000e+06
...,...,...
419,VWSn.CO^F04,1.009867e+09
420,WALLS.CO^I10,2.145066e+08
421,WEVE.CO^C05,3.010864e+06
422,WEWER.CO^L05,5.525400e+04


In [9]:
# join the stocks_df with the nace_df df
stocks_df = stocks_df.merge(shares_df, how='left', on='Ticker')
stocks_df = stocks_df.merge(nace_df, how='left', on='Ticker')

display(stocks_df)

Unnamed: 0,Name,Code incl. Expiration,Ticker,Shares,NACE,NACE Industry,NACE Sub-industry
0,Aalborg Boldspil,AAB.CO,AAB.CO,2.696297e+06,93.12,93,12
1,Aarhus Lokalbank,AARHUS.CO^D12 (expired),AARHUS.CO^D12,1.620366e+07,64.19,64,19
2,Aak Denmark Hldg,AARUTD.CO^K05 (expired),AARUTD.CO^K05,4.000000e+06,64.20,64,20
3,Affitech,AFFI.CO^J12 (expired),AFFI.CO^J12,4.877215e+08,72.19,72,19
4,Agat Ejendomme,AGATE.CO,AGATE.CO,1.177833e+08,68.10,68,10
...,...,...,...,...,...,...,...
336,Vestas Wind,VWS.CO,VWS.CO,1.009867e+09,28.11,28,11
337,Selskab 1979,WALLS.CO^I10 (expired),WALLS.CO^I10,2.145066e+08,68.20,68,20
338,Magasin Du Nord,WEVE.CO^C05 (expired),WEVE.CO^C05,3.010864e+06,47.71,47,71
339,Wewers,WEWER.CO^L05 (expired),WEWER.CO^L05,5.525400e+04,23.32,23,32


### Clean P/E, Turnover, Bid, and Ask data

In [7]:
pe_ratio_df = eikon_dfs['PE Ratio'].iloc[:,1:]

# convert the extracted values to strings to prevent dtype inference issues
pe_ratio_df.columns = pd.MultiIndex.from_arrays(pe_ratio_df.iloc[:2].values)

# drop the first two rows as they are now headers
pe_ratio_df = pe_ratio_df.iloc[2:].reset_index(drop=True)

# set the first column as index
pe_ratio_df.set_index(pe_ratio_df.columns[0], inplace=True)
pe_ratio_df.index.name = "Timestamp"

# filter to only include valid tickers
pe_ratio_df = pe_ratio_df.loc[:, (valid_tickers, slice(None))]

# Count columns (tickers) where all values are NaN
count_no_pe = pe_ratio_df.isna().all(axis=0).sum()

print(f"Companies with no PE ratio in the entire period: {count_no_pe}")

# Get the names of columns (tickers) where all values are NaN
no_pe_columns = pe_ratio_df.columns[pe_ratio_df.isna().all(axis=0)]

# Print the names of these columns
print("Companies with no PE ratio in the entire period:")
for col in no_pe_columns:
    print(col[0])

Companies with no PE ratio in the entire period: 96
Companies with no PE ratio in the entire period:
AARUTD.CO^K05
AFFI.CO^J12
ALBCa.CO^F02
ALBCb.CO^F02
ALMBPbn.CO^B04
BANKTR.CO^A08
BHJ.CO^L04
BIOPbn.CO^E06
BIOPbn.CO^J04
BIOS.CO^B07
BKLC.CO^A02
BRIT.CO^H02
CHEMb.CO^F03
CICC.CO^L01
CIMBER.CO^E12
CONSH.CO^L04
CUR.CO^D09
CWOCb.CO^J01
D1912a.CO^F03
D1912b.CO^F03
DAI.CO^A02
DATR.CO^G03
DKAP.CO^A05
EBHn.CO^E07
ECOM.CO^H04
EGHHn.CO^E02
ESI.CO^I02
FALCK.CO^B05
FALCK.CO^G04
FIP.CO^G04
FJORD.CO^H11
FO-AIR.CO^F14
FORAS.CO^J05
FORSTn.CO^F07
FRINV.CO^A02
FUNKI.CO^J04
GFHb.CO^B05
GFHc.CO^B05
GR4.CO^A05
GREENH.CO
GWE.CO^D12
HAFSa.CO^G04
HAFSb.CO^G04
HEFAb.CO^B05
HEFAs.CO^B05
HLJC.CO^I01
HLUNa.CO
HLUNb.CO
IDATA.CO^H02
IFAC.CO^D03
INC.CO^C04
INVb.CO^F05
IPFCa.CO^G02
IPFCb.CO^G02
ISS.CO^F05
JAMO.CO^I01
JKGb.CO^B04
JUNC.CO^D04
KAP.CO^D10
KEOPSn2.CO^B07
KMPNb.CO^E05
LDBCb.CO^H01
MAXn.CO^G06
MMINV.CO^G02
NAVI.CO^H02
NEG.CO^E04
NOWA.CO^L05
NUNA.CO^G16
OBJCb.CO^D02
PENNEO.CO^C25
POTAb.CO^F05
RADIb.CO^C04
ROS

  return Index(sequences[0], name=names)


In [8]:
turnover_df = eikon_dfs['Turnover'].iloc[:,1:]

# convert the extracted values to strings to prevent dtype inference issues
turnover_df.columns = pd.MultiIndex.from_arrays(turnover_df.iloc[:2].values)

# drop the first two rows as they are now headers
turnover_df = turnover_df.iloc[2:].reset_index(drop=True)

# set the first column as index
turnover_df.set_index(turnover_df.columns[0], inplace=True)
turnover_df.index.name = "Timestamp"

# filter to only include valid tickers
turnover_df = turnover_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


In [9]:
ask_df = eikon_dfs['Ask'].iloc[:,1:]

# convert the extracted values to strings to prevent dtype inference issues
ask_df.columns = pd.MultiIndex.from_arrays(ask_df.iloc[:2].values)

# drop the first two rows as they are now headers
ask_df = ask_df.iloc[2:].reset_index(drop=True)

# set the first column as index
ask_df.set_index(ask_df.columns[0], inplace=True)
ask_df.index.name = "Timestamp"
# filter to only include valid tickers
ask_df = ask_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


In [10]:
bid_df = eikon_dfs['Bid'].iloc[:,1:]

# convert the extracted values to strings to prevent dtype inference issues
bid_df.columns = pd.MultiIndex.from_arrays(bid_df.iloc[:2].values)

# drop the first two rows as they are now headers
bid_df = bid_df.iloc[2:].reset_index(drop=True)

# set the first column as index
bid_df.set_index(bid_df.columns[0], inplace=True)
bid_df.index.name = "Timestamp"

# filter to only include valid tickers
bid_df = bid_df.loc[:, (valid_tickers, slice(None))]

  return Index(sequences[0], name=names)


In [11]:
# display the cleaned DataFrame
display(pe_ratio_df)
display(turnover_df)
display(ask_df)
display(bid_df)

Unnamed: 0_level_0,AAB.CO,AARHUS.CO^D12,AARUTD.CO^K05,AFFI.CO^J12,AGATE.CO,AGFEb.CO,ALBCa.CO^F02,ALBCb.CO^F02,ALKb.CO,ALMB.CO,...,VJBA.CO,VORD.CO^A14,VORDn.CO^D04,VTHa.CO^E03,VTJB.CO^E03,VWS.CO,WALLS.CO^I10,WEVE.CO^C05,WEWER.CO^L05,ZELA.CO
Unnamed: 0_level_1,PE Ratio,PE Ratio,PERATIO,PERATIO,PE Ratio,PE Ratio,PERATIO,PERATIO,PE Ratio,PE Ratio,...,PE Ratio,PE Ratio,PERATIO,PERATIO,PERATIO,PE Ratio,PE Ratio,PERATIO,PERATIO,PE Ratio
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-03-31,,,,,,61.09,,,37.07,32.9,...,6.66,,,,,30.08,,,,
2025-02-28,,,,,,5.91,,,40.7,33.79,...,7.18,,,,,29.37,,,,
2025-01-31,,,,,,5.24,,,42.94,31.28,...,5.44,,,,,268.62,,,,
2024-12-31,,,,,,5.18,,,44.58,29.43,...,5.2,,,,,261.97,,,,
2024-11-30,,,,,,5.4,,,44.16,28.16,...,5.01,,,,,285.98,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006-05-31,,12.272729,,,16.770218,,,,0.107,0.805716,...,16.14572,73.57,,,,,56.88,,,
2006-04-30,,1.04,,,15.839587,,,,0.1065,0.942168,...,2.569647,21.25,,,,,0.3429,,,
2006-03-31,,1.027273,,,43.991168,,,,0.1065,0.94298,...,2.397084,20.95,,,,,0.3429,,,
2006-02-28,,0.937273,,,39.891362,,,,0.09,0.833331,...,2.458592,20.55,,,,,0.3429,,,


Unnamed: 0_level_0,AAB.CO,AARHUS.CO^D12,AARUTD.CO^K05,AFFI.CO^J12,AGATE.CO,AGFEb.CO,ALBCa.CO^F02,ALBCb.CO^F02,ALKb.CO,ALMB.CO,...,VJBA.CO,VORD.CO^A14,VORDn.CO^D04,VTHa.CO^E03,VTJB.CO^E03,VWS.CO,WALLS.CO^I10,WEVE.CO^C05,WEWER.CO^L05,ZELA.CO
Unnamed: 0_level_1,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,...,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover,Turnover
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-03-31,229430,,,,718760,3288270,,,641050040,1407262510,...,55420600,,,,,7116919720,,,,5991152250
2025-02-28,106690,,,,728000,3237570,,,753122380,526543620,...,73488330,,,,,7880510300,,,,2801495960
2025-01-31,445780,,,,1932060,1476680,,,546700400,702186130,...,50707920,,,,,7558676840,,,,2572094360
2024-12-31,155480,,,,1114810,1814450,,,600302620,531818850,...,45232170,,,,,6750228180,,,,2995576070
2024-11-30,182150,,,,558800,2741300,,,645320210,341338490,...,24542840,,,,,10524957850,,,,3587543140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001-05-31,504856,380975,7094.243,44803074.1,506771667.41,514826.5,,89.506,82449376.44,3239362,...,3128355,302844,,189.31,8205.689,4651320424,,6343.559,116,
2001-04-30,446718,669030,10970.953,7828368.95,99709160.88,454340,,11.2,62446436,2089608,...,3068245,1404554,,120473.684,9181.995,4315348880,159.268,3656.834,885,
2001-03-31,438566,2045865,51498.272,42753921.94,146736322.4,80870,116.7,147.6,42281993.69,4887626,...,7381276,1146487,,80.29,18496.089,5327587992,,10316.932,220.8,
2001-02-28,2047656,1911514,13668.485,28954390.88,225252517.16,61150,93.127,600.6,46326149.57,6349141,...,5240633,1363983,,118.524,33111.471,3841219984,,5899.299,160,


Unnamed: 0_level_0,AAB.CO,AARHUS.CO^D12,AARUTD.CO^K05,AFFI.CO^J12,AGATE.CO,AGFEb.CO,ALBCa.CO^F02,ALBCb.CO^F02,ALKb.CO,ALMB.CO,...,VJBA.CO,VORD.CO^A14,VORDn.CO^D04,VTHa.CO^E03,VTJB.CO^E03,VWS.CO,WALLS.CO^I10,WEVE.CO^C05,WEWER.CO^L05,ZELA.CO
Unnamed: 0_level_1,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,...,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close,Ask Close
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-03-31,31,,,,1.41,0.806,,,138.5,16.92,...,4.49,,,,,100.7,,,,538.5
2025-02-28,33.6,,,,1.41,0.8,,,153.6,16.19,...,4.84,,,,,101.45,,,,665
2025-01-31,34.2,,,,1.34,0.7,,,163.6,15.01,...,4.5,,,,,99.08,,,,738
2024-12-31,34.6,,,,1.45,0.676,,,158.7,14.07,...,4.29,,,,,98.74,,,,716.5
2024-11-30,35.4,,,,1.45,0.716,,,162.1,13.68,...,4.13,,,,,109.7,,,,730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001-05-31,3137.619772,33.421277,185,91.801443,145.34824,19.071839,1167,695,7.497089,10.233894,...,12.472356,361.3155,,280,480,77.000014,,492,650,
2001-04-30,3346.794423,33.421277,179,80.507259,131.126771,11.443103,,700,6.788703,9.42168,...,12.386928,352.97745,,300,460,70.374431,3,484.5,600,
2001-03-31,3660.556401,33.421277,182,81.086448,131.608854,11.443103,,700,7.526606,8.85313,...,12.301501,352.97745,,,405,64.465128,3,500,690,
2001-02-28,3765.143726,32.998222,165,110.045894,135.947608,12.714559,,,8.028379,10.964886,...,12.386928,358.53615,,,390,71.80699,3,455,800,


Unnamed: 0_level_0,AAB.CO,AARHUS.CO^D12,AARUTD.CO^K05,AFFI.CO^J12,AGATE.CO,AGFEb.CO,ALBCa.CO^F02,ALBCb.CO^F02,ALKb.CO,ALMB.CO,...,VJBA.CO,VORD.CO^A14,VORDn.CO^D04,VTHa.CO^E03,VTJB.CO^E03,VWS.CO,WALLS.CO^I10,WEVE.CO^C05,WEWER.CO^L05,ZELA.CO
Unnamed: 0_level_1,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,...,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close,Bid Close
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-03-31,30.8,,,,1.37,0.794,,,138.3,16.9,...,4.48,,,,,100.6,,,,537.5
2025-02-28,32.6,,,,1.38,0.786,,,153.4,16.17,...,4.82,,,,,101.4,,,,664
2025-01-31,33.2,,,,1.32,0.694,,,163.2,14.99,...,4.49,,,,,99.04,,,,737
2024-12-31,33.6,,,,1.44,0.67,,,158.4,14.05,...,4.26,,,,,98.68,,,,715.5
2024-11-30,34.8,,,,1.42,0.702,,,161.9,13.67,...,4.11,,,,,109.65,,,,729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001-05-31,3006.885615,32.575168,180,91.222254,144.143031,16.528927,,,7.408541,10.071451,...,12.301501,358.53615,,270,470,76.641874,,490,600,
2001-04-30,3189.913435,32.575168,171,77.032126,130.644687,10.934521,,,6.729671,9.015573,...,12.216074,348.3452,,310,450,70.195362,2.7,475,400,
2001-03-31,3529.822244,32.575168,177,75.294559,130.644687,10.171647,1167,700,7.438057,8.284581,...,12.216074,348.3452,,260,400,62.8535,2.7,495,400,
2001-02-28,3660.556401,32.575168,163,107.14995,134.98344,6.35728,1167,700,7.969347,10.315115,...,12.216074,353.9039,,310,385,71.090711,2.6,454,,


### Join all dataframes

In [13]:
df = trade_df.join([pe_ratio_df, turnover_df, ask_df, bid_df],how='outer')

# Sort columns by the first level of the multi-index
df = df.sort_index(axis=1, level=0)

display(df)

Unnamed: 0_level_0,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AARHUS.CO^D12,...,WEWER.CO^L05,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO
Unnamed: 0_level_1,Ask Close,Bid Close,PE Ratio,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume,Turnover,Ask Close,...,Turnover,Ask Close,Bid Close,PE Ratio,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume,Turnover
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001-01-31,4183.493029,4026.612041,,4026.612041,4183.493029,2614.683143,2771.564132,691.536948,2255122,31.306006,...,,,,,,,,,,
2001-02-28,3765.143726,3660.556401,,3765.143726,4235.786692,3399.088086,4131.199367,506.103389,2047656,32.998222,...,160,,,,,,,,,
2001-03-31,3660.556401,3529.822244,,3503.675412,3765.143726,3451.381749,3765.143726,117.279985,438566,33.421277,...,220.8,,,,,,,,,
2001-04-30,3346.794423,3189.913435,,3189.913435,4183.493029,3189.913435,4183.493029,126.305935,446718,33.421277,...,885,,,,,,,,,
2001-05-31,3137.619772,3006.885615,,2876.151458,3399.088086,2876.151458,3137.619772,165.335521,504856,33.421277,...,116,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-30,35.4,34.8,,34.6,39.2,34.6,37.4,4963,182150,,...,,730,729,,730,893,672.5,789.5,4650485,3587543140
2024-12-31,34.6,33.6,,34.6,35.4,33,34.8,4570,155480,,...,,716.5,715.5,,715.5,819,587,728,4138727,2995576070
2025-01-31,34.2,33.2,,34.2,35,30.8,34.8,13429,445780,,...,,738,737,,735.5,808,682.5,718.5,3538390,2572094360
2025-02-28,33.6,32.6,,32.6,34.4,32.2,34.2,3204,106690,,...,,665,664,,663,788,657,714.5,3876841,2801495960


In [14]:
# backward fill data if there are gaps in the date range
# create an IndexSlice for easier multi-index slicing
pd.set_option('future.no_silent_downcasting', True)
idx = pd.IndexSlice

# loop over the tickers that are actually in the df
for ticker in df.columns.get_level_values(0).unique():
    # extract the sub-dataframe for this ticker using .loc with IndexSlice
    subdf = df.loc[:, idx[ticker, :]]
    
    # find the index range where the ticker has any valid data
    valid_idx = subdf.dropna(how='all').index

    # use backward fill in the date range
    df.loc[valid_idx.max():valid_idx.min(), idx[ticker, :]] = df.loc[valid_idx.max():valid_idx.min(), idx[ticker, :]].bfill()

# display the updated df
display(df)

# # save df
# df.to_csv('data/df.csv')

Unnamed: 0_level_0,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AAB.CO,AARHUS.CO^D12,...,WEWER.CO^L05,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO,ZELA.CO
Unnamed: 0_level_1,Ask Close,Bid Close,PE Ratio,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume,Turnover,Ask Close,...,Turnover,Ask Close,Bid Close,PE Ratio,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume,Turnover
Timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001-01-31,4183.493029,4026.612041,,4026.612041,4183.493029,2614.683143,2771.564132,691.536948,2255122,31.306006,...,,,,,,,,,,
2001-02-28,3765.143726,3660.556401,,3765.143726,4235.786692,3399.088086,4131.199367,506.103389,2047656,32.998222,...,160,,,,,,,,,
2001-03-31,3660.556401,3529.822244,,3503.675412,3765.143726,3451.381749,3765.143726,117.279985,438566,33.421277,...,220.8,,,,,,,,,
2001-04-30,3346.794423,3189.913435,,3189.913435,4183.493029,3189.913435,4183.493029,126.305935,446718,33.421277,...,885,,,,,,,,,
2001-05-31,3137.619772,3006.885615,,2876.151458,3399.088086,2876.151458,3137.619772,165.335521,504856,33.421277,...,116,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-30,35.4,34.8,,34.6,39.2,34.6,37.4,4963,182150,,...,,730,729,,730,893,672.5,789.5,4650485,3587543140
2024-12-31,34.6,33.6,,34.6,35.4,33,34.8,4570,155480,,...,,716.5,715.5,,715.5,819,587,728,4138727,2995576070
2025-01-31,34.2,33.2,,34.2,35,30.8,34.8,13429,445780,,...,,738,737,,735.5,808,682.5,718.5,3538390,2572094360
2025-02-28,33.6,32.6,,32.6,34.4,32.2,34.2,3204,106690,,...,,665,664,,663,788,657,714.5,3876841,2801495960
