In [1]:
"""Data transformation for UK LSE data.

This script transforms the following data
1. Fundamental data

It is designed to be not following functional form or
objective orientated form to experiment different data
manipulations in notebooks easily.

All final data will be stored in a dictionary called `final`
"""
# library
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

# file path
fundamental_path = "~/db/asset_pricing/Finage_LSE_data/fundamental_quarter.csv"

# date range
min_date = "2009-01-01"
max_date = "2022-01-01"

In [2]:
"""Fundamental data"""
# path
raw = pd.read_csv(fundamental_path)
raw["acceptedDate"] = pd.to_datetime(raw["acceptedDate"])
raw["symbol"] = raw["symbol"].str.replace(".L", "", regex=False)

raw[raw["symbol"] == "3IN"]

Unnamed: 0,date,symbol,fillingDate,acceptedDate,period,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil,link,finalLink
0,2020-09-30,3IN,2020-09-30,2020-09-30,Q2,57250000.0,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,250000.0,0.0,500000.0,0.0,56500000.0,0.9869,0.0,0.0,0.0,56000000.0,0.978166,0.0,56000000.0,0.978166,0.066,0.066,847600000.0,847600000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
1,2020-06-30,3IN,2020-06-30,2020-06-30,Q1,57250000.0,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,250000.0,0.0,500000.0,0.0,56500000.0,0.9869,0.0,0.0,0.0,56000000.0,0.978166,0.0,56000000.0,0.978166,0.066,0.066,847600000.0,847600000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
2,2020-03-31,3IN,2020-03-31,2020-03-31,Q4,57250000.0,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,250000.0,0.0,500000.0,0.0,56500000.0,0.9869,0.0,0.0,0.0,56000000.0,0.978166,0.0,56000000.0,0.978166,0.066,0.066,847600000.0,847600000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
3,2020-01-31,3IN,2020-01-31,2020-01-31,Q3,57250000.0,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,250000.0,0.0,500000.0,0.0,56500000.0,0.9869,0.0,0.0,0.0,56000000.0,0.978166,0.0,56000000.0,0.978166,0.066,0.066,847600000.0,847600000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
4,2019-06-30,3IN,2019-06-30,2019-06-30,Q1,67075000.0,0.0,0.0,0.0,0.0,850000.0,0.0,0.0,1600000.0,0.0,650000.0,0.0,65275000.0,0.973164,0.0,0.0,0.0,64625000.0,0.963474,0.0,64625000.0,0.963474,0.07975,0.07975,810400000.0,810400000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
5,2019-03-31,3IN,2019-03-31,2019-03-31,Q4,67075000.0,0.0,0.0,0.0,0.0,850000.0,0.0,0.0,1600000.0,0.0,650000.0,0.0,65275000.0,0.973164,0.0,0.0,0.0,64625000.0,0.963474,0.0,64625000.0,0.963474,0.07975,0.07975,810400000.0,810400000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
6,2019-01-31,3IN,2019-01-31,2019-01-31,Q3,67075000.0,0.0,0.0,0.0,0.0,850000.0,0.0,0.0,1600000.0,0.0,650000.0,0.0,65275000.0,0.973164,0.0,0.0,0.0,64625000.0,0.963474,0.0,64625000.0,0.963474,0.07975,0.07975,810400000.0,810400000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
7,2018-06-30,3IN,2018-06-30,2018-06-30,Q1,120850000.0,0.0,0.0,0.0,0.0,750000.0,0.0,0.0,200000.0,0.0,1525000.0,0.0,121425000.0,1.004758,0.0,0.0,0.0,119900000.0,0.992139,0.0,119900000.0,0.992139,0.118,0.118,1016500000.0,1016500000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
8,2018-03-31,3IN,2018-03-31,2018-03-31,Q4,120850000.0,0.0,0.0,0.0,0.0,750000.0,0.0,0.0,200000.0,0.0,1525000.0,0.0,121425000.0,1.004758,0.0,0.0,0.0,119900000.0,0.992139,0.0,119900000.0,0.992139,0.118,0.118,1016500000.0,1016500000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...
9,2018-01-31,3IN,2018-01-31,2018-01-31,Q3,120850000.0,0.0,0.0,0.0,0.0,750000.0,0.0,0.0,200000.0,0.0,1525000.0,0.0,121425000.0,1.004758,0.0,0.0,0.0,119900000.0,0.992139,0.0,119900000.0,0.992139,0.118,0.118,1016500000.0,1016500000.0,https://find-and-update.company-information.se...,https://find-and-update.company-information.se...


In [3]:

missing_code = "-99.99"
drop_labels = [
    "date", "fillingDate", "period", "link", "finalLink"
]
rename_cols = {"acceptedDate": "date"}
time_index = pd.date_range(
    start=min_date,
    end=max_date,
    freq="M"
)
symbols = raw["symbol"].unique()
index = pd.MultiIndex.from_product(
    [symbols, time_index], names=["symbol", "date"]
)
replace_int = [0, float("inf")]

# transform quarter data to monthly data
data = raw.drop(labels=drop_labels, axis=1)\
    .rename(mapper=rename_cols, axis=1)\
    .set_index(["symbol", "date"])\
    .sort_values(by=["symbol", "date"])

data.loc["3IN"]

Unnamed: 0_level_0,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2008-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-23125000.0,0.0,0.0,0.0,0.0,23125000.0,0.0,0.0,23125000.0,0.0,0.0,17950000.0,0.0,0.035889,0.035889,499429100.0,499429100.0
2008-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-23125000.0,0.0,0.0,0.0,0.0,23125000.0,0.0,0.0,23125000.0,0.0,0.0,17950000.0,0.0,0.035889,0.035889,499429100.0,499429100.0
2008-06-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-23125000.0,0.0,0.0,0.0,0.0,23125000.0,0.0,0.0,23125000.0,0.0,0.0,17950000.0,0.0,0.035889,0.035889,499429100.0,499429100.0
2008-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-23125000.0,0.0,0.0,0.0,0.0,23125000.0,0.0,0.0,23125000.0,0.0,0.0,17950000.0,0.0,0.035889,0.035889,499429100.0,499429100.0
2009-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13100000.0,0.0,0.0,0.0,0.0,13100000.0,0.0,0.0,13100000.0,0.0,0.0,10650000.0,0.0,0.019,0.019,557052900.0,557052900.0
2009-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13100000.0,0.0,0.0,0.0,0.0,13100000.0,0.0,0.0,13100000.0,0.0,0.0,10650000.0,0.0,0.019,0.019,557052900.0,557052900.0
2009-06-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13100000.0,0.0,0.0,0.0,0.0,13100000.0,0.0,0.0,13100000.0,0.0,0.0,10650000.0,0.0,0.019,0.019,557052900.0,557052900.0
2009-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13100000.0,0.0,0.0,0.0,0.0,13100000.0,0.0,0.0,13100000.0,0.0,0.0,10650000.0,0.0,0.019,0.019,557052900.0,557052900.0
2010-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-25800000.0,0.0,0.0,0.0,0.0,25800000.0,0.0,0.0,25800000.0,0.0,0.0,22575000.0,0.0,0.039055,0.039055,576308100.0,576308100.0
2010-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-25800000.0,0.0,0.0,0.0,0.0,25800000.0,0.0,0.0,25800000.0,0.0,0.0,22575000.0,0.0,0.039055,0.039055,576308100.0,576308100.0


In [4]:
# transform data
# replace missing data with median
data = data\
    .replace(replace_int, np.nan)\
    .groupby("date")\
    .apply(lambda x: np.log(x) - np.log(x.shift(1)))\
    .groupby("date")\
    .apply(lambda x: x.fillna(x.median()))\

data.loc["3IN"]

  result = func(self.values, **kwargs)


Unnamed: 0_level_0,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2008-01-31,-0.017222,-0.419832,-0.322499,-0.02914,-0.832797,0.085501,-0.086076,1.186581,0.046042,0.047791,-0.017249,0.097928,-0.15042,0.024456,-0.466299,-0.245111,-0.172586,-0.30017,-0.092979,-0.286163,-0.230124,-0.160028,0.121067,0.110846,0.000802,0.003498
2008-03-31,-0.006685,-0.221302,-0.188538,-0.026992,-1.510443,0.177089,0.20505,,0.039593,0.068059,-0.017249,0.097426,-0.027629,-0.043642,-0.482388,-0.257806,-0.086743,-0.30017,-0.13301,-0.470095,-0.486739,-0.160911,0.041352,0.032749,-0.014856,-0.014856
2008-06-30,0.084319,-0.419832,-0.187151,0.007025,-1.306601,-0.076723,0.374693,1.186581,-0.047706,0.08368,-0.008625,0.152667,-0.027629,-0.034221,-0.465223,-0.287609,-0.104622,-0.321658,-0.143337,-0.460702,-0.407965,-0.211999,-0.123323,-0.115437,-0.095029,-0.081771
2008-09-30,0.031331,-0.213613,-0.217011,-0.002029,-1.621936,-0.116239,0.374693,1.186581,0.052085,0.08368,0.0,-0.059439,-0.218795,-0.046006,-0.565905,-0.309764,0.081762,-0.397488,-0.281786,-0.470095,-0.310528,-0.16047,-0.010027,-0.010482,-0.014856,-0.002219
2009-01-31,0.146316,0.056902,-0.121243,0.070291,-0.589291,-0.174789,-0.139407,,-0.070961,-0.094007,-0.016307,0.081007,-0.01608,-0.061607,0.099808,0.057643,-0.340219,-0.421289,-0.079066,-0.01005,-0.31291,-0.064068,-0.008167,-0.051493,-0.058756,-0.066966
2009-03-31,0.203683,0.056902,0.052111,-0.012473,-1.26638,-0.174789,0.620697,,-0.033523,0.019551,-0.174958,0.021035,-0.033347,-0.068508,0.092179,-0.166002,-0.230826,-0.449661,-0.129223,0.193386,-0.343969,-0.140644,0.004502,-0.017291,-0.152803,-0.129766
2009-06-30,0.06581,0.004997,0.006638,-0.028055,-1.252173,-0.174789,0.176951,,-0.070961,-0.190758,-0.226752,0.208535,-0.006975,-0.043112,0.067204,-0.166002,-0.472687,-0.432774,-0.072393,0.268801,-0.343969,-0.049331,-0.008167,-0.053673,-0.255396,-0.245594
2009-09-30,0.088948,0.102598,-0.090682,-0.047613,-1.580235,-0.04256,1.07049,,-0.033523,-0.007572,-0.207483,-0.40943,-0.01608,-0.075409,0.084549,-0.321706,-0.455401,-0.42669,-0.129223,-0.01005,-0.405252,-0.117575,-0.176929,-0.169776,-0.095016,-0.082264
2010-01-31,0.211034,0.022606,-0.146496,-0.03303,-0.585325,-0.047908,-1.151157,,-0.292902,-0.078024,0.034222,-0.095067,0.014547,0.02495,-0.085802,0.102953,-0.149857,-0.080105,0.118753,0.094311,-0.331396,0.022845,-0.212076,-0.191965,-0.079436,-0.069431
2010-03-31,0.075437,0.019272,-0.132404,-0.088171,-1.930513,0.164808,-0.150899,,-0.335506,0.005278,-0.073962,-0.099823,-0.155817,0.021155,-0.070716,0.117155,-0.159849,-0.287511,0.095474,0.06355,-0.375269,0.022854,-0.212076,-0.191965,-0.162535,-0.148895


In [5]:
data = data\
    .reset_index()\
    .set_index("date")\
    .groupby("symbol")\
    .resample("1M")\
    .ffill()\
    .drop(labels=["symbol"], axis=1)\
    .reindex(index)\
    .astype("float")

data.loc["3IN"]

Unnamed: 0_level_0,revenue,costOfRevenue,grossProfit,grossProfitRatio,researchAndDevelopmentExpenses,generalAndAdministrativeExpenses,sellingAndMarketingExpenses,otherExpenses,operatingExpenses,costAndExpenses,interestExpense,depreciationAndAmortization,ebitda,ebitdaratio,operatingIncome,operatingIncomeRatio,totalOtherIncomeExpensesNet,incomeBeforeTax,incomeBeforeTaxRatio,incomeTaxExpense,netIncome,netIncomeRatio,eps,epsdiluted,weightedAverageShsOut,weightedAverageShsOutDil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2009-01-31,0.146316,0.056902,-0.121243,0.070291,-0.589291,-0.174789,-0.139407,,-0.070961,-0.094007,-0.016307,0.081007,-0.01608,-0.061607,0.099808,0.057643,-0.340219,-0.421289,-0.079066,-0.01005,-0.31291,-0.064068,-0.008167,-0.051493,-0.058756,-0.066966
2009-02-28,0.146316,0.056902,-0.121243,0.070291,-0.589291,-0.174789,-0.139407,,-0.070961,-0.094007,-0.016307,0.081007,-0.01608,-0.061607,0.099808,0.057643,-0.340219,-0.421289,-0.079066,-0.01005,-0.31291,-0.064068,-0.008167,-0.051493,-0.058756,-0.066966
2009-03-31,0.203683,0.056902,0.052111,-0.012473,-1.26638,-0.174789,0.620697,,-0.033523,0.019551,-0.174958,0.021035,-0.033347,-0.068508,0.092179,-0.166002,-0.230826,-0.449661,-0.129223,0.193386,-0.343969,-0.140644,0.004502,-0.017291,-0.152803,-0.129766
2009-04-30,0.203683,0.056902,0.052111,-0.012473,-1.26638,-0.174789,0.620697,,-0.033523,0.019551,-0.174958,0.021035,-0.033347,-0.068508,0.092179,-0.166002,-0.230826,-0.449661,-0.129223,0.193386,-0.343969,-0.140644,0.004502,-0.017291,-0.152803,-0.129766
2009-05-31,0.203683,0.056902,0.052111,-0.012473,-1.26638,-0.174789,0.620697,,-0.033523,0.019551,-0.174958,0.021035,-0.033347,-0.068508,0.092179,-0.166002,-0.230826,-0.449661,-0.129223,0.193386,-0.343969,-0.140644,0.004502,-0.017291,-0.152803,-0.129766
2009-06-30,0.06581,0.004997,0.006638,-0.028055,-1.252173,-0.174789,0.176951,,-0.070961,-0.190758,-0.226752,0.208535,-0.006975,-0.043112,0.067204,-0.166002,-0.472687,-0.432774,-0.072393,0.268801,-0.343969,-0.049331,-0.008167,-0.053673,-0.255396,-0.245594
2009-07-31,0.06581,0.004997,0.006638,-0.028055,-1.252173,-0.174789,0.176951,,-0.070961,-0.190758,-0.226752,0.208535,-0.006975,-0.043112,0.067204,-0.166002,-0.472687,-0.432774,-0.072393,0.268801,-0.343969,-0.049331,-0.008167,-0.053673,-0.255396,-0.245594
2009-08-31,0.06581,0.004997,0.006638,-0.028055,-1.252173,-0.174789,0.176951,,-0.070961,-0.190758,-0.226752,0.208535,-0.006975,-0.043112,0.067204,-0.166002,-0.472687,-0.432774,-0.072393,0.268801,-0.343969,-0.049331,-0.008167,-0.053673,-0.255396,-0.245594
2009-09-30,0.088948,0.102598,-0.090682,-0.047613,-1.580235,-0.04256,1.07049,,-0.033523,-0.007572,-0.207483,-0.40943,-0.01608,-0.075409,0.084549,-0.321706,-0.455401,-0.42669,-0.129223,-0.01005,-0.405252,-0.117575,-0.176929,-0.169776,-0.095016,-0.082264
2009-10-31,0.088948,0.102598,-0.090682,-0.047613,-1.580235,-0.04256,1.07049,,-0.033523,-0.007572,-0.207483,-0.40943,-0.01608,-0.075409,0.084549,-0.321706,-0.455401,-0.42669,-0.129223,-0.01005,-0.405252,-0.117575,-0.176929,-0.169776,-0.095016,-0.082264
