In [2]:
import pandas as pd
import numpy as np

df = pd.read_stata("./serrano/serrano_2024_Stata/serrano.dta")

In [11]:
display(len(df))
display(df["ORGNR"].nunique())

15327519

1550192

In [15]:
df.head()

Unnamed: 0,ORGNR,ser_jurform,ser_year,ser_pnr,bransch_sni1,ser_aas,ser_inregyr,bransch_sni2,bransch_sni3,bransch_sni4,...,br10g_agtskel,br10h_resarb,br13a_ksklev,br13b_kskknc,br13c_kskov,br15a_lskknc,br15b_lskov,br15c_obllan,ser_regdat,regfall
0,1324107000.0,31.0,2000.0,38237.0,,0.0,1.0,,,,...,,,,,,,,,2000-02-13,SCB
1,2021000000.0,81.0,1998.0,10333.0,75111.0,0.0,1.0,,,,...,,,,,,,,,1975-01-01,SCB
2,2021000000.0,81.0,1998.0,10317.0,75232.0,1.0,1.0,,,,...,,,,,,,,,1975-01-05,SCB
3,2021000000.0,81.0,1999.0,10317.0,75232.0,1.0,1.0,,,,...,,,,,,,,,1975-01-05,SCB
4,2021000000.0,81.0,2000.0,10317.0,75232.0,1.0,1.0,,,,...,,,,,,,,,1975-01-05,SCB


# General Filters

In [None]:
"""
These filter should be thought through and written about in the methodology section.
We want to avoid cherry-picking the data, but we also want to avoid including data that is not relevant to the study.

We also want to avoid survuvalrship bias, i.e., we want to avoid including only the companies that have survived.
"""
# Maybe add something about negative equity, i.e., companies that are bankrupt or close to bankruptcy.
filtered_df = df[(df['ser_year'] >= 2012) & # just keep 2012-2019, stable period with no major economic crises
                 (df['ser_year'] <= 2019) & # and stable interest rates, bnp growth, inflation
                 (df['ser_jurform'] == 49) & # aktiebolag
                 (df['ser_aktiv'] == 1) & # active companies
                 (df['ser_ftgkategori'] == 30) & # private companies, i.e., not state-owned etc.
                 (df['ser_stklf'] >= 3) & # companies with at least 10 employee.
                 (df['br09_tillgsu'] > 0) & # remove companies with no assets (there aren't many of those)
                 (df['knc_kncfall'] == 1)] # only include independet companies, i.e., not subsidiaries or parent companies
# unsure about the last one, might remove to make the dataset larger.

In [27]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

96627

26033

In [29]:
# Sort the dataframe so ORGNR are together and sorted by year after that.
filtered_df = filtered_df.sort_values(by=["ORGNR", "ser_year"])

In [31]:
display(filtered_df.head(20))

Unnamed: 0,ORGNR,ser_jurform,ser_year,ser_pnr,bransch_sni1,ser_aas,ser_inregyr,bransch_sni2,bransch_sni3,bransch_sni4,...,br10g_agtskel,br10h_resarb,br13a_ksklev,br13b_kskknc,br13c_kskov,br15a_lskknc,br15b_lskov,br15c_obllan,ser_regdat,regfall
417293,5560012000.0,49.0,2012.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,1114.0,4102.0,0.0,3352.0,0.0,0.0,0.0,1867-10-13,BOL
417294,5560012000.0,49.0,2013.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,1397.0,4198.0,0.0,3280.0,0.0,0.0,0.0,1867-10-13,BOL
417295,5560012000.0,49.0,2014.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,2154.0,3881.0,0.0,3705.0,0.0,0.0,0.0,1867-10-13,BOL
417296,5560012000.0,49.0,2015.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,603.0,5276.0,0.0,3521.0,0.0,0.0,0.0,1867-10-13,BOL
417297,5560012000.0,49.0,2016.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,513.0,4124.0,0.0,3772.0,0.0,0.0,0.0,1867-10-13,BOL
417298,5560012000.0,49.0,2017.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,0.0,4798.0,0.0,3832.0,0.0,0.0,0.0,1867-10-13,BOL
417299,5560012000.0,49.0,2018.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,380.0,4979.0,0.0,3973.0,0.0,0.0,0.0,1867-10-13,BOL
417300,5560012000.0,49.0,2019.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.0,781.0,4441.0,0.0,4613.0,0.0,0.0,0.0,1867-10-13,BOL
417335,5560012000.0,49.0,2012.0,73480.0,63220.0,1.0,1.0,,,,...,0.0,371.0,301.0,0.0,133.0,0.0,0.0,0.0,1893-06-17,BOL
417336,5560012000.0,49.0,2013.0,73421.0,63220.0,1.0,1.0,,,,...,0.0,44.0,288.0,0.0,994.0,0.0,0.0,0.0,1893-06-17,BOL


In [32]:
filtered_df["ser_year"].value_counts()

ser_year
2018.0    12486
2017.0    12337
2016.0    12280
2019.0    12258
2012.0    11983
2015.0    11886
2013.0    11752
2014.0    11645
Name: count, dtype: int64

In [None]:
"""
Only include companies that fit all criteria in order, no year should be missing from the series.

Maybe do this.
"""

# Growth Variable

In [33]:
"""
Adding a variable for growth.

This is according to OECD's definition of high-growth firms,
which is defined as firms with an average annualized growth greater than 20%
per annum over a three-year period, and with ten or more employees at the beginning of the period.
"""

# Calculate the annual growth rate for each company and add it as a new column
filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()

# Identify high-growth periods and create a new binary column
filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
    lambda x: ((x['TURNOVER_GROWTH'] > 0.20).rolling(window=3).sum() == 3)
).reset_index(level=0, drop=True).astype(int)



  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(


In [36]:
filtered_df["HIGH_GROWTH"].value_counts()

HIGH_GROWTH
0    94248
1      890
Name: count, dtype: int64

# Industry Variable

In [34]:
"""
There are a lot of columns in the dataset that is about the comapnies industry, a lot of different SNI codes.
But we've decided to use the 'bransch_borsbransch_konv' column as the industry variable. Which is a conversion of the SNI codes
to fewer branches.

Some companies have changed industry over time, so we determine the most frequent industry for each company and add it as a new column
to each row assicated with that company.
"""

# add one industry to all companies.
dict_of_industries = {
    10: 'Energy & Environment',
    15: 'Materials',
    20: 'Industrial goods',
    22: 'Construction industry',
    25: 'Shopping goods',
    30: 'Convenience goods',
    35: 'Health & Education',
    40: 'Finance & Real estate',
    45: 'IT & Electronics',
    50: 'Telecom & Media',
    60: 'Corporate services',
    98: 'Other',
    99: 'SNI07 missing'
}

# Determine the most frequent 'bransch_borsbransch_konv' value for each company
most_frequent_industry = filtered_df.groupby('ORGNR')['bransch_borsbransch_konv'].agg(lambda x: x.mode()[0])

# Map the most frequent 'bransch_borsbransch_konv' value to the corresponding industry name
most_frequent_industry = most_frequent_industry.map(dict_of_industries)

# Add the new 'INDUSTRY' column to the DataFrame
filtered_df = filtered_df.merge(most_frequent_industry.rename('INDUSTRY'), on='ORGNR')

# Filter out rows where the 'INDUSTRY' column is 'SNI07 missing' or 'Other'
filtered_df = filtered_df[~filtered_df['INDUSTRY'].isin(['SNI07 missing', 'Other'])]

# Independent Variables

In [38]:
"""
Firm size

Some articles use the log of sales, others the log of assets. We'll add both to the dataset and see which one works best.
"""

filtered_df["SIZE_LOG_ASSETS"] = np.log(filtered_df["br09_tillgsu"])
filtered_df["SIZE_LOG_SALES"] = np.log(filtered_df["rr01_ntoms"])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
"""
Asset tangibility

Most common thing to add as a proxy is the ratio of tangible fixed assets to total assets.
Maybe include intangible assets as well, are such assets (patents, brands etc) used as collateral?
"""

# tangible assets / total assets
filtered_df["ASSET_TANGIBILITY"] = filtered_df["br02_matanlsu"] / filtered_df["br09_tillgsu"]

In [41]:
"""
liquidity on debt

Cole use cash and cash equivalents to total assets, but we could also use current ratio or quick ratio.
"""

# Liquid assets (cash and market securities) to total assets
filtered_df["LIQUIDITY"] = filtered_df["br07_kplackaba"] / filtered_df["br09_tillgsu"]

# Current ratio: current assets divided by current liabilities
filtered_df["CURRENT_RATIO"] = filtered_df["br08_omstgsu"] / filtered_df["br13_ksksu"]

# Quick ratio: (current assets - inventory) divided by current liabilities
filtered_df["QUICK_RATIO"] = (filtered_df["br08_omstgsu"] - filtered_df["br06c_lagersu"]) / filtered_df["br13_ksksu"]


In [None]:
"""
Profitability

There are many measures for this, RoA, RoE, operating margin, net margin etc.
"""

# Profit margin according to serrano is already defined as 'ny_vinstprc'

In [42]:
"""
Tax shields

For now I only find definition in Vanacker & Manigart (2010).
"""

# Debt tax shields: Interest expenses / Total Assets
filtered_df['DEBT_TAX_SHIELDS'] = filtered_df['rr09_finkostn'] / filtered_df['br09_tillgsu']

# Non-debt tax shields: Depreciation and amortization / Total Assets
filtered_df['NON_DEBT_TAX_SHIELDS'] = filtered_df['rr05_avskriv'] / filtered_df['br09_tillgsu']

In [None]:
"""
Tax

Do we mean the corporate tax in Sweden or the % tax the companies actually pay?
"""

In [43]:
"""
Growth opportunities
"""

# % change in total assets
filtered_df['GROWTH_OPPORTUNITIES_ASSETS'] = filtered_df.groupby('ORGNR')['br09_tillgsu'].pct_change()

# % change in sales
filtered_df['GROWTH_OPPORTUNITIES_SALES'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()


  filtered_df['GROWTH_OPPORTUNITIES_SALES'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()


In [58]:
"""
Age

Need to discuss this since age could be year incorporated vs todays date, or 2019, or for every year of observation
for one company.
"""

# Change reg date to only year
filtered_df["ser_regdat"] = pd.to_datetime(filtered_df["ser_regdat"])
filtered_df["ser_regdat"] = filtered_df["ser_regdat"].dt.year

# Age of the company (year of observation - year of registration)
filtered_df["AGE"] = filtered_df["ser_year"] - filtered_df["ser_regdat"]

# Dependent Variables

In [67]:
"""
Leverage
"""

# Total debt to total assets
filtered_df['LEVERAGE_TOTAL'] = (filtered_df['br15_lsksu'] + filtered_df['br13_ksksu']) / filtered_df['br09_tillgsu']

# Long-term debt to total assets (all non-current liabilities, could be other things)
filtered_df['LEVERAGE_LONG'] = filtered_df['br15_lsksu'] / filtered_df['br09_tillgsu']

# Short-term debt to total assets (all current liabilities, could be other things)
filtered_df['LEVERAGE_SHORT'] = filtered_df['br13_ksksu'] / filtered_df['br09_tillgsu']

# Misc.

In [47]:
# save to a excel file
filtered_df.head(10000).to_excel("filtered_df.xlsx", index=False)

# Split the dataset in HGF vs non-HGF

In [68]:
# Identify HGF ORGNR
high_growth_ids = filtered_df[filtered_df["HIGH_GROWTH"] == 1]["ORGNR"].unique()

# Create a new DataFrame with only high-growth firms
high_growth_df = filtered_df[filtered_df["ORGNR"].isin(high_growth_ids)]

# Create a new DataFrame with only non-high-growth firms
non_high_growth_df = filtered_df[~filtered_df["ORGNR"].isin(high_growth_ids)]

In [69]:
high_growth_df["ORGNR"].nunique(), non_high_growth_df["ORGNR"].nunique()

(654, 24997)

# Descriptive Data

In [85]:
"""
Industry
"""
# Get one row per company (HGF)
unique_HGFs = high_growth_df.drop_duplicates(subset=["ORGNR"])

# Get one row per company (non_HGF)
unique_non_HGFs = non_high_growth_df.drop_duplicates(subset=["ORGNR"])

display(unique_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)
display(unique_non_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)

INDUSTRY
Corporate services       29.357798
Construction industry    23.547401
Health & Education       13.608563
Shopping goods           13.302752
IT & Electronics          8.409786
Industrial goods          4.434251
Convenience goods         2.599388
Finance & Real estate     2.293578
Materials                 1.376147
Telecom & Media           0.611621
Energy & Environment      0.458716
Name: proportion, dtype: float64

INDUSTRY
Corporate services       23.766852
Shopping goods           21.898628
Construction industry    21.646598
Health & Education        8.369004
Industrial goods          8.120975
Convenience goods         7.172861
IT & Electronics          3.788455
Finance & Real estate     1.992239
Materials                 1.608193
Telecom & Media           1.096132
Energy & Environment      0.540065
Name: proportion, dtype: float64

In [None]:
"""
Leverage
"""
display(high_growth_df["LEVERAGE_SHORT"].describe())
display(non_high_growth_df["LEVERAGE_SHORT"].describe())
display(high_growth_df["LEVERAGE_LONG"].describe())
display(non_high_growth_df["LEVERAGE_LONG"].describe())
display(high_growth_df["LEVERAGE_TOTAL"].describe())
display(non_high_growth_df["LEVERAGE_TOTAL"].describe())

Leverage


count    3953.000000
mean        0.568664
std         0.544862
min         0.000000
25%         0.386338
50%         0.550236
75%         0.710369
max        31.113208
Name: LEVERAGE_SHORT, dtype: float64

count    91184.000000
mean         0.592268
std         15.881997
min         -1.548780
25%          0.324608
50%          0.476401
75%          0.654322
max       3390.000000
Name: LEVERAGE_SHORT, dtype: float64

count    3954.000000
mean        0.113571
std         0.181478
min        -0.042481
25%         0.000000
50%         0.009972
75%         0.171328
max         1.642065
Name: LEVERAGE_LONG, dtype: float64

count    91183.000000
mean         0.130688
std          0.413232
min         -0.636735
25%          0.000000
50%          0.024280
75%          0.209802
max         47.798479
Name: LEVERAGE_LONG, dtype: float64

count    3953.000000
mean        0.682264
std         0.545206
min         0.000000
25%         0.514799
50%         0.680286
75%         0.827275
max        31.113208
Name: LEVERAGE_TOTAL, dtype: float64

count    91183.000000
mean         0.722960
std         15.893056
min         -1.548780
25%          0.448402
50%          0.626178
75%          0.798417
max       3390.000000
Name: LEVERAGE_TOTAL, dtype: float64

In [None]:
"""
Firm size
"""
display(high_growth_df["SIZE_LOG_ASSETS"].describe())
display(non_high_growth_df["SIZE_LOG_ASSETS"].describe())
display(high_growth_df["SIZE_LOG_SALES"].describe())
display(non_high_growth_df["SIZE_LOG_SALES"].describe())

Firm size


count    3954.000000
mean        9.018550
std         1.163944
min         3.970292
25%         8.225971
50%         9.007857
75%         9.753943
max        14.915306
Name: SIZE_LOG_ASSETS, dtype: float64

count    91184.000000
mean         8.907804
std          1.107447
min          0.000000
25%          8.188133
50%          8.896862
75%          9.593355
max         17.388181
Name: SIZE_LOG_ASSETS, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    3954.000000
mean            -inf
std              NaN
min             -inf
25%         9.288851
50%         9.887536
75%        10.538363
max        14.877591
Name: SIZE_LOG_SALES, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    9.118300e+04
mean             -inf
std               NaN
min              -inf
25%      9.256699e+00
50%      9.743495e+00
75%      1.033251e+01
max      1.677761e+01
Name: SIZE_LOG_SALES, dtype: float64

In [88]:
"""
Asset tangibility
"""
display(high_growth_df["ASSET_TANGIBILITY"].describe())
display(non_high_growth_df["ASSET_TANGIBILITY"].describe())

count    3954.000000
mean        0.169221
std         0.228527
min         0.000000
25%         0.009585
50%         0.054953
75%         0.249964
max         0.970651
Name: ASSET_TANGIBILITY, dtype: float64

count    91183.000000
mean         0.221056
std          0.247280
min         -0.050699
25%          0.020932
50%          0.109855
75%          0.373882
max          4.521327
Name: ASSET_TANGIBILITY, dtype: float64

In [89]:
"""
liquidity
"""
display(high_growth_df["LIQUIDITY"].describe())
display(non_high_growth_df["LIQUIDITY"].describe())
display(high_growth_df["CURRENT_RATIO"].describe())
display(non_high_growth_df["CURRENT_RATIO"].describe())
display(high_growth_df["QUICK_RATIO"].describe())
display(non_high_growth_df["QUICK_RATIO"].describe())

count    3954.000000
mean        0.265680
std         0.229791
min        -0.020896
25%         0.063360
50%         0.219327
75%         0.418655
max         1.128364
Name: LIQUIDITY, dtype: float64

count    91180.000000
mean         0.249396
std          0.229492
min         -4.939394
25%          0.052165
50%          0.196163
75%          0.393600
max          8.175355
Name: LIQUIDITY, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    3953.000000
mean             inf
std              NaN
min         0.032141
25%         1.075175
50%         1.390253
75%         1.852775
max              inf
Name: CURRENT_RATIO, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    9.118000e+04
mean              inf
std               NaN
min     -1.170000e+02
25%      1.062398e+00
50%      1.451737e+00
75%      2.038955e+00
max               inf
Name: CURRENT_RATIO, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    3953.000000
mean             inf
std              NaN
min         0.031549
25%         0.962319
50%         1.303095
75%         1.741088
max              inf
Name: QUICK_RATIO, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    9.118000e+04
mean              inf
std               NaN
min     -1.170000e+02
25%      8.388281e-01
50%      1.245865e+00
75%      1.793570e+00
max               inf
Name: QUICK_RATIO, dtype: float64

In [90]:
"""
Profitability
"""
display(high_growth_df["ny_vinstprc"].describe())
display(non_high_growth_df["ny_vinstprc"].describe())

count    3949.000000
mean       -0.205603
std         8.996376
min      -509.500000
25%         0.016031
50%         0.051669
75%         0.103688
max         3.522124
Name: ny_vinstprc, dtype: float64

count    91045.000000
mean        -0.835448
std         94.945864
min     -18213.000000
25%          0.009574
50%          0.039355
75%          0.086472
max       2959.000000
Name: ny_vinstprc, dtype: float64

In [91]:
"""
Tax shields
"""
display(high_growth_df["DEBT_TAX_SHIELDS"].describe())
display(non_high_growth_df["DEBT_TAX_SHIELDS"].describe())
display(high_growth_df["NON_DEBT_TAX_SHIELDS"].describe())
display(non_high_growth_df["NON_DEBT_TAX_SHIELDS"].describe())

count    3954.000000
mean       -0.009746
std         0.029259
min        -0.932665
25%        -0.011207
50%        -0.003361
75%        -0.000633
max         0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    91184.000000
mean        -0.017525
std          1.196678
min       -255.000000
25%         -0.013609
50%         -0.004751
75%         -0.000807
max          0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    3954.000000
mean       -0.038624
std         0.060307
min        -2.184781
25%        -0.055460
50%        -0.016987
75%        -0.003870
max         0.000000
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

count    91184.000000
mean        -0.054140
std          0.838829
min       -175.000000
25%         -0.070087
50%         -0.027760
75%         -0.008415
max          0.023667
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

In [92]:
"""
Growth opportunities
"""
display(high_growth_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())
display(non_high_growth_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())
display(high_growth_df["GROWTH_OPPORTUNITIES_SALES"].describe())
display(non_high_growth_df["GROWTH_OPPORTUNITIES_SALES"].describe())

count    3300.000000
mean        0.328112
std         0.616845
min        -0.941886
25%         0.039103
50%         0.217969
75%         0.452785
max        14.164602
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

count    66187.000000
mean         0.094952
std          0.668693
min         -0.999795
25%         -0.054201
50%          0.037675
75%          0.163815
max        124.681818
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    3297.000000
mean             inf
std              NaN
min        -1.000000
25%         0.140787
50%         0.283160
75%         0.489406
max              inf
Name: GROWTH_OPPORTUNITIES_SALES, dtype: float64

  sqr = _ensure_numeric((avg - values) ** 2)


count    6.612500e+04
mean              inf
std               NaN
min     -1.000000e+00
25%     -2.594172e-02
50%      4.413629e-02
75%      1.483587e-01
max               inf
Name: GROWTH_OPPORTUNITIES_SALES, dtype: float64

In [93]:
"""
Age
"""
display(high_growth_df["AGE"].describe())
display(non_high_growth_df["AGE"].describe())

count    3954.000000
mean       45.833333
std         2.114432
min        42.000000
25%        44.000000
50%        46.000000
75%        48.000000
max        49.000000
Name: AGE, dtype: float64

count    91184.000000
mean        45.528744
std          2.299701
min         42.000000
25%         44.000000
50%         46.000000
75%         48.000000
max         49.000000
Name: AGE, dtype: float64

# Tests