In [1]:
import pandas as pd
import numpy as np

df = pd.read_stata("./serrano/serrano_2024_Stata/serrano.dta")

In [162]:
display(len(df))
display(df["ORGNR"].nunique())

15327519

1550192

# General Filters

In [163]:
"""
These filter should be thought through and written about in the methodology section.
We want to avoid cherry-picking the data, but we also want to avoid including data that is not relevant to the study.

We also want to avoid survuvalrship bias, i.e., we want to avoid including only the companies that have survived.
"""
# Maybe add something about negative equity, i.e., companies that are bankrupt or close to bankruptcy.
filtered_df = df[(df['ser_year'] >= 2011) & # 2012-2019 for final dataset, stable period with no major economic crises, here 2011 so we can get growth measures for 2012
                 (df['ser_year'] <= 2019) & # and stable interest rates, bnp growth, inflation
                 (df['ser_jurform'] == 49) & # aktiebolag
                 (df['ser_aktiv'] == 1) & # active companies
                 (df['ser_ftgkategori'] == 30) & # private companies, i.e., not state-owned etc.
                 (df['ser_stklf'] >= 2) & # companies with at least 10 employee for final dataset. Include smaller here to then drop
                 (df['knc_kncfall'] == 1) & # only include independet companies, i.e., not subsidiaries or parent companies THIS IS UP FOR DEBATE
                 (df['rr01_ntoms'] > 0) & # remove companies with no turnover
                 (df['ser_inregyr'] == 1)] # only include companies that are registered at SCB (just three observations are not)
# unsure about the last one, might remove to make the dataset larger.

In [164]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

310859

72117

In [165]:
# Sort the dataframe so ORGNR are together and sorted by year after that.
filtered_df = filtered_df.sort_values(by=["ORGNR", "ser_year"])

In [166]:
filtered_df["ser_year"].value_counts()

ser_year
2011.0    35012
2018.0    34734
2016.0    34699
2012.0    34672
2017.0    34580
2019.0    34415
2013.0    34368
2015.0    34352
2014.0    34027
Name: count, dtype: int64

In [68]:
"""
Only include companies that fit all criteria in order, no year should be missing from the series.

Maybe do this.
"""

'\nOnly include companies that fit all criteria in order, no year should be missing from the series.\n\nMaybe do this.\n'

# Growth Variable

In [167]:
"""
Adding a variable for growth.

This is according to OECD's definition of high-growth firms,
which is defined as firms with an average annualized growth greater than 20%
per annum over a three-year period, and with ten or more employees at the beginning of the period.
"""

# Calculate the annual growth rate for each company and add it as a new column
filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()

# Identify high-growth periods and create a new binary column
filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
    lambda x: ((x['TURNOVER_GROWTH'] > 0.20).rolling(window=3).sum() == 3)
).reset_index(level=0, drop=True).astype(int)



  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(


In [168]:
"""
Asset growth here. It is a dependet variable in the model but needs to be defined
here before we delete the observations below to reduce the amount of NaNs
"""

# % change in total assets
filtered_df['GROWTH_OPPORTUNITIES_ASSETS'] = filtered_df.groupby('ORGNR')['br09_tillgsu'].pct_change()

  filtered_df['GROWTH_OPPORTUNITIES_ASSETS'] = filtered_df.groupby('ORGNR')['br09_tillgsu'].pct_change()


In [169]:
"""
Drop 2011 observaitons. It was only included to calculate the growth rate for 2012.

Drop companies w/ fewer than 10 employees. They were included to calculate the growth for companies that are included in the final dataset.
"""
filtered_df = filtered_df[filtered_df["ser_year"] != 2011]
filtered_df = filtered_df[filtered_df["ser_stklf"] != 2]

In [175]:
# drop companies w/ growth rates that are NaN
filtered_df = filtered_df.dropna(subset=["TURNOVER_GROWTH"])

# drop companies w/ growth rates that are exactly 0. For now I don't know how to handle these or where they stem from.
# Since we only include compaines with positive turnover, this should not be a problem.
filtered_df = filtered_df[filtered_df["TURNOVER_GROWTH"] != 0]

# Drop companies with NaN in the growth rate for assets
filtered_df = filtered_df.dropna(subset=["GROWTH_OPPORTUNITIES_ASSETS"])

In [177]:
filtered_df["HIGH_GROWTH"].value_counts()

HIGH_GROWTH
0    87113
1     2738
Name: count, dtype: int64

# Industry Variable

In [178]:
"""
There are a lot of columns in the dataset that is about the comapnies industry, a lot of different SNI codes.
But we've decided to use the 'bransch_borsbransch_konv' column as the industry variable. Which is a conversion of the SNI codes
to fewer branches.

Some companies have changed industry over time, so we determine the most frequent industry for each company and add it as a new column
to each row assicated with that company.
"""

# add one industry to all companies.
dict_of_industries = {
    10: 'Energy & Environment',
    15: 'Materials',
    20: 'Industrial goods',
    22: 'Construction industry',
    25: 'Shopping goods',
    30: 'Convenience goods',
    35: 'Health & Education',
    40: 'Finance & Real estate',
    45: 'IT & Electronics',
    50: 'Telecom & Media',
    60: 'Corporate services',
    98: 'Other',
    99: 'SNI07 missing'
}

# Determine the most frequent 'bransch_borsbransch_konv' value for each company
most_frequent_industry = filtered_df.groupby('ORGNR')['bransch_borsbransch_konv'].agg(lambda x: x.mode()[0])

# Map the most frequent 'bransch_borsbransch_konv' value to the corresponding industry name
most_frequent_industry = most_frequent_industry.map(dict_of_industries)

# Add the new 'INDUSTRY' column to the DataFrame
filtered_df = filtered_df.merge(most_frequent_industry.rename('INDUSTRY'), on='ORGNR')

# Filter out rows where the 'INDUSTRY' column is 'SNI07 missing' or 'Other'
filtered_df = filtered_df[~filtered_df['INDUSTRY'].isin(['SNI07 missing', 'Other'])]

# Variables

In [182]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

88322

23593

In [183]:
"""
Some more filtering to remove weird values.

Negative assets, negative debt etc.

It removes about 150 observations.
"""

# drop observations with no assets
filtered_df = filtered_df[filtered_df["br09_tillgsu"] > 0]

# drop observations with negative tangible assets
filtered_df = filtered_df[filtered_df["br02_matanlsu"] >= 0]

# drop observations with negative current assets
filtered_df = filtered_df[filtered_df["br08_omstgsu"] > 0]

# drop observations with negative inventories
filtered_df = filtered_df[filtered_df["br06c_lagersu"] >= 0]

# drop observations with positive depreciation
filtered_df = filtered_df[filtered_df["rr05_avskriv"] <= 0]

# drop one weird outlier when it comes to interest expenses
filtered_df = filtered_df[filtered_df["rr09_finkostn"] > -200000]

# drop observations with negative long-term debt
filtered_df = filtered_df[filtered_df["br15_lsksu"] >= 0]

# drop observations with negative short-term debt
filtered_df = filtered_df[filtered_df["br13_ksksu"] > 0]

In [184]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

88322

23593

## Independent

In [185]:
"""
Firm size

Some articles use the log of sales, others the log of assets. We'll add both to the dataset and see which one works best.
"""

filtered_df["SIZE_LOG_ASSETS"] = np.log(filtered_df["br09_tillgsu"])
filtered_df["SIZE_LOG_SALES"] = np.log(filtered_df["rr01_ntoms"])

In [186]:
"""
Asset tangibility

Most common thing to add as a proxy is the ratio of tangible fixed assets to total assets.
Maybe include intangible assets as well, are such assets (patents, brands etc) used as collateral?

There are quite a lot of companies with 0 tangible assets (8000 observations), which is a bit strange. Maybe we should remove those.
"""

# tangible assets / total assets
filtered_df["ASSET_TANGIBILITY"] = filtered_df["br02_matanlsu"] / filtered_df["br09_tillgsu"]

In [187]:
"""
liquidity on debt

Cole use cash and cash equivalents to total assets, but we could also use current ratio or quick ratio.
"""

# Liquid assets (cash and market securities) to total assets
filtered_df["LIQUIDITY"] = filtered_df["br07_kplackaba"] / filtered_df["br09_tillgsu"]

# Current ratio: current assets divided by current liabilities
filtered_df["CURRENT_RATIO"] = filtered_df["br08_omstgsu"] / filtered_df["br13_ksksu"]

# Quick ratio: (current assets - inventory) divided by current liabilities
filtered_df["QUICK_RATIO"] = (filtered_df["br08_omstgsu"] - filtered_df["br06c_lagersu"]) / filtered_df["br13_ksksu"]


In [188]:
"""
Profitability

There are many measures for this, RoA, RoE, operating margin, net margin etc.
"""

# Profit margin according to serrano is already defined as 'ny_vinstprc'

filtered_df['ny_vinstprc'].value_counts(dropna=False)

ny_vinstprc
0.000000    63
0.062500     6
0.052632     4
0.061224     4
0.064516     4
            ..
0.046418     1
0.023444     1
0.049516     1
0.009657     1
0.022095     1
Name: count, Length: 87966, dtype: int64

In [189]:
"""
Tax shields

For now I only find definition in Vanacker & Manigart (2010).
"""

# Debt tax shields: Interest expenses / Total Assets
filtered_df['DEBT_TAX_SHIELDS'] = filtered_df['rr09_finkostn'] / filtered_df['br09_tillgsu']

# Non-debt tax shields: Depreciation and amortization / Total Assets
filtered_df['NON_DEBT_TAX_SHIELDS'] = filtered_df['rr05_avskriv'] / filtered_df['br09_tillgsu']

In [125]:
"""
Tax

Do we mean the corporate tax in Sweden or the % tax the companies actually pay?
"""

'\nTax\n\nDo we mean the corporate tax in Sweden or the % tax the companies actually pay?\n'

In [None]:
"""
Growth opportunities

Some have change in sales too. But that gets weird for us since that is what we use to determine HGFs.

This is defined further up in the code. Before filtering stuff needed to calculate it.
"""


'\nGrowth opportunities\n\nSome have change in sales too. But that gets weird for us since that is what we use to determine HGFs.\n'

In [192]:
"""
Age

Need to discuss this since age could be year incorporated vs todays date, or 2019, or for every year of observation
for one company.
"""

# Change reg date to only year
filtered_df["ser_regdat"] = pd.to_datetime(filtered_df["ser_regdat"])
filtered_df["ser_regdat"] = filtered_df["ser_regdat"].dt.year

# Age of the company (year of observation - year of registration)
filtered_df["AGE"] = filtered_df["ser_year"] - filtered_df["ser_regdat"]

## Dependent

In [193]:
"""
Leverage
"""

# Total debt to total assets
filtered_df['LEVERAGE_TOTAL'] = (filtered_df['br15_lsksu'] + filtered_df['br13_ksksu']) / filtered_df['br09_tillgsu']

# Long-term debt to total assets (all non-current liabilities, could be other things)
filtered_df['LEVERAGE_LONG'] = filtered_df['br15_lsksu'] / filtered_df['br09_tillgsu']

# Short-term debt to total assets (all current liabilities, could be other things)
filtered_df['LEVERAGE_SHORT'] = filtered_df['br13_ksksu'] / filtered_df['br09_tillgsu']

# More filters

In [196]:
display(len(filtered_df))
display(len(filtered_df["ORGNR"].unique()))

88322

23593

In [197]:
# only keep companies that have at least 3 years of data
filtered_df = filtered_df.groupby("ORGNR").filter(lambda x: len(x) >= 3)

In [198]:
display(len(filtered_df))
display(len(filtered_df["ORGNR"].unique()))

73977

13535

# Misc.

In [97]:
# save to a excel file
filtered_df.head(10000).to_excel("filtered_df.xlsx", index=False)

# cheking stuff
#filtered_df["br15_lsksu"].value_counts(dropna=False)
# see if any company have negative br09_tillgsu
#filtered_df[filtered_df["br15_lsksu"] < 0]
#filtered_df["rr09_finkostn"].describe()

# Hausmann test

In [199]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())
display(filtered_df.head(10))

73977

13535

Unnamed: 0,ORGNR,ser_jurform,ser_year,ser_pnr,bransch_sni1,ser_aas,ser_inregyr,bransch_sni2,bransch_sni3,bransch_sni4,...,ASSET_TANGIBILITY,LIQUIDITY,CURRENT_RATIO,QUICK_RATIO,DEBT_TAX_SHIELDS,NON_DEBT_TAX_SHIELDS,AGE,LEVERAGE_TOTAL,LEVERAGE_LONG,LEVERAGE_SHORT
0,5560012000.0,49.0,2012.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.567698,0.26315,1.88865,1.508854,-5.3e-05,-0.033968,145.0,0.197813,0.0,0.197813
1,5560012000.0,49.0,2013.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.554335,0.269109,2.054694,1.592003,-2.5e-05,-0.0356,146.0,0.189206,0.0,0.189206
2,5560012000.0,49.0,2014.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.535492,0.352897,2.421039,2.003559,-0.0006,-0.036429,147.0,0.189541,0.0,0.189541
3,5560012000.0,49.0,2015.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.686091,0.199553,1.476867,1.154143,-4.8e-05,-0.035296,148.0,0.209233,0.0,0.209233
4,5560012000.0,49.0,2016.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.668372,0.227705,1.706054,1.357396,-0.000121,-0.047258,149.0,0.191455,0.0,0.191455
5,5560012000.0,49.0,2017.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.638896,0.232426,1.583497,1.250087,-4.9e-05,-0.048297,150.0,0.210988,0.0,0.210988
6,5560012000.0,49.0,2018.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.605343,0.262756,1.667672,1.351653,-4.9e-05,-0.048278,151.0,0.217614,0.0,0.217614
7,5560012000.0,49.0,2019.0,81622.0,52112.0,1.0,1.0,52421.0,,,...,0.556849,0.281147,1.754805,1.403247,-7.2e-05,-0.047247,152.0,0.218142,0.0,0.218142
8,5560012000.0,49.0,2012.0,73480.0,63220.0,1.0,1.0,,,,...,0.878116,0.004208,0.295818,0.178967,-0.030916,-0.034801,119.0,0.949498,0.686306,0.263192
9,5560012000.0,49.0,2013.0,73421.0,63220.0,1.0,1.0,,,,...,0.889958,0.001924,0.277304,0.192099,-0.01767,-0.036914,120.0,0.937894,0.712036,0.225857


In [244]:
hausmann_df = filtered_df.set_index(["ORGNR", "ser_year"])

In [245]:
hausmann_df = filtered_df.set_index(["ORGNR", "ser_year"])

y = hausmann_df["LEVERAGE_TOTAL"]
X = hausmann_df[["SIZE_LOG_ASSETS", "ASSET_TANGIBILITY", "LIQUIDITY", "ny_vinstprc", "DEBT_TAX_SHIELDS", "NON_DEBT_TAX_SHIELDS", "GROWTH_OPPORTUNITIES_ASSETS", "AGE"]]

In [246]:
y.isna().sum()
X.isna().sum()

SIZE_LOG_ASSETS                0
ASSET_TANGIBILITY              0
LIQUIDITY                      0
ny_vinstprc                    0
DEBT_TAX_SHIELDS               0
NON_DEBT_TAX_SHIELDS           0
GROWTH_OPPORTUNITIES_ASSETS    0
AGE                            0
dtype: int64

In [217]:
import statsmodels.api as sm
from linearmodels import PanelOLS, RandomEffects
from linearmodels.panel.results import compare
from scipy import stats

In [228]:
"""
Model Comparison
"""

# Add a constant term
X = sm.add_constant(X)

# Run fixed effects model
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

# Run random effects model
re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Compare the models
comparison = compare({'random': re_results, 'fixed': fe_results})
print(comparison)

                           Model Comparison                          
                                            random              fixed
---------------------------------------------------------------------
Dep. Variable                       LEVERAGE_TOTAL     LEVERAGE_TOTAL
Estimator                            RandomEffects           PanelOLS
No. Observations                             73977              73977
Cov. Est.                               Unadjusted         Unadjusted
R-squared                                   0.9986             0.9986
R-Squared (Within)                          0.9986             0.9986
R-Squared (Between)                         0.9984             0.9980
R-Squared (Overall)                         0.9982             0.9979
F-statistic                              6.403e+06          5.494e+06
P-value (F-stat)                            0.0000             0.0000
const                                       1.0012             0.9227
                    

In [229]:
def hausman_test(fe_results, re_results):
    """
    Manual implementation of Hausman test for panel data
    """
    # Get coefficients (excluding constant)
    b_fe = fe_results.params[1:]
    b_re = re_results.params[1:]
    
    # Get covariance matrices (excluding constant)
    var_fe = fe_results.cov.iloc[1:, 1:]
    var_re = re_results.cov.iloc[1:, 1:]
    
    # Calculate difference
    diff = b_fe - b_re
    
    # Calculate variance of difference
    var_diff = var_fe - var_re
    
    # Calculate test statistic
    stat = diff.T @ np.linalg.inv(var_diff) @ diff
    
    # Degrees of freedom = number of parameters being tested
    df = len(b_fe)
    
    # Calculate p-value
    p_value = 1 - stats.chi2.cdf(stat, df)
    
    return {
        'stat': stat,
        'df': df,
        'p-value': p_value
    }

In [None]:
"""
Hausman Test w/o industry dummies TOTAL LEVERAGE
"""

fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 560.8820
Degrees of freedom: 8
P-value: 0.0000


In [247]:
"""
Hausman Test w/o industry dummies LONG TERM LEVERAGE
"""

y = hausmann_df["LEVERAGE_LONG"]

# After running your models:
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 236.8650
Degrees of freedom: 7
P-value: 0.0000


In [248]:
"""
Hausman Test w/o industry dummies SHORT TERM LEVERAGE
"""

y = hausmann_df["LEVERAGE_SHORT"]

# After running your models:
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 699.5164
Degrees of freedom: 7
P-value: 0.0000


# Split the dataset in HGF vs non-HGF

In [232]:
# Identify HGF ORGNR
high_growth_ids = filtered_df[filtered_df["HIGH_GROWTH"] == 1]["ORGNR"].unique()

# Create a new DataFrame with only high-growth firms
HGF_df = filtered_df[filtered_df["ORGNR"].isin(high_growth_ids)]

# Create a new DataFrame with only non-high-growth firms
non_HGF_df = filtered_df[~filtered_df["ORGNR"].isin(high_growth_ids)]

In [233]:
display(len(HGF_df), len(non_HGF_df))
display(HGF_df["ORGNR"].nunique(), non_HGF_df["ORGNR"].nunique())

7573

66404

1543

11992

# Descriptive Data

In [249]:
"""
Industry
"""
# Get one row per company (HGF)
unique_HGFs = HGF_df.drop_duplicates(subset=["ORGNR"])

# Get one row per company (non_HGF)
unique_non_HGFs = non_HGF_df.drop_duplicates(subset=["ORGNR"])

display(unique_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)
display(unique_non_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)

INDUSTRY
Corporate services       28.645496
Construction industry    25.858717
Shopping goods           13.998704
Health & Education       12.248866
IT & Electronics          7.453014
Industrial goods          4.601426
Convenience goods         2.851588
Finance & Real estate     2.203500
Materials                 1.101750
Telecom & Media           0.648088
Energy & Environment      0.388853
Name: proportion, dtype: float64

INDUSTRY
Corporate services       23.073716
Shopping goods           21.856237
Construction industry    20.630420
Industrial goods          9.614743
Convenience goods         9.281187
Health & Education        8.255504
IT & Electronics          2.468312
Materials                 1.967979
Finance & Real estate     1.576051
Telecom & Media           0.675450
Energy & Environment      0.600400
Name: proportion, dtype: float64

In [250]:
"""
Leverage
"""
display(HGF_df["LEVERAGE_SHORT"].describe())
display(non_HGF_df["LEVERAGE_SHORT"].describe())
display(HGF_df["LEVERAGE_LONG"].describe())
display(non_HGF_df["LEVERAGE_LONG"].describe())
display(HGF_df["LEVERAGE_TOTAL"].describe())
display(non_HGF_df["LEVERAGE_TOTAL"].describe())

count    7573.000000
mean        0.562987
std         0.429717
min         0.023494
25%         0.390278
50%         0.541679
75%         0.701504
max        31.113208
Name: LEVERAGE_SHORT, dtype: float64

count    66404.000000
mean         0.540321
std         13.158724
min          0.006915
25%          0.310723
50%          0.454412
75%          0.624196
max       3390.000000
Name: LEVERAGE_SHORT, dtype: float64

count    7573.000000
mean        0.111263
std         0.171996
min         0.000000
25%         0.000000
50%         0.015308
75%         0.173780
max         2.289017
Name: LEVERAGE_LONG, dtype: float64

count    66404.000000
mean         0.127206
std          0.273006
min          0.000000
25%          0.000000
50%          0.027056
75%          0.212255
max         24.758333
Name: LEVERAGE_LONG, dtype: float64

count    7573.000000
mean        0.674250
std         0.434519
min         0.046232
25%         0.504898
50%         0.674433
75%         0.822850
max        31.113208
Name: LEVERAGE_TOTAL, dtype: float64

count    66404.000000
mean         0.667527
std         13.163028
min          0.009867
25%          0.432203
50%          0.601027
75%          0.768646
max       3390.000000
Name: LEVERAGE_TOTAL, dtype: float64

In [236]:
"""
Firm size
"""
display(HGF_df["SIZE_LOG_ASSETS"].describe())
display(non_HGF_df["SIZE_LOG_ASSETS"].describe())
display(HGF_df["SIZE_LOG_SALES"].describe())
display(non_HGF_df["SIZE_LOG_SALES"].describe())

count    7573.000000
mean        8.926602
std         1.043498
min         3.970292
25%         8.212026
50%         8.876824
75%         9.573524
max        14.915306
Name: SIZE_LOG_ASSETS, dtype: float64

count    66404.000000
mean         9.023488
std          1.056268
min          0.000000
25%          8.322880
50%          9.009814
75%          9.675520
max         16.914990
Name: SIZE_LOG_ASSETS, dtype: float64

count    7573.000000
mean        9.891820
std         0.876005
min         3.401197
25%         9.340579
50%         9.828872
75%        10.397848
max        14.877591
Name: SIZE_LOG_SALES, dtype: float64

count    66404.000000
mean         9.936591
std          0.857748
min          0.000000
25%          9.372034
50%          9.828791
75%         10.414581
max         15.618990
Name: SIZE_LOG_SALES, dtype: float64

In [237]:
"""
Asset tangibility
"""
display(HGF_df["ASSET_TANGIBILITY"].describe())
display(non_HGF_df["ASSET_TANGIBILITY"].describe())

count    7573.000000
mean        0.177103
std         0.229440
min         0.000000
25%         0.011557
50%         0.062571
75%         0.271989
max         0.965418
Name: ASSET_TANGIBILITY, dtype: float64

count    66404.000000
mean         0.233076
std          0.250085
min          0.000000
25%          0.024988
50%          0.125544
75%          0.398043
max          0.996224
Name: ASSET_TANGIBILITY, dtype: float64

In [238]:
"""
liquidity
"""
display(HGF_df["LIQUIDITY"].describe())
display(non_HGF_df["LIQUIDITY"].describe())
display(HGF_df["CURRENT_RATIO"].describe())
display(non_HGF_df["CURRENT_RATIO"].describe())
display(HGF_df["QUICK_RATIO"].describe())
display(non_HGF_df["QUICK_RATIO"].describe())

count    7573.000000
mean        0.258323
std         0.225511
min        -0.069223
25%         0.060870
50%         0.209459
75%         0.411429
max         1.128364
Name: LIQUIDITY, dtype: float64

count    66404.000000
mean         0.250880
std          0.224304
min         -0.408312
25%          0.055579
50%          0.200361
75%          0.394517
max          1.003517
Name: LIQUIDITY, dtype: float64

count    7573.000000
mean        1.614017
std         1.043885
min         0.032141
25%         1.068923
50%         1.391516
75%         1.874849
max        19.233420
Name: CURRENT_RATIO, dtype: float64

count    66404.000000
mean         1.816853
std          1.779467
min          0.000295
25%          1.089659
50%          1.493739
75%          2.087352
max        142.170569
Name: CURRENT_RATIO, dtype: float64

count    7573.000000
mean        1.493734
std         1.038730
min         0.031549
25%         0.943405
50%         1.295042
75%         1.765012
max        19.233420
Name: QUICK_RATIO, dtype: float64

count    66404.000000
mean         1.528183
std          1.418701
min         -2.636364
25%          0.842592
50%          1.263855
75%          1.818007
max         75.191919
Name: QUICK_RATIO, dtype: float64

In [239]:
"""
Profitability
"""
display(HGF_df["ny_vinstprc"].describe())
display(non_HGF_df["ny_vinstprc"].describe())

count    7573.000000
mean       -0.107102
std         6.597640
min      -509.500000
25%         0.015355
50%         0.049389
75%         0.103736
max        12.133333
Name: ny_vinstprc, dtype: float64

count    66404.000000
mean         0.062392
std         12.603442
min      -1284.750000
25%          0.011747
50%          0.039991
75%          0.084316
max       2959.000000
Name: ny_vinstprc, dtype: float64

In [240]:
"""
Tax shields
"""
display(HGF_df["DEBT_TAX_SHIELDS"].describe())
display(non_HGF_df["DEBT_TAX_SHIELDS"].describe())
display(HGF_df["NON_DEBT_TAX_SHIELDS"].describe())
display(non_HGF_df["NON_DEBT_TAX_SHIELDS"].describe())

count    7573.000000
mean       -0.010304
std         0.029739
min        -0.932665
25%        -0.011986
50%        -0.004143
75%        -0.000752
max         0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    66404.000000
mean        -0.014617
std          0.990007
min       -255.000000
25%         -0.013329
50%         -0.004789
75%         -0.000817
max          0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    7573.000000
mean       -0.042112
std         0.060978
min        -2.184781
25%        -0.060248
50%        -0.020652
75%        -0.005390
max         0.000000
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

count    66404.000000
mean        -0.054373
std          0.681823
min       -175.000000
25%         -0.074169
50%         -0.030294
75%         -0.009915
max          0.000000
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

In [241]:
"""
Growth opportunities
"""
display(HGF_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())
display(non_HGF_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())

count    7573.000000
mean        0.343416
std         0.663544
min        -0.941886
25%         0.033670
50%         0.219330
75%         0.467949
max        16.844920
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

count    66404.000000
mean         0.087891
std          0.383707
min         -0.999795
25%         -0.052287
50%          0.040334
75%          0.159217
max         43.918590
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

In [242]:
"""
Age
"""
display(HGF_df["AGE"].describe())
display(non_HGF_df["AGE"].describe())

count    7573.000000
mean        9.406576
std         8.258079
min         1.000000
25%         4.000000
50%         7.000000
75%        11.000000
max        59.000000
Name: AGE, dtype: float64

count    66404.000000
mean        18.835959
std         14.217480
min          1.000000
25%          8.000000
50%         15.000000
75%         26.000000
max        152.000000
Name: AGE, dtype: float64

# Tests

In [254]:
# Define your dependent variables and controls
dependent_vars = ['LEVERAGE_TOTAL', 'LEVERAGE_LONG', 'LEVERAGE_SHORT']
control_vars = ["SIZE_LOG_ASSETS", "SIZE_LOG_SALES", "ASSET_TANGIBILITY", "GROWTH_OPPORTUNITIES_ASSETS", "LIQUIDITY", "QUICK_RATIO", "CURRENT_RATIO", 
               "ny_vinstprc", "DEBT_TAX_SHIELDS", "NON_DEBT_TAX_SHIELDS", "AGE"]

# Set up for panel data analysis (repeat for both samples)
def run_panel_regressions(data, sample_name):
    """Run fixed effects regressions for all dependent variables on a given sample"""
    
    print(f"\n=== {sample_name} Companies Results ===\n")
    
    # Set panel data index
    panel_data = data.set_index(["ORGNR", "ser_year"])
    
    # Add constant
    X = panel_data[control_vars]
    X = sm.add_constant(X)
    
    # Run regressions for each dependent variable
    for dep_var in dependent_vars:
        y = panel_data[dep_var]
        
        # Run fixed effects model (entity effects only, no time effects)
        fe_model = PanelOLS(y, X, entity_effects=True)
        fe_results = fe_model.fit()
        
        print(f"\nResults for {dep_var}:")
        print(fe_results.summary.tables[1])  # Print coefficients table
        print(f"R-squared: {fe_results.rsquared:.4f}")
        print(f"Within R-squared: {fe_results.rsquared_within:.4f}")

# Run for both samples
run_panel_regressions(HGF_df, "High Growth")
run_panel_regressions(non_HGF_df, "Non-High Growth")

# For formal statistical comparison between groups, you can use a Chow test approach
# This tests if coefficients differ between sub-samples
def chow_test_approach(full_df, high_growth_df, low_growth_df, dep_var):
    """Implement a version of Chow test logic for panel data"""
    
    # Set panel data indices
    full_panel = full_df.set_index(["ORGNR", "ser_year"])
    high_panel = high_growth_df.set_index(["ORGNR", "ser_year"])
    low_panel = low_growth_df.set_index(["ORGNR", "ser_year"])
    
    # Add constant
    X_full = sm.add_constant(full_panel[control_vars])
    X_high = sm.add_constant(high_panel[control_vars])
    X_low = sm.add_constant(low_panel[control_vars])
    
    # Run models
    y_full = full_panel[dep_var]
    y_high = high_panel[dep_var]
    y_low = low_panel[dep_var]
    
    full_model = PanelOLS(y_full, X_full, entity_effects=True)
    high_model = PanelOLS(y_high, X_high, entity_effects=True)
    low_model = PanelOLS(y_low, X_low, entity_effects=True)
    
    full_results = full_model.fit()
    high_results = high_model.fit()
    low_results = low_model.fit()
    
    # Calculate sum of squared residuals
    rss_full = full_results.resids.values.T @ full_results.resids.values
    rss_high = high_results.resids.values.T @ high_results.resids.values
    rss_low = low_results.resids.values.T @ low_results.resids.values
    
    # Calculate F-statistic
    n_full = len(y_full)
    n_params = len(control_vars) + 1  # +1 for constant
    f_stat = ((rss_full - (rss_high + rss_low)) / n_params) / ((rss_high + rss_low) / (n_full - 2 * n_params))
    
    # Print results
    print(f"\nCoefficient Difference Test for {dep_var}:")
    print(f"F-statistic: {f_stat:.4f}")
    # Calculate p-value using F distribution with (k, n-2k) degrees of freedom
    from scipy import stats
    p_value = 1 - stats.f.cdf(f_stat, n_params, n_full - 2 * n_params)
    print(f"P-value: {p_value:.4f}")
    print(f"{'Coefficients differ significantly between groups' if p_value < 0.05 else 'No significant difference in coefficients'}")

# Run comparison test for each dependent variable
for dep_var in dependent_vars:
    chow_test_approach(filtered_df, HGF_df, non_HGF_df, dep_var)



=== High Growth Companies Results ===


Results for LEVERAGE_TOTAL:
                                      Parameter Estimates                                      
                             Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-----------------------------------------------------------------------------------------------
const                           1.6423     0.1272     12.914     0.0000      1.3930      1.8916
SIZE_LOG_ASSETS                -0.3257     0.0201    -16.210     0.0000     -0.3651     -0.2863
SIZE_LOG_SALES                  0.1797     0.0203     8.8678     0.0000      0.1400      0.2194
ASSET_TANGIBILITY               0.2112     0.0641     3.2961     0.0010      0.0856      0.3368
GROWTH_OPPORTUNITIES_ASSETS     0.0337     0.0077     4.3758     0.0000      0.0186      0.0488
LIQUIDITY                      -0.2058     0.0415    -4.9557     0.0000     -0.2872     -0.1244
QUICK_RATIO                    -0.0034     0.0422    -0.0815     0.