In [260]:
import pandas as pd
import numpy as np

In [261]:
df = pd.read_stata("./serrano/serrano_2024_Stata/serrano.dta")

In [262]:
display(len(df))
display(df["ORGNR"].nunique())

15327519

1550192

# General Filters

In [263]:
"""
These filter should be thought through and written about in the methodology section.
We want to avoid cherry-picking the data, but we also want to avoid including data that is not relevant to the study.

We also want to avoid survuvalrship bias, i.e., we want to avoid including only the companies that have survived.
"""
# Maybe add something about negative equity, i.e., companies that are bankrupt or close to bankruptcy.
filtered_df = df[(df['ser_year'] >= 2011) & # 2012-2019 for final dataset, stable period with no major economic crises, here 2011 so we can get growth measures for 2012
                 (df['ser_year'] <= 2019) & # and stable interest rates, bnp growth, inflation
                 (df['ser_jurform'] == 49) & # aktiebolag
                 (df['ser_aktiv'] == 1) & # active companies
                 (df['ser_ftgkategori'] == 30) & # private companies, i.e., not state-owned etc.
                 (df['ser_stklf'] >= 3) & # companies with at least 10 employee.
                 (df['knc_kncfall'] == 1) & # only include independet companies, i.e., not subsidiaries or parent companies THIS IS UP FOR DEBATE
                 (df['rr01_ntoms'] > 0) & # remove companies with no turnover
                 (df['ser_inregyr'] == 1)] # only include companies that are registered at SCB (just three observations are not)

In [264]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

108627

27822

In [265]:
# Sort the dataframe so ORGNR are together and sorted by year after that.
filtered_df = filtered_df.sort_values(by=["ORGNR", "ser_year"])

In [266]:
filtered_df["ser_year"].value_counts()

ser_year
2018.0    12475
2017.0    12317
2016.0    12268
2019.0    12241
2011.0    12128
2012.0    11967
2015.0    11874
2013.0    11732
2014.0    11625
Name: count, dtype: int64

In [68]:
"""
Only include companies that fit all criteria in order, no year should be missing from the series.

Maybe do this.
"""

'\nOnly include companies that fit all criteria in order, no year should be missing from the series.\n\nMaybe do this.\n'

# Growth Variable

In [267]:
"""
Adding a variable for growth.

This is according to OECD's definition of high-growth firms,
which is defined as firms with an average annualized growth greater than 20%
per annum over a three-year period, and with ten or more employees at the beginning of the period.
"""

# Calculate the annual growth rate for each company and add it as a new column
filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()

# Identify high-growth periods and create a new binary column
filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
    lambda x: ((x['TURNOVER_GROWTH'] > 0.20).rolling(window=3).sum() == 3)
).reset_index(level=0, drop=True).astype(int)



  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(


In [268]:
"""
Asset growth here. It is a dependet variable in the model but needs to be defined
here before we delete the observations below to reduce the amount of NaNs
"""

# % change in total assets
filtered_df['GROWTH_OPPORTUNITIES_ASSETS'] = filtered_df.groupby('ORGNR')['br09_tillgsu'].pct_change()

  filtered_df['GROWTH_OPPORTUNITIES_ASSETS'] = filtered_df.groupby('ORGNR')['br09_tillgsu'].pct_change()


In [269]:
"""
Drop 2011 observaitons. It was only included to calculate the growth rate for 2012.
"""
filtered_df = filtered_df[filtered_df["ser_year"] != 2011]

In [270]:
# drop companies w/ growth rates that are NaN
filtered_df = filtered_df.dropna(subset=["TURNOVER_GROWTH"])

# drop companies w/ growth rates that are exactly 0. For now I don't know how to handle these or where they stem from.
# Since we only include compaines with positive turnover, this should not be a problem.
filtered_df = filtered_df[filtered_df["TURNOVER_GROWTH"] != 0]

# Drop companies with NaN in the growth rate for assets, it is only four observations.
filtered_df = filtered_df.dropna(subset=["GROWTH_OPPORTUNITIES_ASSETS"])

In [271]:
filtered_df["HIGH_GROWTH"].value_counts()

HIGH_GROWTH
0    79061
1     1037
Name: count, dtype: int64

# Industry Variable

In [272]:
"""
There are a lot of columns in the dataset that is about the comapnies industry, a lot of different SNI codes.
But we've decided to use the 'bransch_borsbransch_konv' column as the industry variable. Which is a conversion of the SNI codes
to fewer branches.

Some companies have changed industry over time, so we determine the most frequent industry for each company and add it as a new column
to each row assicated with that company.
"""

# add one industry to all companies.
dict_of_industries = {
    10: 'Energy & Environment',
    15: 'Materials',
    20: 'Industrial goods',
    22: 'Construction industry',
    25: 'Shopping goods',
    30: 'Convenience goods',
    35: 'Health & Education',
    40: 'Finance & Real estate',
    45: 'IT & Electronics',
    50: 'Telecom & Media',
    60: 'Corporate services',
    98: 'Other',
    99: 'SNI07 missing'
}

# Determine the most frequent 'bransch_borsbransch_konv' value for each company
most_frequent_industry = filtered_df.groupby('ORGNR')['bransch_borsbransch_konv'].agg(lambda x: x.mode()[0])

# Map the most frequent 'bransch_borsbransch_konv' value to the corresponding industry name
most_frequent_industry = most_frequent_industry.map(dict_of_industries)

# Add the new 'INDUSTRY' column to the DataFrame
filtered_df = filtered_df.merge(most_frequent_industry.rename('INDUSTRY'), on='ORGNR')

# Filter out rows where the 'INDUSTRY' column is 'SNI07 missing' or 'Other'
filtered_df = filtered_df[~filtered_df['INDUSTRY'].isin(['SNI07 missing', 'Other'])]

# Variables

In [273]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

78844

20663

In [274]:
"""
Some more filtering to remove weird values.

Negative assets, negative debt etc.

It removes about 150 observations.
"""

# drop observations with no assets
filtered_df = filtered_df[filtered_df["br09_tillgsu"] > 0]

# drop observations with negative tangible assets
filtered_df = filtered_df[filtered_df["br02_matanlsu"] >= 0]

# drop observations with negative current assets
filtered_df = filtered_df[filtered_df["br08_omstgsu"] > 0]

# drop observations with negative inventories
filtered_df = filtered_df[filtered_df["br06c_lagersu"] >= 0]

# drop observations with positive depreciation
filtered_df = filtered_df[filtered_df["rr05_avskriv"] <= 0]

# drop one weird outlier when it comes to interest expenses
filtered_df = filtered_df[filtered_df["rr09_finkostn"] > -200000]

# drop observations with negative long-term debt
filtered_df = filtered_df[filtered_df["br15_lsksu"] >= 0]

# drop observations with negative short-term debt
filtered_df = filtered_df[filtered_df["br13_ksksu"] > 0]

In [275]:
display(len(filtered_df))
display(filtered_df["ORGNR"].nunique())

78741

20645

## Independent

In [276]:
"""
Firm size

Some articles use the log of sales, others the log of assets. We'll add both to the dataset and see which one works best.
"""

filtered_df["SIZE_LOG_ASSETS"] = np.log(filtered_df["br09_tillgsu"])
filtered_df["SIZE_LOG_SALES"] = np.log(filtered_df["rr01_ntoms"])

In [277]:
"""
Asset tangibility

Most common thing to add as a proxy is the ratio of tangible fixed assets to total assets.
Maybe include intangible assets as well, are such assets (patents, brands etc) used as collateral?

There are quite a lot of companies with 0 tangible assets (8000 observations), which is a bit strange. Maybe we should remove those.
"""

# tangible assets / total assets
filtered_df["ASSET_TANGIBILITY"] = filtered_df["br02_matanlsu"] / filtered_df["br09_tillgsu"]

In [278]:
"""
liquidity on debt

Cole use cash and cash equivalents to total assets, but we could also use current ratio or quick ratio.
"""

# Liquid assets (cash and market securities) to total assets
filtered_df["LIQUIDITY"] = filtered_df["br07_kplackaba"] / filtered_df["br09_tillgsu"]

# Current ratio: current assets divided by current liabilities
filtered_df["CURRENT_RATIO"] = filtered_df["br08_omstgsu"] / filtered_df["br13_ksksu"]

# Quick ratio: (current assets - inventory) divided by current liabilities
filtered_df["QUICK_RATIO"] = (filtered_df["br08_omstgsu"] - filtered_df["br06c_lagersu"]) / filtered_df["br13_ksksu"]


In [279]:
"""
Profitability

There are many measures for this, RoA, RoE, operating margin, net margin etc.
"""

# Profit margin according to serrano is already defined as 'ny_vinstprc'

filtered_df['ny_vinstprc'].value_counts(dropna=False)

ny_vinstprc
0.000000    51
0.062500     6
0.038462     4
0.052632     4
0.064516     4
            ..
0.147671     1
0.142091     1
0.125329     1
0.068746     1
0.025252     1
Name: count, Length: 78468, dtype: int64

In [280]:
"""
Tax shields

For now I only find definition in Vanacker & Manigart (2010).
"""

# Debt tax shields: Interest expenses / Total Assets
filtered_df['DEBT_TAX_SHIELDS'] = filtered_df['rr09_finkostn'] / filtered_df['br09_tillgsu']

# Non-debt tax shields: Depreciation and amortization / Total Assets
filtered_df['NON_DEBT_TAX_SHIELDS'] = filtered_df['rr05_avskriv'] / filtered_df['br09_tillgsu']

In [125]:
"""
Tax

Do we mean the corporate tax in Sweden or the % tax the companies actually pay?
"""

'\nTax\n\nDo we mean the corporate tax in Sweden or the % tax the companies actually pay?\n'

In [None]:
"""
Growth opportunities

Some have change in sales too. But that gets weird for us since that is what we use to determine HGFs.

This is defined further up in the code. Before filtering stuff needed to calculate it.
"""


'\nGrowth opportunities\n\nSome have change in sales too. But that gets weird for us since that is what we use to determine HGFs.\n'

In [281]:
"""
Age

Need to discuss this since age could be year incorporated vs todays date, or 2019, or for every year of observation
for one company.
"""

# Change reg date to only year
filtered_df["ser_regdat"] = pd.to_datetime(filtered_df["ser_regdat"])
filtered_df["ser_regdat"] = filtered_df["ser_regdat"].dt.year

# Age of the company (year of observation - year of registration)
filtered_df["AGE"] = filtered_df["ser_year"] - filtered_df["ser_regdat"]

## Dependent

In [282]:
"""
Leverage
"""

# Total debt to total assets
filtered_df['LEVERAGE_TOTAL'] = (filtered_df['br15_lsksu'] + filtered_df['br13_ksksu']) / filtered_df['br09_tillgsu']

# Long-term debt to total assets (all non-current liabilities, could be other things)
filtered_df['LEVERAGE_LONG'] = filtered_df['br15_lsksu'] / filtered_df['br09_tillgsu']

# Short-term debt to total assets (all current liabilities, could be other things)
filtered_df['LEVERAGE_SHORT'] = filtered_df['br13_ksksu'] / filtered_df['br09_tillgsu']

# More filters

In [283]:
display(len(filtered_df))
display(len(filtered_df["ORGNR"].unique()))

78741

20645

In [284]:
# only keep companies that have at least 3 years of data
filtered_df = filtered_df.groupby("ORGNR").filter(lambda x: len(x) >= 3)

In [285]:
display(len(filtered_df))
display(len(filtered_df["ORGNR"].unique()))
display(filtered_df["ser_year"].value_counts())

66493

12026

ser_year
2017.0    9078
2016.0    8944
2015.0    8816
2014.0    8789
2018.0    8314
2013.0    7992
2019.0    7447
2012.0    7113
Name: count, dtype: int64

# Hausmann test

In [286]:
hausmann_df = filtered_df.set_index(["ORGNR", "ser_year"])

In [287]:
hausmann_df = filtered_df.set_index(["ORGNR", "ser_year"])

y = hausmann_df["LEVERAGE_TOTAL"]
X = hausmann_df[["SIZE_LOG_ASSETS", "ASSET_TANGIBILITY", "LIQUIDITY", "ny_vinstprc", "DEBT_TAX_SHIELDS", "NON_DEBT_TAX_SHIELDS", "GROWTH_OPPORTUNITIES_ASSETS", "AGE"]]

In [288]:
y.isna().sum()
X.isna().sum()

SIZE_LOG_ASSETS                0
ASSET_TANGIBILITY              0
LIQUIDITY                      0
ny_vinstprc                    0
DEBT_TAX_SHIELDS               0
NON_DEBT_TAX_SHIELDS           0
GROWTH_OPPORTUNITIES_ASSETS    0
AGE                            0
dtype: int64

In [289]:
import statsmodels.api as sm
from linearmodels import PanelOLS, RandomEffects
from linearmodels.panel.results import compare
from scipy import stats

In [290]:
"""
Model Comparison
"""

# Add a constant term
X = sm.add_constant(X)

# Run fixed effects model
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

# Run random effects model
re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Compare the models
comparison = compare({'random': re_results, 'fixed': fe_results})
print(comparison)

                           Model Comparison                          
                                            random              fixed
---------------------------------------------------------------------
Dep. Variable                       LEVERAGE_TOTAL     LEVERAGE_TOTAL
Estimator                            RandomEffects           PanelOLS
No. Observations                             66493              66493
Cov. Est.                               Unadjusted         Unadjusted
R-squared                                   0.1043             0.0574
R-Squared (Within)                          0.0478             0.0574
R-Squared (Between)                         0.2331            -0.0580
R-Squared (Overall)                         0.1494            -0.0116
F-statistic                                 967.35             414.16
P-value (F-stat)                            0.0000             0.0000
const                                       1.6652             2.3063
                    

In [291]:
def hausman_test(fe_results, re_results):
    """
    Manual implementation of Hausman test for panel data
    """
    # Get coefficients (excluding constant)
    b_fe = fe_results.params[1:]
    b_re = re_results.params[1:]
    
    # Get covariance matrices (excluding constant)
    var_fe = fe_results.cov.iloc[1:, 1:]
    var_re = re_results.cov.iloc[1:, 1:]
    
    # Calculate difference
    diff = b_fe - b_re
    
    # Calculate variance of difference
    var_diff = var_fe - var_re
    
    # Calculate test statistic
    stat = diff.T @ np.linalg.inv(var_diff) @ diff
    
    # Degrees of freedom = number of parameters being tested
    df = len(b_fe)
    
    # Calculate p-value
    p_value = 1 - stats.chi2.cdf(stat, df)
    
    return {
        'stat': stat,
        'df': df,
        'p-value': p_value
    }

In [292]:
"""
Hausman Test w/o industry dummies TOTAL LEVERAGE
"""

fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 994.4593
Degrees of freedom: 8
P-value: 0.0000


In [293]:
"""
Hausman Test w/o industry dummies LONG TERM LEVERAGE
"""

y = hausmann_df["LEVERAGE_LONG"]

# After running your models:
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 609.7035
Degrees of freedom: 8
P-value: 0.0000


In [294]:
"""
Hausman Test w/o industry dummies SHORT TERM LEVERAGE
"""

y = hausmann_df["LEVERAGE_SHORT"]

# After running your models:
fe_model = PanelOLS(y, X, entity_effects=True)
fe_results = fe_model.fit()

re_model = RandomEffects(y, X)
re_results = re_model.fit()

# Perform the Hausman test
result = hausman_test(fe_results, re_results)
print(f"Hausman test statistic: {result['stat']:.4f}")
print(f"Degrees of freedom: {result['df']}")
print(f"P-value: {result['p-value']:.4f}")

Hausman test statistic: 992.6940
Degrees of freedom: 8
P-value: 0.0000


# Split the dataset in HGF vs non-HGF

In [295]:
# Identify HGF ORGNR
high_growth_ids = filtered_df[filtered_df["HIGH_GROWTH"] == 1]["ORGNR"].unique()

# Create a new DataFrame with only high-growth firms
HGF_df = filtered_df[filtered_df["ORGNR"].isin(high_growth_ids)]

# Create a new DataFrame with only non-high-growth firms
non_HGF_df = filtered_df[~filtered_df["ORGNR"].isin(high_growth_ids)]

In [308]:
display(len(HGF_df), len(non_HGF_df))

4010

62483

In [309]:
display(HGF_df["ORGNR"].nunique(), non_HGF_df["ORGNR"].nunique())

755

11271

In [310]:
filtered_df["ser_year"].value_counts()

ser_year
2017.0    9078
2016.0    8944
2015.0    8816
2014.0    8789
2018.0    8314
2013.0    7992
2019.0    7447
2012.0    7113
Name: count, dtype: int64

In [311]:
HGF_df["ser_year"].value_counts()

ser_year
2017.0    638
2016.0    603
2018.0    586
2019.0    539
2015.0    535
2014.0    464
2013.0    372
2012.0    273
Name: count, dtype: int64

In [312]:
non_HGF_df["ser_year"].value_counts()

ser_year
2017.0    8440
2016.0    8341
2014.0    8325
2015.0    8281
2018.0    7728
2013.0    7620
2019.0    6908
2012.0    6840
Name: count, dtype: int64

# Descriptive Data

In [297]:
"""
Industry
"""
# Get one row per company (HGF)
unique_HGFs = HGF_df.drop_duplicates(subset=["ORGNR"])

# Get one row per company (non_HGF)
unique_non_HGFs = non_HGF_df.drop_duplicates(subset=["ORGNR"])

display(unique_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)
display(unique_non_HGFs["INDUSTRY"].value_counts(normalize=True) * 100)

INDUSTRY
Corporate services       29.403974
Construction industry    22.649007
Health & Education       14.966887
Shopping goods           13.245033
IT & Electronics          7.947020
Industrial goods          4.370861
Convenience goods         2.649007
Finance & Real estate     2.251656
Materials                 1.324503
Telecom & Media           0.794702
Energy & Environment      0.397351
Name: proportion, dtype: float64

INDUSTRY
Corporate services       23.094668
Shopping goods           21.391181
Construction industry    20.424097
Industrial goods          9.715198
Convenience goods         9.475645
Health & Education        8.615030
IT & Electronics          2.537486
Materials                 2.014018
Finance & Real estate     1.499423
Energy & Environment      0.621063
Telecom & Media           0.612191
Name: proportion, dtype: float64

In [298]:
"""
Leverage
"""
display(HGF_df["LEVERAGE_SHORT"].describe())
display(non_HGF_df["LEVERAGE_SHORT"].describe())
display(HGF_df["LEVERAGE_LONG"].describe())
display(non_HGF_df["LEVERAGE_LONG"].describe())
display(HGF_df["LEVERAGE_TOTAL"].describe())
display(non_HGF_df["LEVERAGE_TOTAL"].describe())

count    4010.000000
mean        0.570554
std         0.541468
min         0.023494
25%         0.388355
50%         0.551358
75%         0.709473
max        31.113208
Name: LEVERAGE_SHORT, dtype: float64

count    62483.000000
mean         0.488666
std          0.372634
min          0.006915
25%          0.311116
50%          0.454698
75%          0.623500
max         39.583333
Name: LEVERAGE_SHORT, dtype: float64

count    4010.000000
mean        0.107868
std         0.173667
min         0.000000
25%         0.000000
50%         0.009972
75%         0.159496
max         1.642065
Name: LEVERAGE_LONG, dtype: float64

count    62483.000000
mean         0.126017
std          0.273944
min          0.000000
25%          0.000000
50%          0.026549
75%          0.210523
max         24.758333
Name: LEVERAGE_LONG, dtype: float64

count    4010.000000
mean        0.678422
std         0.543294
min         0.046232
25%         0.509733
50%         0.675644
75%         0.820454
max        31.113208
Name: LEVERAGE_TOTAL, dtype: float64

count    62483.000000
mean         0.614682
std          0.518311
min          0.009867
25%          0.430859
50%          0.600070
75%          0.767090
max         64.341667
Name: LEVERAGE_TOTAL, dtype: float64

In [299]:
"""
Firm size
"""
display(HGF_df["SIZE_LOG_ASSETS"].describe())
display(non_HGF_df["SIZE_LOG_ASSETS"].describe())
display(HGF_df["SIZE_LOG_SALES"].describe())
display(non_HGF_df["SIZE_LOG_SALES"].describe())

count    4010.000000
mean        9.091102
std         1.118266
min         3.970292
25%         8.339740
50%         9.077609
75%         9.783549
max        14.915306
Name: SIZE_LOG_ASSETS, dtype: float64

count    62483.000000
mean         9.065300
std          1.050617
min          3.970292
25%          8.370200
50%          9.052633
75%          9.708263
max         16.914990
Name: SIZE_LOG_ASSETS, dtype: float64

count    4010.000000
mean       10.040531
std         0.947360
min         3.401197
25%         9.438710
50%        10.014380
75%        10.612293
max        14.877591
Name: SIZE_LOG_SALES, dtype: float64

count    62483.000000
mean         9.980748
std          0.853063
min          0.000000
25%          9.415890
50%          9.873750
75%         10.457847
max         15.618990
Name: SIZE_LOG_SALES, dtype: float64

In [300]:
"""
Asset tangibility
"""
display(HGF_df["ASSET_TANGIBILITY"].describe())
display(non_HGF_df["ASSET_TANGIBILITY"].describe())

count    4010.000000
mean        0.168595
std         0.226853
min         0.000000
25%         0.010351
50%         0.055191
75%         0.249154
max         0.960242
Name: ASSET_TANGIBILITY, dtype: float64

count    62483.000000
mean         0.232648
std          0.249156
min          0.000000
25%          0.025005
50%          0.126329
75%          0.396537
max          0.996224
Name: ASSET_TANGIBILITY, dtype: float64

In [301]:
"""
liquidity
"""
display(HGF_df["LIQUIDITY"].describe())
display(non_HGF_df["LIQUIDITY"].describe())
display(HGF_df["CURRENT_RATIO"].describe())
display(non_HGF_df["CURRENT_RATIO"].describe())
display(HGF_df["QUICK_RATIO"].describe())
display(non_HGF_df["QUICK_RATIO"].describe())

count    4010.000000
mean        0.264749
std         0.228704
min        -0.036774
25%         0.063627
50%         0.217029
75%         0.418524
max         1.128364
Name: LIQUIDITY, dtype: float64

count    62483.000000
mean         0.250659
std          0.224049
min         -0.408312
25%          0.055101
50%          0.200170
75%          0.395436
max          1.003517
Name: LIQUIDITY, dtype: float64

count    4010.000000
mean        1.613294
std         1.041163
min         0.032141
25%         1.078559
50%         1.397868
75%         1.853233
max        19.233420
Name: CURRENT_RATIO, dtype: float64

count    62483.000000
mean         1.818860
std          1.788058
min          0.001393
25%          1.090317
50%          1.495086
75%          2.091017
max        142.170569
Name: CURRENT_RATIO, dtype: float64

count    4010.000000
mean        1.491585
std         1.025499
min         0.031549
25%         0.966975
50%         1.307252
75%         1.737768
max        19.233420
Name: QUICK_RATIO, dtype: float64

count    62483.000000
mean         1.527634
std          1.408794
min         -2.636364
25%          0.842478
50%          1.263241
75%          1.818738
max         75.191919
Name: QUICK_RATIO, dtype: float64

In [302]:
"""
Profitability
"""
display(HGF_df["ny_vinstprc"].describe())
display(non_HGF_df["ny_vinstprc"].describe())

count    4010.000000
mean       -0.026078
std         3.634250
min      -227.081081
25%         0.016190
50%         0.050427
75%         0.103229
max        12.133333
Name: ny_vinstprc, dtype: float64

count    62483.000000
mean         0.066313
std         12.983075
min      -1284.750000
25%          0.011693
50%          0.039640
75%          0.083556
max       2959.000000
Name: ny_vinstprc, dtype: float64

In [303]:
"""
Tax shields
"""
display(HGF_df["DEBT_TAX_SHIELDS"].describe())
display(non_HGF_df["DEBT_TAX_SHIELDS"].describe())
display(HGF_df["NON_DEBT_TAX_SHIELDS"].describe())
display(non_HGF_df["NON_DEBT_TAX_SHIELDS"].describe())

count    4010.000000
mean       -0.010002
std         0.028378
min        -0.932665
25%        -0.011754
50%        -0.003890
75%        -0.000699
max         0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    62483.000000
mean        -0.010728
std          0.031646
min         -2.537516
25%         -0.013221
50%         -0.004723
75%         -0.000801
max          0.000000
Name: DEBT_TAX_SHIELDS, dtype: float64

count    4010.000000
mean       -0.039746
std         0.065234
min        -2.184781
25%        -0.055659
50%        -0.018208
75%        -0.004588
max         0.000000
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

count    62483.000000
mean        -0.051629
std          0.063131
min         -2.876916
25%         -0.073616
50%         -0.030245
75%         -0.009936
max          0.000000
Name: NON_DEBT_TAX_SHIELDS, dtype: float64

In [304]:
"""
Growth opportunities
"""
display(HGF_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())
display(non_HGF_df["GROWTH_OPPORTUNITIES_ASSETS"].describe())

count    4010.000000
mean        0.319846
std         0.601307
min        -0.941886
25%         0.028279
50%         0.212282
75%         0.447926
max        14.164602
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

count    62483.000000
mean         0.077882
std          0.310472
min         -0.987598
25%         -0.054841
50%          0.037108
75%          0.152619
max         13.611268
Name: GROWTH_OPPORTUNITIES_ASSETS, dtype: float64

In [305]:
"""
Age
"""
display(HGF_df["AGE"].describe())
display(non_HGF_df["AGE"].describe())

count    4010.000000
mean        9.964339
std         8.816212
min         1.000000
25%         4.000000
50%         7.000000
75%        12.000000
max        55.000000
Name: AGE, dtype: float64

count    62483.000000
mean        19.190916
std         14.285346
min          1.000000
25%          9.000000
50%         16.000000
75%         26.000000
max        152.000000
Name: AGE, dtype: float64

# Tests

In [306]:
# Define your dependent variables and controls
dependent_vars = ['LEVERAGE_TOTAL', 'LEVERAGE_LONG', 'LEVERAGE_SHORT']
control_vars = ["SIZE_LOG_ASSETS", "SIZE_LOG_SALES", "ASSET_TANGIBILITY", "GROWTH_OPPORTUNITIES_ASSETS", "LIQUIDITY", "QUICK_RATIO", "CURRENT_RATIO", 
               "ny_vinstprc", "DEBT_TAX_SHIELDS", "NON_DEBT_TAX_SHIELDS", "AGE"]

# Set up for panel data analysis (repeat for both samples)
def run_panel_regressions(data, sample_name):
    """Run fixed effects regressions for all dependent variables on a given sample"""
    
    print(f"\n=== {sample_name} Companies Results ===\n")
    
    # Set panel data index
    panel_data = data.set_index(["ORGNR", "ser_year"])
    
    # Add constant
    X = panel_data[control_vars]
    X = sm.add_constant(X)
    
    # Run regressions for each dependent variable
    for dep_var in dependent_vars:
        y = panel_data[dep_var]
        
        # Run fixed effects model (entity effects only, no time effects)
        fe_model = PanelOLS(y, X, entity_effects=True)
        fe_results = fe_model.fit()
        
        print(f"\nResults for {dep_var}:")
        print(fe_results.summary.tables[1])  # Print coefficients table
        print(f"R-squared: {fe_results.rsquared:.4f}")
        print(f"Within R-squared: {fe_results.rsquared_within:.4f}")

# Run for both samples
run_panel_regressions(HGF_df, "High Growth")
run_panel_regressions(non_HGF_df, "Non-High Growth")

# For formal statistical comparison between groups, you can use a Chow test approach
# This tests if coefficients differ between sub-samples
def chow_test_approach(full_df, high_growth_df, low_growth_df, dep_var):
    """Implement a version of Chow test logic for panel data"""
    
    # Set panel data indices
    full_panel = full_df.set_index(["ORGNR", "ser_year"])
    high_panel = high_growth_df.set_index(["ORGNR", "ser_year"])
    low_panel = low_growth_df.set_index(["ORGNR", "ser_year"])
    
    # Add constant
    X_full = sm.add_constant(full_panel[control_vars])
    X_high = sm.add_constant(high_panel[control_vars])
    X_low = sm.add_constant(low_panel[control_vars])
    
    # Run models
    y_full = full_panel[dep_var]
    y_high = high_panel[dep_var]
    y_low = low_panel[dep_var]
    
    full_model = PanelOLS(y_full, X_full, entity_effects=True)
    high_model = PanelOLS(y_high, X_high, entity_effects=True)
    low_model = PanelOLS(y_low, X_low, entity_effects=True)
    
    full_results = full_model.fit()
    high_results = high_model.fit()
    low_results = low_model.fit()
    
    # Calculate sum of squared residuals
    rss_full = full_results.resids.values.T @ full_results.resids.values
    rss_high = high_results.resids.values.T @ high_results.resids.values
    rss_low = low_results.resids.values.T @ low_results.resids.values
    
    # Calculate F-statistic
    n_full = len(y_full)
    n_params = len(control_vars) + 1  # +1 for constant
    f_stat = ((rss_full - (rss_high + rss_low)) / n_params) / ((rss_high + rss_low) / (n_full - 2 * n_params))
    
    # Print results
    print(f"\nCoefficient Difference Test for {dep_var}:")
    print(f"F-statistic: {f_stat:.4f}")
    # Calculate p-value using F distribution with (k, n-2k) degrees of freedom
    from scipy import stats
    p_value = 1 - stats.f.cdf(f_stat, n_params, n_full - 2 * n_params)
    print(f"P-value: {p_value:.4f}")
    print(f"{'Coefficients differ significantly between groups' if p_value < 0.05 else 'No significant difference in coefficients'}")

# Run comparison test for each dependent variable
for dep_var in dependent_vars:
    chow_test_approach(filtered_df, HGF_df, non_HGF_df, dep_var)



=== High Growth Companies Results ===


Results for LEVERAGE_TOTAL:
                                      Parameter Estimates                                      
                             Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-----------------------------------------------------------------------------------------------
const                           2.1779     0.2156     10.101     0.0000      1.7551      2.6006
SIZE_LOG_ASSETS                -0.4639     0.0332    -13.957     0.0000     -0.5291     -0.3987
SIZE_LOG_SALES                  0.2395     0.0322     7.4335     0.0000      0.1764      0.3027
ASSET_TANGIBILITY               0.1376     0.1189     1.1570     0.2474     -0.0956      0.3707
GROWTH_OPPORTUNITIES_ASSETS     0.0514     0.0147     3.4832     0.0005      0.0225      0.0803
LIQUIDITY                      -0.2494     0.0695    -3.5891     0.0003     -0.3856     -0.1131
QUICK_RATIO                     0.0099     0.0767     0.1290     0.

In [307]:
"""
Correlation matrix
"""

HGF_df_corr = HGF_df[control_vars + dependent_vars].corr()
non_HGF_df_corr = non_HGF_df[control_vars + dependent_vars].corr()

print("High-Growth Correlation Matrix:")
print(HGF_df_corr)


print("\nNon-High-Growth Correlation Matrix:")
print(non_HGF_df_corr)

High-Growth Correlation Matrix:
                             SIZE_LOG_ASSETS  SIZE_LOG_SALES  \
SIZE_LOG_ASSETS                     1.000000        0.799205   
SIZE_LOG_SALES                      0.799205        1.000000   
ASSET_TANGIBILITY                   0.171320       -0.013886   
GROWTH_OPPORTUNITIES_ASSETS         0.065818        0.013141   
LIQUIDITY                          -0.118525       -0.139316   
QUICK_RATIO                         0.045647       -0.099766   
CURRENT_RATIO                       0.142242       -0.020885   
ny_vinstprc                        -0.056475        0.113301   
DEBT_TAX_SHIELDS                    0.077395        0.073555   
NON_DEBT_TAX_SHIELDS               -0.068464        0.028728   
AGE                                 0.338579        0.308461   
LEVERAGE_TOTAL                     -0.159659       -0.043709   
LEVERAGE_LONG                       0.185319       -0.047495   
LEVERAGE_SHORT                     -0.219635       -0.028623   

       