In [1]:
import pandas as pd
import numpy as np

df = pd.read_stata("./serrano/serrano_2024_Stata/serrano.dta")


['ORGNR', 'KOMORGNR', 'KOMNAMN', 'BSTYP', 'STATUS', 'REDTYP', 'BELKOD', 'UTDBEL', 'NTOMS', 'LAGERF', 'AKTARB', 'ROINTOV1', 'RAVAR', 'HANDVAR', 'EXTKOSOV', 'PERSKOS', 'AVSKRIV', 'JFRST1', 'RORKOOV1', 'RORRESUL', 'RESAND', 'RTEINKNC', 'RTEINEXT', 'RTEINOV', 'RTEKOKNC', 'RTEKOEXT', 'RTEKOOV', 'JFRSTFIN', 'RESEFIN', 'EXTRAINT', 'EXTRAKOS', 'KNCBDR', 'AGTSK', 'BSLDISP', 'SKATTER', 'MININTRR', 'RESAR', 'KOSALVAR', 'BRUTORES', 'FORSKO', 'ADMKO', 'FOUKO', 'JFRST2', 'ROINTOV2', 'RORKOOV2', 'EJINBET', 'FOUBAUTG', 'PATLIC', 'GOODWILL', 'IMANLOV', 'IMANLSU', 'BYGGMARK', 'MASK', 'INVENT', 'MASKINV', 'MATANLOV', 'MATANLSU', 'ANDKNC', 'LFORDKNC', 'LANDELAG', 'FIANLTOV', 'FIANLTSU', 'ANLTSU', 'PAGARB', 'LAGEROV', 'LAGERSU', 'KUNDFORD', 'KFORDKNC', 'KFORDOV', 'KFORDSU', 'KPLACSU', 'KABASU', 'OMSTGSU', 'TILLGSU', 'AKTIEKAP', 'OVERKURS', 'UPPSKR', 'OVRGBKAP', 'BALRES', 'KNCBDREL', 'AGTSKEL', 'RESARB', 'EKSU', 'OBESKRES', 'MININTR', 'AVSSU', 'LSKKRIN', 'LSKKNC', 'LSKOV', 'LSKSU', 'KSKKRIN', 'KSKLEV', 'KSKKNC', 'KSKOV', 'KSKSU', 'EKSKSU', 'RTENTO', 'ANTANST', 'LONLEDN', 'LONOV', 'SOCKOSTN', 'TANTLEDN', 'RESLONOV', 'AVGVED', 'AVSKSALV', 'AVSKFSG', 'AVSKADM', 'AVSKFOU', 'AVSKOV2', 'AVSKOSPC', 'INTFTG', 'INTFAST', 'SAKOV', 'SAKKOM', 'SAKSU', 'AGTSKV', 'ANSVFOV', 'ANSVFKOM', 'ANSVFSU', 'CHKRBEV', 'CHKRUTN', 'REVBER', 'BOLSTPRO', 'MODDTM', 'BSLSTART', 'BSLSLUT']

# Filter the DataFrame

In [2]:
"""
These filter should be thought through and written about in the methodology section.
We want to avoid cherry-picking the data, but we also want to avoid including data that is not relevant to the study.

We also want to avoid survuvalrship bias, i.e., we want to avoid including only the companies that have survived.
"""

filtered_df = df[(df['ser_year'] >= 2008) & # remove data before 2008
                 (df['ser_jurform'] == 49) & # aktiebolag
                 (df['ser_aktiv'] == 1) & # active companies
                 (df['ser_ftgkategori'] == 30) & # private companies, i.e., not state-owned etc.
                 (df['ser_stklf'] > 0) & # companies with at least 1 employee.
                 (df['br09_tillgsu'] > 0) & # remove companies with no assets (there aren't many of those)
                 (df['knc_kncfall'] == 1)] # only include independet companies, i.e., not subsidiaries or parent companies

# Add Growth Variable

In [3]:
"""
Adding a variable for growth.
Now the companies needs to be big in order to qualify as a high growth company which is a
bit weird since then HGFs will be bigger companies than the other group which might not be ideal
for making comparisons.
"""

# Calculate the annual growth rate for each company and add it as a new column
filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()

# Identify high-growth periods and create a new binary column
filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
    lambda x: ((x['TURNOVER_GROWTH'] > 0.20).rolling(window=3).sum() == 3) & 
              (x['ser_stklf'].iloc[0] in [3, 4, 5, 6, 7]) # only consider companies with at least 10 employees, based on size category
).reset_index(level=0, drop=True).astype(int)



  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(


# Add one industry per company

In [4]:
"""
There are a lot of columns in the dataset that is about the comapnies industry, a lot of different SNI codes.
But we've decided to use the 'bransch_borsbransch_konv' column as the industry variable. Which is a conversion of the SNI codes
to fewer branches.

Some companies have changed industry over time, so we determine the most frequent industry for each company and add it as a new column
to each row assicated with that company.
"""

# add one industry to all companies.
dict_of_industries = {
    10: 'Energy & Environment',
    15: 'Materials',
    20: 'Industrial goods',
    22: 'Construction industry',
    25: 'Shopping goods',
    30: 'Convenience goods',
    35: 'Health & Education',
    40: 'Finance & Real estate',
    45: 'IT & Electronics',
    50: 'Telecom & Media',
    60: 'Corporate services',
    98: 'Other',
    99: 'SNI07 missing'
}

# Determine the most frequent 'bransch_borsbransch_konv' value for each company
most_frequent_industry = filtered_df.groupby('ORGNR')['bransch_borsbransch_konv'].agg(lambda x: x.mode()[0])

# Map the most frequent 'bransch_borsbransch_konv' value to the corresponding industry name
most_frequent_industry = most_frequent_industry.map(dict_of_industries)

# Add the new 'INDUSTRY' column to the DataFrame
filtered_df = filtered_df.merge(most_frequent_industry.rename('INDUSTRY'), on='ORGNR')

# Filter out rows where the 'INDUSTRY' column is 'SNI07 missing' or 'Other'
filtered_df = filtered_df[~filtered_df['INDUSTRY'].isin(['SNI07 missing', 'Other'])]

# Add variables from Vanacker & Manigart (2010)

Adding variables for financing events, i.e., internal finance, debt, and equity. These are inspired from Vanacker & Manigart (2010).

In [5]:
# Dependent Variables

"""

Internal finance:

"When the net increase of retained earnings within a year exceeds 5% of total assets, we define this as an internal financing event."

N.D., this is only retained earnings, not profit/loss of the current year, maybe that should be included too.
Maybe not since that profit might be used for dividends so
then maybe we list it as a internal finance thing but in reailty that money is not used for investments.
"""

# Calculate the percentage change in retained earnings from the previous year
filtered_df['br10e_balres_pct_change'] = filtered_df.groupby('ORGNR')['br10e_balres'].pct_change()

# Internal finance: 1 if the net increase in retained earnings exceeds 5% of total assets
filtered_df['INTERNAL_FINANCE'] = (filtered_df['br10e_balres_pct_change'] > 0.05).astype(int)
# filtered_df['INTERNAL_FINANCE'] = (filtered_df['br10e_balres_pct_change'] > filtered_df['br09_tillgsu'] * 0.05).astype(int)
# ska inte vara %

# Drop the temporary column
filtered_df = filtered_df.drop(columns=['br10e_balres_pct_change'])

"""
Financing with debt:

"financial debt if there is a yearly net increase of outstanding financial debt (both short-term and long-term) that exceeds 5% of total assets."
"""

# Calculate the combined financial debt
filtered_df['combined_financial_debt'] = filtered_df['br14_kskkrin'] + filtered_df['br16_lskkrin']

# Calculate the percentage change in combined financial debt from the previous year
filtered_df['combined_financial_debt_pct_change'] = filtered_df.groupby('ORGNR')['combined_financial_debt'].pct_change()

# Financial debt dummy: 1 if the net increase in debt exceeds 5% of total assets
filtered_df['FINANCIAL_DEBT'] = (filtered_df['combined_financial_debt_pct_change'] > 0.05).astype(int)

# Drop the temporary column
filtered_df = filtered_df.drop(columns=['combined_financial_debt_pct_change'])

"""
External equity:

"companies are coded as using new equity financing when there is a net increase in external equity of at least 5% of total assets."
"""

# Calculate the combined external equity (new share issues + share premium)
filtered_df['external_equity'] = filtered_df['br10a_aktiekap'] + filtered_df['br10b_overkurs']

# Calculate the percentage change in external equity from the previous year
filtered_df['external_equity_pct_change'] = filtered_df.groupby('ORGNR')['external_equity'].pct_change()

# External equity dummy: 1 if the net increase in external equity exceeds 5% of total assets
filtered_df['EXTERNAL_EQUITY'] = (filtered_df['external_equity_pct_change'] > 0.05).astype(int)

# Drop the temporary column
filtered_df = filtered_df.drop(columns=['external_equity_pct_change'])

  filtered_df['br10e_balres_pct_change'] = filtered_df.groupby('ORGNR')['br10e_balres'].pct_change()
  filtered_df['combined_financial_debt_pct_change'] = filtered_df.groupby('ORGNR')['combined_financial_debt'].pct_change()
  filtered_df['external_equity_pct_change'] = filtered_df.groupby('ORGNR')['external_equity'].pct_change()


In [6]:
# Group by 'ser_year' and calculate the percentage of firms with 1 in each column
summary_table = filtered_df.groupby('ser_year').apply(lambda x: pd.Series({
    'Internal finance %': (x['INTERNAL_FINANCE'].sum() / len(x)) * 100,
    'Financial debt %': (x['FINANCIAL_DEBT'].sum() / len(x)) * 100,
    'External equity %': (x['EXTERNAL_EQUITY'].sum() / len(x)) * 100
}))

# Display the summary table
print(summary_table)


          Internal finance %  Financial debt %  External equity %
ser_year                                                         
2008.0              0.000000          0.000000           0.000000
2009.0             44.536208         13.446251           1.020990
2010.0             43.323048         13.045343           1.007320
2011.0             45.100214         12.878572           0.746730
2012.0             46.142779         12.342685           0.807837
2013.0             43.846490         11.522622           0.746363
2014.0             45.884252         11.214186           0.801013
2015.0             46.756144         10.747891           0.811154
2016.0             45.368373         11.027073           0.830549
2017.0             44.609527         11.104876           0.738431
2018.0             49.173297         10.815575           0.764148
2019.0             48.959033         10.176730           0.746755
2020.0             51.215839          8.878697           0.744791
2021.0    

  summary_table = filtered_df.groupby('ser_year').apply(lambda x: pd.Series({


In [7]:
# Adding Independent Variables (lagging one year)

"""
Internal finance:

"As proxies for the amount of internal finance available within the venture,
we use its profitability ratio, measured as earnings on total assets and
the amount of cash and marketable securities on total assets.

Have not included the second part yet, since I don't really know how to calculate it. Or what it is to be honest

Finally, the pay-out ratio, measured as dividends on total assets, indicates lower internal finance."
 - I use another formula for pay ratio, dividends on net profit/loss.
"""

# ROA: Return on Assets (Earnings on Total Assets) - Lagged 1 Year
filtered_df['ROA'] = filtered_df.groupby('ORGNR')['ny_avktokap'].shift(1)

# Cash & Marketable Securities on Total Assets - Lagged 1 Year
filtered_df['CASH_SEC_RATIO'] = filtered_df['br07_kplackaba'] / filtered_df['br09_tillgsu']
filtered_df['CASH_SEC_RATIO'] = filtered_df.groupby('ORGNR')['CASH_SEC_RATIO'].shift(1)


# PAYOUT_RATIO: Dividends on Net Profit - Avoiding Division by Zero
filtered_df['PAYOUT_RATIO'] = np.where(
    filtered_df['rr15_resar'] != 0,  # Only divide when Net Profit ≠ 0
    filtered_df['rr00_utdbel'] / filtered_df['rr15_resar'],
    0  # Otherwise, set to 0
)

# Lag PAYOUT_RATIO by 1 Year
filtered_df['PAYOUT_RATIO'] = filtered_df.groupby('ORGNR')['PAYOUT_RATIO'].shift(1)

"""
"Debt capacity is proxied by leverage and cash flow.

Leverage is operationalized as a company’s debt ratio (financial debt on total assets).

Furthermore, we include a variable indicating if debt is greater than
total assets (negative stockholders’ equity dummy variable).

Cash flow is operationalized by 
using the cash flow ratio (i.e., internally generated cash flow on total assets), indicating a
company’s ability to support additional debt-related payments."
"""

# LEVERAGE: Total Debt / Total Assets - Lagged 1 Year
filtered_df['LEVERAGE'] = filtered_df['combined_financial_debt'] / filtered_df['br09_tillgsu']
filtered_df['LEVERAGE'] = filtered_df.groupby('ORGNR')['LEVERAGE'].shift(1)

# CASH_FLOW_RATIO: Internally Generated Cash Flow - Lagged 1 Year
filtered_df['CASH_FLOW_RATIO'] = (filtered_df['rr07_rorresul'] + filtered_df['rr05_avskriv'] - filtered_df['rr14_skatter'] - filtered_df['rr09_finkostn']) / filtered_df['br09_tillgsu']
filtered_df['CASH_FLOW_RATIO'] = filtered_df.groupby('ORGNR')['CASH_FLOW_RATIO'].shift(1)

# NEGATIVE STOCKHOLDERS EQUITY: 1 if Debt > Assets, else 0 - Lagged 1 Year
filtered_df['NEGATIVE_STOCKHOLDERS_EQUITY'] = (filtered_df['combined_financial_debt'] > filtered_df['br09_tillgsu']).astype(int)
filtered_df['NEGATIVE_STOCKHOLDERS_EQUITY'] = filtered_df.groupby('ORGNR')['NEGATIVE_STOCKHOLDERS_EQUITY'].shift(1)


In [8]:
# Control variables (lagging one year)

"""
Under the static trade-off theory tax shields, financial distress and agency costs are expected to determine financing decisions.
"""

"""
We include two types of tax shields,

debt tax shields (interests on total assets)

and non-debt tax shields (depreciations on total assets).
"""

# Debt tax shields: Interest expenses / Total Assets - Lagged 1 Year
filtered_df['DEBT_TAX_SHIELDS'] = filtered_df['rr09_finkostn'] / filtered_df['br09_tillgsu']
filtered_df['DEBT_TAX_SHIELDS'] = filtered_df.groupby('ORGNR')['DEBT_TAX_SHIELDS'].shift(1)

# Non-debt tax shields: Depreciation / Total Assets - Lagged 1 Year
filtered_df['NON_DEBT_TAX_SHIELDS'] = filtered_df['rr05_avskriv'] / filtered_df['br09_tillgsu']
filtered_df['NON_DEBT_TAX_SHIELDS'] = filtered_df.groupby('ORGNR')['NON_DEBT_TAX_SHIELDS'].shift(1)


"""
The expected cost of financial distress depends on the probability of trouble and the value
lost if trouble comes (Myers 1984). Our proxy for the probability of financial distress is
the OJD score, which is similar to the Altman Z-statistic, but adapted to the
Belgian context (Ooghe and Van Wymeersch 2003). A lower score indicates a higher risk of failure.

Furthermore, we use asset structure operationalized as the ratio of property, plant and equipment
to total assets as a proxy for the cost of financial distress. A lower ratio indicates a higher cost of financial distress
"""

# OJD score - need to find something similar in the dataset. SKIP FOR NOW

# Asset Structure: Property, Plant & Equipment / Total Assets - Lagged 1 Year
filtered_df['ASSET_STRUCTURE'] = filtered_df['br02_matanlsu'] / filtered_df['br09_tillgsu'] 
filtered_df['ASSET_STRUCTURE'] = filtered_df.groupby('ORGNR')['ASSET_STRUCTURE'].shift(1)

"""
Agency costs are particularly prevalent in a setting characterized by considerable future growth options.
Firms generally engage in research and development to generate growth options (Titman and Wessels 1988).
Consequently, we use the ratio of intangible assets on total assets to operationalize agency costs.
"""

# Agency Costs: Intangible Assets / Total Assets - Lagged 1 Year
filtered_df['INTANGIBLE_ASSETS_RATIO'] = filtered_df['br01_imanlsu'] / filtered_df['br09_tillgsu']
filtered_df['INTANGIBLE_ASSETS_RATIO'] = filtered_df.groupby('ORGNR')['INTANGIBLE_ASSETS_RATIO'].shift(1)

"""
Other general control variables, including organizational size (i.e., natural logarithm of total assets),

previous debt financing (i.e., dummy variable equal to 1 if the venture acquired debt financing in the previous year, zero otherwise)

and previous external equity financing (i.e., dummy variable equal to 1 if the venture acquired external equity in the previous year,
zero otherwise) are included in the model. Furthermore, we included year and industry dummy variables in the analysis to
control for time and industry effects.
"""

# Natural logarithm of total assets
filtered_df['LOG_TOTAL_ASSETS'] = filtered_df['br09_tillgsu'].apply(lambda x: x if x <= 0 else np.log(x))

filtered_df['LOG_TOTAL_ASSETS'] = filtered_df.groupby('ORGNR')['LOG_TOTAL_ASSETS'].shift(1)

# Dummy variable indicating if the venture acquired debt financing in the previous year
filtered_df['PREVIOUS_DEBT_FINANCING'] = filtered_df.groupby('ORGNR')['FINANCIAL_DEBT'].shift(1)

# Convert NaN values (for first-year observations) to 0
filtered_df['PREVIOUS_DEBT_FINANCING'] = filtered_df['PREVIOUS_DEBT_FINANCING'].fillna(0).astype(int)

# Dummy variable indicating if the venture acquired external equity in the previous year
filtered_df['PREVIOUS_EXTERNAL_EQUITY_FINANCING'] = filtered_df.groupby('ORGNR')['EXTERNAL_EQUITY'].shift(1)

# Convert NaN values (for first-year observations) to 0
filtered_df['PREVIOUS_EXTERNAL_EQUITY_FINANCING'] = filtered_df['PREVIOUS_EXTERNAL_EQUITY_FINANCING'].fillna(0).astype(int)

In [None]:
filtered_df['ser_nystartat'].value_counts() # 272 491
filtered_df.count() # 3935282

ORGNR                                 3935282
ser_jurform                           3935282
ser_year                              3935282
ser_pnr                               3933791
bransch_sni1                          2051149
                                       ...   
ASSET_STRUCTURE                       3400289
INTANGIBLE_ASSETS_RATIO               3400288
LOG_TOTAL_ASSETS                      3400322
PREVIOUS_DEBT_FINANCING               3935282
PREVIOUS_EXTERNAL_EQUITY_FINANCING    3935282
Length: 197, dtype: int64

# Perform Vanacker & Manigart (2010) tests.

In [9]:
%pip install statsmodels
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [10]:
import statsmodels.formula.api as smf

In [11]:
# Define the variables needed for the regression
regression_vars = [
    'INTERNAL_FINANCE', 'ROA', 'CASH_SEC_RATIO', 'PAYOUT_RATIO',
    'LEVERAGE', 'NEGATIVE_STOCKHOLDERS_EQUITY', 'NON_DEBT_TAX_SHIELDS', 'ASSET_STRUCTURE', 
    'INTANGIBLE_ASSETS_RATIO', 'LOG_TOTAL_ASSETS',
    'PREVIOUS_DEBT_FINANCING', 'PREVIOUS_EXTERNAL_EQUITY_FINANCING'
]

# Removed CASH_FLOW_RATIO and DEBT_TAX_SHIELDS from the regression_vars list
# VIF was higher than 10 for these variables, indicating multicollinearity

# Create a new DataFrame with only the necessary variables
df_regression = filtered_df[regression_vars].copy()

# Drop or fill missing values
df_regression = df_regression.dropna()  # OR df_regression.fillna(df_regression.mean())

formula_internal = """
    INTERNAL_FINANCE ~ ROA + CASH_SEC_RATIO + PAYOUT_RATIO +
    LEVERAGE + NEGATIVE_STOCKHOLDERS_EQUITY +
    NON_DEBT_TAX_SHIELDS + ASSET_STRUCTURE + 
    INTANGIBLE_ASSETS_RATIO + LOG_TOTAL_ASSETS +
    PREVIOUS_DEBT_FINANCING + PREVIOUS_EXTERNAL_EQUITY_FINANCING
"""

model_internal = smf.logit(formula=formula_internal, data=df_regression).fit()
print("\nInternal Financing Model:")
print(model_internal.summary())


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Optimization terminated successfully.
         Current function value: inf
         Iterations 12

Internal Financing Model:
                           Logit Regression Results                           
Dep. Variable:       INTERNAL_FINANCE   No. Observations:              3392981
Model:                          Logit   Df Residuals:                  3392969
Method:                           MLE   Df Model:                           11
Date:                Mon, 10 Feb 2025   Pseudo R-squ.:                    -inf
Time:                        13:06:59   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                   -2.3501e+06
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                            

  return 1/(1+np.exp(-X))


In [97]:
print(df_regression.isna().sum())



INTERNAL_FINANCE                           0
ROA                                   555825
CASH_SEC_RATIO                        555924
PAYOUT_RATIO                          556583
LEVERAGE                              562487
NEGATIVE_STOCKHOLDERS_EQUITY          555825
NON_DEBT_TAX_SHIELDS                  555825
ASSET_STRUCTURE                       555860
INTANGIBLE_ASSETS_RATIO               555861
LOG_TOTAL_ASSETS                      555825
PREVIOUS_DEBT_FINANCING                    0
PREVIOUS_EXTERNAL_EQUITY_FINANCING         0
dtype: int64


In [99]:
from sklearn.linear_model import LogisticRegression

# Drop or fill missing values
df_regression = df_regression.dropna()  # OR df_regression.fillna(df_regression.mean())

# Convert to NumPy arrays for sklearn
X = df_regression.drop(columns=['INTERNAL_FINANCE']).values
y = df_regression['INTERNAL_FINANCE'].values

model = LogisticRegression(penalty='l2', solver='liblinear')
model.fit(X, y)

# Print coefficients
print(model.coef_)

[[ 9.49248773e-01  3.17869732e-01 -2.34831743e-04 -1.28470452e-03
   3.69447225e-01 -1.71879919e-01  8.35811073e-02 -5.49711078e-02
   5.76996588e-02  6.14935857e-02  3.07760338e-02]]


In [101]:
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 🚀 Step 1: Define the regression variables
regression_vars = [
    'INTERNAL_FINANCE', 'ROA', 'CASH_SEC_RATIO', 'PAYOUT_RATIO',
    'LEVERAGE', 'NEGATIVE_STOCKHOLDERS_EQUITY', 'NON_DEBT_TAX_SHIELDS',
    'ASSET_STRUCTURE', 'INTANGIBLE_ASSETS_RATIO', 'LOG_TOTAL_ASSETS',
    'PREVIOUS_DEBT_FINANCING', 'PREVIOUS_EXTERNAL_EQUITY_FINANCING'
]

# 🚀 Step 2: Create a new DataFrame with only the necessary variables
df_regression = filtered_df[regression_vars].copy()

# 🚀 Step 3: Remove missing values
df_regression = df_regression.dropna()

# 🚀 Step 4: Check for perfect separation and remove problematic variables
perfect_predictors = df_regression.columns[df_regression.nunique() == 1].tolist()
df_regression = df_regression.drop(columns=perfect_predictors, errors='ignore')

# 🚀 Step 5: Check for multicollinearity (VIF test) and remove problematic variables
X = df_regression.drop(columns=['INTERNAL_FINANCE'])
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
high_vif_vars = vif_data[vif_data["VIF"] > 10]["Variable"].tolist()
df_regression = df_regression.drop(columns=high_vif_vars, errors='ignore')

# 🚀 Step 6: Standardize continuous variables
continuous_vars = ['ROA', 'CASH_SEC_RATIO', 'PAYOUT_RATIO', 'LEVERAGE',
                   'NON_DEBT_TAX_SHIELDS', 'ASSET_STRUCTURE', 
                   'INTANGIBLE_ASSETS_RATIO', 'LOG_TOTAL_ASSETS']
scaler = StandardScaler()
df_regression[continuous_vars] = scaler.fit_transform(df_regression[continuous_vars])

# 🚀 Step 7: Run Logistic Regressions

# Internal Finance Model
formula_internal = """
    INTERNAL_FINANCE ~ ROA + CASH_SEC_RATIO + PAYOUT_RATIO +
    NEGATIVE_STOCKHOLDERS_EQUITY + NON_DEBT_TAX_SHIELDS +
    ASSET_STRUCTURE + INTANGIBLE_ASSETS_RATIO + LOG_TOTAL_ASSETS +
    PREVIOUS_DEBT_FINANCING + PREVIOUS_EXTERNAL_EQUITY_FINANCING
"""
model_internal = smf.logit(formula=formula_internal, data=df_regression).fit()
print("\nInternal Financing Model:")
print(model_internal.summary())

# Debt Financing Model
formula_debt = """
    FINANCIAL_DEBT ~ ROA + CASH_SEC_RATIO + PAYOUT_RATIO +
    NEGATIVE_STOCKHOLDERS_EQUITY + NON_DEBT_TAX_SHIELDS +
    ASSET_STRUCTURE + INTANGIBLE_ASSETS_RATIO + LOG_TOTAL_ASSETS +
    PREVIOUS_DEBT_FINANCING + PREVIOUS_EXTERNAL_EQUITY_FINANCING
"""
model_debt = smf.logit(formula=formula_debt, data=df_regression).fit()
print("\nDebt Financing Model:")
print(model_debt.summary())

# External Equity Financing Model
formula_equity = """
    EXTERNAL_EQUITY ~ ROA + CASH_SEC_RATIO + PAYOUT_RATIO +
    NEGATIVE_STOCKHOLDERS_EQUITY + NON_DEBT_TAX_SHIELDS +
    ASSET_STRUCTURE + INTANGIBLE_ASSETS_RATIO + LOG_TOTAL_ASSETS +
    PREVIOUS_DEBT_FINANCING + PREVIOUS_EXTERNAL_EQUITY_FINANCING
"""
model_equity = smf.logit(formula=formula_equity, data=df_regression).fit()
print("\nEquity Financing Model:")
print(model_equity.summary())


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Optimization terminated successfully.
         Current function value: inf
         Iterations 14

Internal Financing Model:
                           Logit Regression Results                           
Dep. Variable:       INTERNAL_FINANCE   No. Observations:              3521631
Model:                          Logit   Df Residuals:                  3521620
Method:                           MLE   Df Model:                           10
Date:                Sun, 09 Feb 2025   Pseudo R-squ.:                    -inf
Time:                        18:45:36   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                   -2.4393e+06
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                            

  return 1/(1+np.exp(-X))


PatsyError: Error evaluating factor: NameError: name 'FINANCIAL_DEBT' is not defined
    FINANCIAL_DEBT ~ ROA + CASH_SEC_RATIO + PAYOUT_RATIO +     NEGATIVE_STOCKHOLDERS_EQUITY + NON_DEBT_TAX_SHIELDS +     ASSET_STRUCTURE + INTANGIBLE_ASSETS_RATIO + LOG_TOTAL_ASSETS +     PREVIOUS_DEBT_FINANCING + PREVIOUS_EXTERNAL_EQUITY_FINANCING
    ^^^^^^^^^^^^^^

# Creating the subset with HGFs + Descriptive Statistics.
*up until this point, all companies have been in the same DF.*

In [25]:
# Create DataFrame with high-growth companies
high_growth_orgnr = filtered_df[filtered_df['HIGH_GROWTH'] == 1]['ORGNR'].unique()
high_growth_df = filtered_df[filtered_df['ORGNR'].isin(high_growth_orgnr)]

# Create DataFrame with non-high-growth companies
non_high_growth_df = filtered_df[~filtered_df['ORGNR'].isin(high_growth_orgnr)]

# Create a DataFrame with unique high-growth companies
unique_high_growth_df = high_growth_df.drop_duplicates(subset='ORGNR')

# Create a DataFrame with unique non-high-growth companies
unique_non_high_growth_df = non_high_growth_df.drop_duplicates(subset='ORGNR')
print(len(unique_non_high_growth_df))



485744


In [19]:
# Combine the unique high-growth companies with the non-high-growth companies
combined_df = pd.concat([unique_high_growth_df, unique_non_high_growth_df])

print(len(combined_df))
combined_df['ORGNR'].nunique()


488611


488611

In [27]:
# Calculate the percentage of each industry among all companies
industry_percentage = combined_df['INDUSTRY'].value_counts(normalize=True) * 100

# Calculate the total percentage to check for rounding errors
total_percentage = industry_percentage.sum()

# If the total percentage is not exactly 100, adjust the last value
if total_percentage != 100:
    difference = 100 - total_percentage
    industry_percentage.iloc[-1] += difference

# Add a row for the total percentage
industry_percentage['Total'] = industry_percentage.sum()

# Display the result
print(industry_percentage)

# Calculate the percentage of each industry among high-growth companies
industry_percentage = unique_high_growth_df['INDUSTRY'].value_counts(normalize=True) * 100

# Calculate the total percentage to check for rounding errors
total_percentage = industry_percentage.sum()

# If the total percentage is not exactly 100, adjust the last value
if total_percentage != 100:
    difference = 100 - total_percentage
    industry_percentage.iloc[-1] += difference

# Add a row for the total percentage
industry_percentage['Total'] = industry_percentage.sum()

# Display the result
print(industry_percentage)

INDUSTRY
Corporate services        30.203782
Shopping goods            21.698857
Construction industry     14.701675
Health & Education         6.917978
IT & Electronics           6.683845
Finance & Real estate      6.570462
Industrial goods           5.921070
Convenience goods          3.622309
Telecom & Media            1.947152
Materials                  1.244753
Energy & Environment       0.488118
Total                    100.000000
Name: proportion, dtype: float64
INDUSTRY
Corporate services        25.776073
Construction industry     19.532612
Shopping goods            14.893617
Health & Education        12.452040
Industrial goods           9.208232
IT & Electronics           8.161842
Convenience goods          3.348448
Finance & Real estate      3.034531
Telecom & Media            1.639344
Materials                  1.185909
Energy & Environment       0.767353
Total                    100.000000
Name: proportion, dtype: float64


In [28]:
# Calculate the average goodwill for each industry among all companies
avg_goodwill_by_industry_all = combined_df.groupby('INDUSTRY')['br01c_goodwill'].mean()

# Sort by average goodwill in descending order
average_goodwill_by_industry = avg_goodwill_by_industry_all.sort_values(ascending=False)

# Display the result
print(average_goodwill_by_industry)

# Calculate the average goodwill for each industry among HGFs
avg_goodwill_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['br01c_goodwill'].mean()

# Sort by average goodwill in descending order
avg_goodwill_by_industry_high_growth = avg_goodwill_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print(avg_goodwill_by_industry_high_growth)

INDUSTRY
Energy & Environment     775.081444
Materials                719.238926
Industrial goods         504.911862
Telecom & Media          473.896490
Convenience goods        350.011102
Health & Education       154.906440
IT & Electronics         153.526295
Shopping goods           109.110065
Finance & Real estate     71.700881
Corporate services        53.337668
Construction industry     25.057200
Name: br01c_goodwill, dtype: float64
INDUSTRY
Energy & Environment     1574.681818
Industrial goods         1268.776515
IT & Electronics          729.713675
Shopping goods            591.971897
Finance & Real estate     561.574713
Telecom & Media           477.297872
Health & Education        339.000000
Corporate services        178.174560
Convenience goods         134.520833
Construction industry      90.646429
Materials                  25.617647
Name: br01c_goodwill, dtype: float64


In [29]:
# Calculate the average goodwill for each industry among all companies
avg_sales_by_industry_all = combined_df.groupby('INDUSTRY')['rr01_ntoms'].mean()

# Sort by average goodwill in descending order
average_sales_by_industry = avg_goodwill_by_industry_all.sort_values(ascending=False)

# Display the result
print(average_sales_by_industry)

# Calculate the average goodwill for each industry among HGFs
avg_sales_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['rr01_ntoms'].mean()

# Sort by average goodwill in descending order
avg_sales_by_industry_high_growth = avg_sales_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print(avg_sales_by_industry_high_growth)

INDUSTRY
Energy & Environment     775.081444
Materials                719.238926
Industrial goods         504.911862
Telecom & Media          473.896490
Convenience goods        350.011102
Health & Education       154.906440
IT & Electronics         153.526295
Shopping goods           109.110065
Finance & Real estate     71.700881
Corporate services        53.337668
Construction industry     25.057200
Name: br01c_goodwill, dtype: float64
INDUSTRY
Energy & Environment     612610.318182
Materials                209139.882353
Finance & Real estate     87681.678161
Industrial goods          71223.469697
Convenience goods         47523.437500
Shopping goods            41027.800937
IT & Electronics          31189.606838
Construction industry     30803.689286
Corporate services        25940.364005
Telecom & Media           25127.085106
Health & Education        21918.308123
Name: rr01_ntoms, dtype: float64


In [30]:
# Assuming 'combined_df' is your DataFrame and 'ser_stklf' is the column for size category

# Calculate the average size category for each industry among all companies
avg_size_by_industry_all = combined_df.groupby('INDUSTRY')['ser_stklf'].mean()

# Sort by average size category in descending order
avg_size_by_industry_all = avg_size_by_industry_all.sort_values(ascending=False)

# Display the result
print("Average size category by industry (all companies):")
print(avg_size_by_industry_all)

# Calculate the average size category for each industry among HGFs
avg_size_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['ser_stklf'].mean()

# Sort by average size category in descending order
avg_size_by_industry_high_growth = avg_size_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print("\nAverage size category by industry (high-growth firms):")
print(avg_size_by_industry_high_growth)

Average size category by industry (all companies):
INDUSTRY
Industrial goods         1.612215
Materials                1.396251
Convenience goods        1.334991
Energy & Environment     1.208386
Construction industry    1.163126
Shopping goods           1.139479
Health & Education       1.092391
Corporate services       1.017381
IT & Electronics         1.010135
Telecom & Media          0.994639
Finance & Real estate    0.644655
Name: ser_stklf, dtype: float64

Average size category by industry (high-growth firms):
INDUSTRY
Materials                3.941176
Energy & Environment     3.909091
Industrial goods         3.625000
Telecom & Media          3.617021
Health & Education       3.610644
Finance & Real estate    3.505747
Shopping goods           3.494145
Corporate services       3.488498
IT & Electronics         3.470085
Convenience goods        3.416667
Construction industry    3.385714
Name: ser_stklf, dtype: float64


In [18]:
# Save the filtered DataFrame to an Excel file
output_path = "./serrano/serrano_2024_Stata/firms.xlsx"
filtered_df.head(20000).to_excel(output_path, index=False)

print(f"Filtered data saved to {output_path}")

Filtered data saved to ./serrano/serrano_2024_Stata/firms.xlsx


In [31]:
# Group by 'ser_year' and calculate the percentage of firms with 1 in each column
summary_table = combined_df.groupby('ser_year').apply(lambda x: pd.Series({
    'Internal finance %': (x['INTERNAL_FINANCE'].sum() / len(x)) * 100,
    'Financial debt %': (x['FINANCIAL_DEBT'].sum() / len(x)) * 100,
    'External equity %': (x['EXTERNAL_EQUITY'].sum() / len(x)) * 100
}))

# Display the summary table
print(summary_table)

          Internal finance %  Financial debt %  External equity %
ser_year                                                         
2007.0                   0.0               0.0                0.0
2008.0                   0.0               0.0                0.0
2009.0                   0.0               0.0                0.0
2010.0                   0.0               0.0                0.0
2011.0                   0.0               0.0                0.0
2012.0                   0.0               0.0                0.0
2013.0                   0.0               0.0                0.0
2014.0                   0.0               0.0                0.0
2015.0                   0.0               0.0                0.0
2016.0                   0.0               0.0                0.0
2017.0                   0.0               0.0                0.0
2018.0                   0.0               0.0                0.0
2019.0                   0.0               0.0                0.0
2020.0    

  summary_table = combined_df.groupby('ser_year').apply(lambda x: pd.Series({
