In [32]:
import pandas as pd

df = pd.read_stata("./serrano/serrano_2024_Stata/serrano.dta")


['ORGNR', 'KOMORGNR', 'KOMNAMN', 'BSTYP', 'STATUS', 'REDTYP', 'BELKOD', 'UTDBEL', 'NTOMS', 'LAGERF', 'AKTARB', 'ROINTOV1', 'RAVAR', 'HANDVAR', 'EXTKOSOV', 'PERSKOS', 'AVSKRIV', 'JFRST1', 'RORKOOV1', 'RORRESUL', 'RESAND', 'RTEINKNC', 'RTEINEXT', 'RTEINOV', 'RTEKOKNC', 'RTEKOEXT', 'RTEKOOV', 'JFRSTFIN', 'RESEFIN', 'EXTRAINT', 'EXTRAKOS', 'KNCBDR', 'AGTSK', 'BSLDISP', 'SKATTER', 'MININTRR', 'RESAR', 'KOSALVAR', 'BRUTORES', 'FORSKO', 'ADMKO', 'FOUKO', 'JFRST2', 'ROINTOV2', 'RORKOOV2', 'EJINBET', 'FOUBAUTG', 'PATLIC', 'GOODWILL', 'IMANLOV', 'IMANLSU', 'BYGGMARK', 'MASK', 'INVENT', 'MASKINV', 'MATANLOV', 'MATANLSU', 'ANDKNC', 'LFORDKNC', 'LANDELAG', 'FIANLTOV', 'FIANLTSU', 'ANLTSU', 'PAGARB', 'LAGEROV', 'LAGERSU', 'KUNDFORD', 'KFORDKNC', 'KFORDOV', 'KFORDSU', 'KPLACSU', 'KABASU', 'OMSTGSU', 'TILLGSU', 'AKTIEKAP', 'OVERKURS', 'UPPSKR', 'OVRGBKAP', 'BALRES', 'KNCBDREL', 'AGTSKEL', 'RESARB', 'EKSU', 'OBESKRES', 'MININTR', 'AVSSU', 'LSKKRIN', 'LSKKNC', 'LSKOV', 'LSKSU', 'KSKKRIN', 'KSKLEV', 'KSKKNC', 'KSKOV', 'KSKSU', 'EKSKSU', 'RTENTO', 'ANTANST', 'LONLEDN', 'LONOV', 'SOCKOSTN', 'TANTLEDN', 'RESLONOV', 'AVGVED', 'AVSKSALV', 'AVSKFSG', 'AVSKADM', 'AVSKFOU', 'AVSKOV2', 'AVSKOSPC', 'INTFTG', 'INTFAST', 'SAKOV', 'SAKKOM', 'SAKSU', 'AGTSKV', 'ANSVFOV', 'ANSVFKOM', 'ANSVFSU', 'CHKRBEV', 'CHKRUTN', 'REVBER', 'BOLSTPRO', 'MODDTM', 'BSLSTART', 'BSLSLUT']

# Filter the DataFrame

In [None]:
"""
These filter should be thought through and written about in the methodology section.
We want to avoid cherry-picking the data, but we also want to avoid including data that is not relevant to the study.

We also want to avoid survuvalrship bias, i.e., we want to avoid including only the companies that have survived.
"""

filtered_df = df[(df['ser_year'] >= 2008) & # remove data before 2007
                 (df['ser_jurform'] == 49) & # aktiebolag
                 (df['ser_aktiv'] == 1) & # active companies
                 (df['ser_ftgkategori'] == 30) & # private companies, i.e., not state-owned etc.
                 (df['ser_stklf'] > 0)] # companies with at least 1 employee.

# Add Variables

In [35]:
"""
Adding a variable for growth.
Now the companies needs to be big in order to qualify as a high growth company which is a
bit weird since then HGFs will be bigger companies than the other group which might not be ideal
for making comparisons.
"""

# Calculate the annual growth rate for each company and add it as a new column
filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()

# Identify high-growth periods and create a new binary column
filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
    lambda x: ((x['TURNOVER_GROWTH'] > 0.20).rolling(window=3).sum() == 3) & 
              (x['ser_stklf'].iloc[0] in [3, 4, 5, 6, 7]) # only consider companies with at least 10 employees, based on size category
).reset_index(level=0, drop=True).astype(int)



  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['TURNOVER_GROWTH'] = filtered_df.groupby('ORGNR')['rr01_ntoms'].pct_change()
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['HIGH_GROWTH'] = filtered_df.groupby('ORGNR').apply(


# Add one industry per company

In [38]:
"""
There are a lot of columns in the dataset that is about the comapnies industry, a lot of different SNI codes.
But we've decided to use the 'bransch_borsbransch_konv' column as the industry variable. Which is a conversion of the SNI codes
to fewer branches.

Some companies have changed industry over time, so we determine the most frequent industry for each company and add it as a new column
to each row assicated with that company.
"""

# add one industry to all companies.
dict_of_industries = {
    10: 'Energy & Environment',
    15: 'Materials',
    20: 'Industrial goods',
    22: 'Construction industry',
    25: 'Shopping goods',
    30: 'Convenience goods',
    35: 'Health & Education',
    40: 'Finance & Real estate',
    45: 'IT & Electronics',
    50: 'Telecom & Media',
    60: 'Corporate services',
    98: 'Other',
    99: 'SNI07 missing'
}

# Determine the most frequent 'bransch_borsbransch_konv' value for each company
most_frequent_industry = filtered_df.groupby('ORGNR')['bransch_borsbransch_konv'].agg(lambda x: x.mode()[0])

# Map the most frequent 'bransch_borsbransch_konv' value to the corresponding industry name
most_frequent_industry = most_frequent_industry.map(dict_of_industries)

# Add the new 'INDUSTRY' column to the DataFrame
filtered_df = filtered_df.merge(most_frequent_industry.rename('INDUSTRY'), on='ORGNR')

# Filter out rows where the 'INDUSTRY' column is 'SNI07 missing' or 'Other'
filtered_df = filtered_df[~filtered_df['INDUSTRY'].isin(['SNI07 missing', 'Other'])]

# Add varibles from Vanacker & Manigart (2010)

Adding variables for financing events, i.e., internal finance, debt, and equity. These are inspired from Vanacker & Manigart (2010).

In [45]:
# Dependent Variables

"""

Internal finance:

"When the net increase of retained earnings within a year exceeds 5% of total assets, we define this as an internal financing event."

N.D., this is only retained earnings, not profit/loss of the current year, maybe that should be included too.
Maybe not since that profit might be used for dividends so
then maybe we list it as a internal finance thing but in reailty that money is not used for investments.
"""

# Calculate the percentage change in retained earnings from the previous year
filtered_df['br10e_balres_pct_change'] = filtered_df.groupby('ORGNR')['br10e_balres'].pct_change()

# Calculate the percentage increase in retained earnings relative to total assets
filtered_df['INTERNAL_FINANCE'] = (filtered_df['br10e_balres_pct_change'] * filtered_df.groupby('ORGNR')['br10e_balres'].shift(1) / filtered_df['br09_tillgsu']) > 0.05

# Convert the boolean values to binary (1 or 0)
filtered_df['INTERNAL_FINANCE'] = filtered_df['INTERNAL_FINANCE'].astype(int)

# Drop the temporary column
filtered_df = filtered_df.drop(columns=['br10e_balres_pct_change'])

"""
Financing with debt:

"financial debt if there is a yearly net increase of outstanding financial debt (both short-term and long-term) that exceeds 5% of total assets."
"""

# Calculate the combined financial debt
filtered_df['combined_financial_debt'] = filtered_df['br14_kskkrin'] + filtered_df['br16_lskkrin']

# Calculate the percentage change in combined financial debt from the previous year
filtered_df['combined_financial_debt_pct_change'] = filtered_df.groupby('ORGNR')['combined_financial_debt'].pct_change()

# Calculate the percentage increase in combined financial debt relative to total assets
filtered_df['FINANCIAL_DEBT'] = (filtered_df['combined_financial_debt_pct_change'] * filtered_df.groupby('ORGNR')['combined_financial_debt'].shift(1) / filtered_df['br09_tillgsu']) > 0.05

# Convert the boolean values to binary (1 or 0)
filtered_df['FINANCIAL_DEBT'] = filtered_df['FINANCIAL_DEBT'].astype(int)

# Drop the temporary columns
filtered_df = filtered_df.drop(columns=[#'combined_financial_debt', do not drop this since it is used in the next section.
                                        'combined_financial_debt_pct_change'])

"""
External equity:

"companies are coded as using new equity financing when there is a net increase in external equity of at least 5% of total assets."
"""

# Calculate the combined external equity
filtered_df['external_equity'] = filtered_df['br10a_aktiekap'] + filtered_df['br10b_overkurs']

# Calculate the percentage change in external equity from the previous year
filtered_df['external_equity_pct_change'] = filtered_df.groupby('ORGNR')['external_equity'].pct_change()

# Calculate the percentage increase in external equity relative to total assets
filtered_df['EXTERNAL_EQUITY'] = (filtered_df['external_equity_pct_change'] * filtered_df.groupby('ORGNR')['external_equity'].shift(1) / filtered_df['br09_tillgsu']) > 0.05

# Convert the boolean values to binary (1 or 0)
filtered_df['EXTERNAL_EQUITY'] = filtered_df['EXTERNAL_EQUITY'].astype(int)

# Drop the temporary columns
filtered_df = filtered_df.drop(columns=['external_equity', 'external_equity_pct_change'])

  filtered_df['br10e_balres_pct_change'] = filtered_df.groupby('ORGNR')['br10e_balres'].pct_change()
  filtered_df['combined_financial_debt_pct_change'] = filtered_df.groupby('ORGNR')['combined_financial_debt'].pct_change()
  filtered_df['external_equity_pct_change'] = filtered_df.groupby('ORGNR')['external_equity'].pct_change()


In [47]:
# Group by 'ser_year' and calculate the percentage of firms with 1 in each column
summary_table = filtered_df.groupby('ser_year').apply(lambda x: pd.Series({
    'Internal finance %': (x['INTERNAL_FINANCE'].sum() / len(x)) * 100,
    'Financial debt %': (x['FINANCIAL_DEBT'].sum() / len(x)) * 100,
    'External equity %': (x['EXTERNAL_EQUITY'].sum() / len(x)) * 100
}))

# Display the summary table
print(summary_table)


  summary_table = filtered_df.groupby('ser_year').apply(lambda x: pd.Series({


          Internal finance %  Financial debt %  External equity %
ser_year                                                         
2008.0              0.000000          0.000000           0.000000
2009.0             21.160410          6.745321           0.564323
2010.0             21.130682          6.433843           0.514910
2011.0             23.028019          6.400859           0.226083
2012.0             22.516212          6.056607           0.195006
2013.0             21.186777          5.503027           0.179047
2014.0             24.001483          5.259545           0.180275
2015.0             25.268922          5.112839           0.183023
2016.0             25.019276          5.205132           0.169192
2017.0             24.304577          5.245630           0.129232
2018.0             27.645858          5.156801           0.126795
2019.0             28.022367          4.779566           0.106303
2020.0             31.433181          4.085938           0.105153
2021.0    

Unnamed: 0,ORGNR,ser_jurform,ser_year,ser_pnr,bransch_sni1,ser_aas,ser_inregyr,bransch_sni2,bransch_sni3,bransch_sni4,...,br15c_obllan,ser_regdat,regfall,TURNOVER_GROWTH,HIGH_GROWTH,INDUSTRY,INTERNAL_FINANCE,FINANCIAL_DEBT,EXTERNAL_EQUITY,combined_financial_debt
0,5.560000e+09,49.0,2008.0,63180.0,36220.0,2.0,1.0,28610.0,51479.0,28759.0,...,0.0,1874-12-11,BOL,,0,Shopping goods,0,0,0,10630.0
1,5.560000e+09,49.0,2009.0,63180.0,36220.0,1.0,1.0,28610.0,51479.0,28759.0,...,0.0,1874-12-11,BOL,-0.059624,0,Shopping goods,0,0,0,15226.0
2,5.560000e+09,49.0,2010.0,63180.0,36220.0,1.0,1.0,28610.0,51479.0,28759.0,...,0.0,1874-12-11,BOL,-0.075869,0,Shopping goods,0,0,0,5364.0
3,5.560000e+09,49.0,2011.0,63180.0,36220.0,1.0,1.0,28610.0,51479.0,28759.0,...,0.0,1874-12-11,BOL,0.027947,0,Shopping goods,0,0,0,0.0
4,5.560000e+09,49.0,2012.0,63180.0,36220.0,1.0,1.0,28610.0,51479.0,28759.0,...,0.0,1874-12-11,BOL,-0.091184,0,Shopping goods,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4086629,5.594141e+09,49.0,2022.0,55474.0,,0.0,1.0,,,,...,0.0,2022-12-11,BOL,,0,Health & Education,0,0,0,0.0
4086630,5.594152e+09,49.0,2022.0,41673.0,,0.0,1.0,,,,...,0.0,2022-12-02,BOL,,0,Corporate services,0,0,0,0.0
4086631,5.594155e+09,49.0,2022.0,21755.0,,0.0,1.0,,,,...,0.0,2022-12-12,BOL,,0,Finance & Real estate,0,0,0,0.0
4086633,5.594157e+09,49.0,2022.0,79140.0,,0.0,1.0,,,,...,0.0,2022-11-25,BOL,,0,Corporate services,0,0,0,0.0


In [48]:
# Set display options
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Auto-detect the display width
pd.set_option('display.max_colwidth', None)  # Show full column content

# Assuming 'filtered_df' is your DataFrame
print(filtered_df)

KeyboardInterrupt: 

In [44]:
# Adding Independent Variables (lagging one year)

"""
Internal finance:

"As proxies for the amount of internal finance available within the venture,
we use its profitability ratio, measured as earnings on total assets and
the amount of cash and marketable securities on total assets.

Have not included the second part yet, since I don't really know how to calculate it. Or what it is to be honest

Finally, the pay-out ratio, measured as dividends on total assets, indicates lower internal finance."
 - I use another formula for pay ratio, dividends on net profit/loss.
"""

# Create the 'ROA' column by lagging the 'ny_avktokap' column by one year
filtered_df['ROA'] = filtered_df.groupby('ORGNR')['ny_avktokap'].shift(1)

# Calculate the PAYOUT_RATIO for the current year
filtered_df['PAYOUT_RATIO'] = filtered_df['rr00_utdbel'] / filtered_df['rr15_resar']

# Lag the PAYOUT_RATIO by one year
filtered_df['PAYOUT_RATIO'] = filtered_df.groupby('ORGNR')['PAYOUT_RATIO'].shift(1)

"""
"Debt capacity is proxied by leverage and cash flow.

Leverage is operationalized as a company’s debt ratio (financial debt on total assets).

Furthermore, we include a variable indicating if debt is greater than
total assets (negative stockholders’ equity dummy variable).

Cash flow is operationalized by 
using the cash flow ratio (i.e., internally generated cash flow on total assets), indicating a
company’s ability to support additional debt-related payments."
"""

# Calculate the LEVERAGE for the current year
filtered_df['LEVERAGE'] = filtered_df['combined_financial_debt'] / filtered_df['br09_tillgsu']

# Lag the LEVERAGE by one year
filtered_df['LEVERAGE'] = filtered_df.groupby('ORGNR')['LEVERAGE'].shift(1)

# Calculate the cash flow as the difference in cash balance from the previous year
filtered_df['CASH_FLOW'] = filtered_df.groupby('ORGNR')['cash_balance'].diff()

# this will be wrong as they have internally generated cash flow, looking at difference in cash balance
# will mean a good positive cash flow if they take a loan or take in equity. 
# So it should probably be cash flow as it is now minus change in loan and equity.



0          0.042452
1          0.130531
2          0.007158
3          0.033726
4         -0.016962
             ...   
4086629    0.086860
4086630    0.109091
4086631    0.044776
4086633    0.086093
4086634    0.062500
Name: ny_avktokap, Length: 3936840, dtype: float64

# Creating the subset with HGF
*up until this point, all companies have been in the same DF.*

In [25]:
# Create DataFrame with high-growth companies
high_growth_orgnr = filtered_df[filtered_df['HIGH_GROWTH'] == 1]['ORGNR'].unique()
high_growth_df = filtered_df[filtered_df['ORGNR'].isin(high_growth_orgnr)]

# Create DataFrame with non-high-growth companies
non_high_growth_df = filtered_df[~filtered_df['ORGNR'].isin(high_growth_orgnr)]

# Create a DataFrame with unique high-growth companies
unique_high_growth_df = high_growth_df.drop_duplicates(subset='ORGNR')

# Create a DataFrame with unique non-high-growth companies
unique_non_high_growth_df = non_high_growth_df.drop_duplicates(subset='ORGNR')
print(len(unique_non_high_growth_df))



485744


In [19]:
# Combine the unique high-growth companies with the non-high-growth companies
combined_df = pd.concat([unique_high_growth_df, unique_non_high_growth_df])

print(len(combined_df))
combined_df['ORGNR'].nunique()


488611


488611

In [27]:
# Calculate the percentage of each industry among all companies
industry_percentage = combined_df['INDUSTRY'].value_counts(normalize=True) * 100

# Calculate the total percentage to check for rounding errors
total_percentage = industry_percentage.sum()

# If the total percentage is not exactly 100, adjust the last value
if total_percentage != 100:
    difference = 100 - total_percentage
    industry_percentage.iloc[-1] += difference

# Add a row for the total percentage
industry_percentage['Total'] = industry_percentage.sum()

# Display the result
print(industry_percentage)

# Calculate the percentage of each industry among high-growth companies
industry_percentage = unique_high_growth_df['INDUSTRY'].value_counts(normalize=True) * 100

# Calculate the total percentage to check for rounding errors
total_percentage = industry_percentage.sum()

# If the total percentage is not exactly 100, adjust the last value
if total_percentage != 100:
    difference = 100 - total_percentage
    industry_percentage.iloc[-1] += difference

# Add a row for the total percentage
industry_percentage['Total'] = industry_percentage.sum()

# Display the result
print(industry_percentage)

INDUSTRY
Corporate services        30.203782
Shopping goods            21.698857
Construction industry     14.701675
Health & Education         6.917978
IT & Electronics           6.683845
Finance & Real estate      6.570462
Industrial goods           5.921070
Convenience goods          3.622309
Telecom & Media            1.947152
Materials                  1.244753
Energy & Environment       0.488118
Total                    100.000000
Name: proportion, dtype: float64
INDUSTRY
Corporate services        25.776073
Construction industry     19.532612
Shopping goods            14.893617
Health & Education        12.452040
Industrial goods           9.208232
IT & Electronics           8.161842
Convenience goods          3.348448
Finance & Real estate      3.034531
Telecom & Media            1.639344
Materials                  1.185909
Energy & Environment       0.767353
Total                    100.000000
Name: proportion, dtype: float64


In [28]:
# Calculate the average goodwill for each industry among all companies
avg_goodwill_by_industry_all = combined_df.groupby('INDUSTRY')['br01c_goodwill'].mean()

# Sort by average goodwill in descending order
average_goodwill_by_industry = avg_goodwill_by_industry_all.sort_values(ascending=False)

# Display the result
print(average_goodwill_by_industry)

# Calculate the average goodwill for each industry among HGFs
avg_goodwill_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['br01c_goodwill'].mean()

# Sort by average goodwill in descending order
avg_goodwill_by_industry_high_growth = avg_goodwill_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print(avg_goodwill_by_industry_high_growth)

INDUSTRY
Energy & Environment     775.081444
Materials                719.238926
Industrial goods         504.911862
Telecom & Media          473.896490
Convenience goods        350.011102
Health & Education       154.906440
IT & Electronics         153.526295
Shopping goods           109.110065
Finance & Real estate     71.700881
Corporate services        53.337668
Construction industry     25.057200
Name: br01c_goodwill, dtype: float64
INDUSTRY
Energy & Environment     1574.681818
Industrial goods         1268.776515
IT & Electronics          729.713675
Shopping goods            591.971897
Finance & Real estate     561.574713
Telecom & Media           477.297872
Health & Education        339.000000
Corporate services        178.174560
Convenience goods         134.520833
Construction industry      90.646429
Materials                  25.617647
Name: br01c_goodwill, dtype: float64


In [29]:
# Calculate the average goodwill for each industry among all companies
avg_sales_by_industry_all = combined_df.groupby('INDUSTRY')['rr01_ntoms'].mean()

# Sort by average goodwill in descending order
average_sales_by_industry = avg_goodwill_by_industry_all.sort_values(ascending=False)

# Display the result
print(average_sales_by_industry)

# Calculate the average goodwill for each industry among HGFs
avg_sales_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['rr01_ntoms'].mean()

# Sort by average goodwill in descending order
avg_sales_by_industry_high_growth = avg_sales_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print(avg_sales_by_industry_high_growth)

INDUSTRY
Energy & Environment     775.081444
Materials                719.238926
Industrial goods         504.911862
Telecom & Media          473.896490
Convenience goods        350.011102
Health & Education       154.906440
IT & Electronics         153.526295
Shopping goods           109.110065
Finance & Real estate     71.700881
Corporate services        53.337668
Construction industry     25.057200
Name: br01c_goodwill, dtype: float64
INDUSTRY
Energy & Environment     612610.318182
Materials                209139.882353
Finance & Real estate     87681.678161
Industrial goods          71223.469697
Convenience goods         47523.437500
Shopping goods            41027.800937
IT & Electronics          31189.606838
Construction industry     30803.689286
Corporate services        25940.364005
Telecom & Media           25127.085106
Health & Education        21918.308123
Name: rr01_ntoms, dtype: float64


In [30]:
# Assuming 'combined_df' is your DataFrame and 'ser_stklf' is the column for size category

# Calculate the average size category for each industry among all companies
avg_size_by_industry_all = combined_df.groupby('INDUSTRY')['ser_stklf'].mean()

# Sort by average size category in descending order
avg_size_by_industry_all = avg_size_by_industry_all.sort_values(ascending=False)

# Display the result
print("Average size category by industry (all companies):")
print(avg_size_by_industry_all)

# Calculate the average size category for each industry among HGFs
avg_size_by_industry_high_growth = unique_high_growth_df.groupby('INDUSTRY')['ser_stklf'].mean()

# Sort by average size category in descending order
avg_size_by_industry_high_growth = avg_size_by_industry_high_growth.sort_values(ascending=False)

# Display the result
print("\nAverage size category by industry (high-growth firms):")
print(avg_size_by_industry_high_growth)

Average size category by industry (all companies):
INDUSTRY
Industrial goods         1.612215
Materials                1.396251
Convenience goods        1.334991
Energy & Environment     1.208386
Construction industry    1.163126
Shopping goods           1.139479
Health & Education       1.092391
Corporate services       1.017381
IT & Electronics         1.010135
Telecom & Media          0.994639
Finance & Real estate    0.644655
Name: ser_stklf, dtype: float64

Average size category by industry (high-growth firms):
INDUSTRY
Materials                3.941176
Energy & Environment     3.909091
Industrial goods         3.625000
Telecom & Media          3.617021
Health & Education       3.610644
Finance & Real estate    3.505747
Shopping goods           3.494145
Corporate services       3.488498
IT & Electronics         3.470085
Convenience goods        3.416667
Construction industry    3.385714
Name: ser_stklf, dtype: float64


In [18]:
# Save the filtered DataFrame to an Excel file
output_path = "./serrano/serrano_2024_Stata/firms.xlsx"
filtered_df.head(20000).to_excel(output_path, index=False)

print(f"Filtered data saved to {output_path}")

Filtered data saved to ./serrano/serrano_2024_Stata/firms.xlsx


In [31]:
# Group by 'ser_year' and calculate the percentage of firms with 1 in each column
summary_table = combined_df.groupby('ser_year').apply(lambda x: pd.Series({
    'Internal finance %': (x['INTERNAL_FINANCE'].sum() / len(x)) * 100,
    'Financial debt %': (x['FINANCIAL_DEBT'].sum() / len(x)) * 100,
    'External equity %': (x['EXTERNAL_EQUITY'].sum() / len(x)) * 100
}))

# Display the summary table
print(summary_table)

          Internal finance %  Financial debt %  External equity %
ser_year                                                         
2007.0                   0.0               0.0                0.0
2008.0                   0.0               0.0                0.0
2009.0                   0.0               0.0                0.0
2010.0                   0.0               0.0                0.0
2011.0                   0.0               0.0                0.0
2012.0                   0.0               0.0                0.0
2013.0                   0.0               0.0                0.0
2014.0                   0.0               0.0                0.0
2015.0                   0.0               0.0                0.0
2016.0                   0.0               0.0                0.0
2017.0                   0.0               0.0                0.0
2018.0                   0.0               0.0                0.0
2019.0                   0.0               0.0                0.0
2020.0    

  summary_table = combined_df.groupby('ser_year').apply(lambda x: pd.Series({
