In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# Read csv file to dataframe
df = pd.read_csv('data/data_main_upd_trim_1.csv')
df.head()

Unnamed: 0,gvkey,year,naics,sale,cogs,xsga,xlr,xrd,xad,dvt,...,cogs_D,xsga_D,mkvalt_D,dividend_D,capital_D,intan_D,xlr_D,kexp,mat1,s_g
0,1001,1983.0,722,25395.0,6310.0,16435.0,,0.0,1.33,0.0,...,11782.499719,30688.650218,48302.647024,0.0,23215.819176,1142.771764,,3983.568519,,4.024564
1,1001,1984.0,722,32007.0,8171.0,20628.0,,,1.84,0.0,...,14733.406758,37195.045234,24125.931027,0.0,32151.679831,1137.777465,,6002.025893,,3.917146
2,1001,1985.0,722,53798.0,13530.0,33021.0,,0.0,3.039,0.0,...,23638.968456,57692.71082,70547.382691,0.0,50772.241193,22847.508537,,8578.8148,,3.976201
3,1003,1982.0,442110,12748.0,7973.0,2869.0,,,0.161,0.0,...,15472.540634,5567.630638,,0.0,444.40133,0.0,,80.254809,,1.598896
4,1003,1983.0,442110,13793.0,8469.0,3186.0,,,0.154,0.0,...,15813.944551,5949.135357,26301.956484,0.0,702.09507,0.0,,120.471468,,1.628646


In [2]:
# Generate id group based on 'gvkey' and drop if missing
df['id'] = df['gvkey'].astype(str)
df = df.dropna(subset=['id'])
df['id'] = df['id'].astype(int)
df.head()
# Costshare0 = "calibrated 0.85 (fig 1 NBER)"
df['costshare0'] = 0.85

# Generate costshare1 = "cogs_D/(cogs_D+kexp)"
df['costshare1'] = df['cogs_D'] / (df['cogs_D'] + df['kexp'])

# Generate costshare2 = "cogs_D/(cogs_D+xsga_D+kexp)"
df['costshare2'] = df['cogs_D'] / (df['cogs_D'] + df['xsga_D'] + df['kexp'])

# Generate costshare3 = "sga_D/(cogs_D+xsga_D+kexp)"
df['costshare3'] = df['xsga_D'] / (df['cogs_D'] + df['xsga_D'] + df['kexp'])

# Generate costshare4 = "capital cost share"
df['costshare4'] = df['kexp'] / (df['cogs_D'] + df['xsga_D'] + df['kexp'])

# Generate mu_0, mu_1, mu_2 based on the loop in Stata
for s in range(3):
    df[f'mu_{s}'] = df[f'costshare{s}'] * (df['sale_D'] / df['cogs_D'])
    
for s in range(1, 3):
    # Calculate the 1st and 99th percentiles by year
    df[f'cs{s}_p1'] = df.groupby('year')[f'costshare{s}'].transform(lambda x: x.quantile(0.01))
    df[f'cs{s}_p99'] = df.groupby('year')[f'costshare{s}'].transform(lambda x: x.quantile(0.99))
    
    # Drop rows where costshare is 0 or NaN
    df = df[(df[f'costshare{s}'] != 0) & (~df[f'costshare{s}'].isna())]
    
    # Drop rows where costshare is outside the 1st and 99th percentiles
    df = df[(df[f'costshare{s}'] >= df[f'cs{s}_p1']) & (df[f'costshare{s}'] <= df[f'cs{s}_p99'])]

    # Optional: Drop the percentile columns if you no longer need them
    df.drop([f'cs{s}_p1', f'cs{s}_p99'], axis=1, inplace=True)
# Function to calculate summary statistics
def summarize(df, columns):
    summary = df[columns].agg(['mean', 'median', 'count']).T
    return summary

# Specify the columns to summarize
columns_to_summarize = ['sale_D', 'cogs', 'capital_D', 'xlr_D', 'emp', 'xsga_D']

# 1. Summary statistics for the full sample
summary_all = summarize(df, columns_to_summarize)

# 2. Summary statistics for the sample where xlr_D is not NaN
summary_xlr = summarize(df[df['xlr_D'].notna()], columns_to_summarize)

# Save the results to a text file in the working directory
with open("data/sumstat.txt", "w") as f:
    f.write("Summary Statistics - Full Sample\n")
    f.write(summary_all.to_string())
    f.write("\n\nSummary Statistics - Sample where xlr_D is not NaN\n")
    f.write(summary_xlr.to_string())
# Calculate elasticities via np estimate cost share, median cost share

# Step 1: Calculate and generate mu1_med and mu2_med based on costshare1 and costshare2
for m in range(2, 5):  # Loop over 2, 3, 4 digits of industry code
    for c in range(1, 3): # Loop over costshare1 and costshare2
        # Calculate the median of costshare`c' grouped by indmd (industry code) and year
        median_column_name = f'cs{c}_med_{m}dt'
        df[median_column_name] = df.groupby([f'ind{m}d', 'year'])[f'costshare{c}'].transform('median') # Calculate the median costshare of each industry code and year
        
        # Generate the mu`c'_med_`m' variable
        mu_column_name = f'mu{c}_med_{m}'
        if c == 1:
            df[mu_column_name] = df[median_column_name] * (df['sale_D'] / df['cogs_D'])
        elif c == 2:
            df[mu_column_name] = df[median_column_name] * (df['sale_D'] / (df['cogs_D'] + df['xsga_D']))

# Step 2: Calculate and generate mu3_med based on costshare3
for m in range(2, 5):
    # Calculate the median of costshare3 grouped by ind`m'd and year
    median_column_name = f'cs3_med_{m}dt'
    df[median_column_name] = df.groupby([f'ind{m}d', 'year'])[f'costshare3'].transform('median')
    
    # Generate the mu3_med_`m' variable
    mu_column_name = f'mu3_med_{m}'
    df[mu_column_name] = df[median_column_name] * (df['sale_D'] / df['xsga_D'])

# Step 3: Calculate and generate mu4_med based on costshare4
for m in range(2, 5):
    # Calculate the median of costshare4 grouped by ind`m'd and year
    median_column_name = f'cs4_med_{m}dt'
    df[median_column_name] = df.groupby([f'ind{m}d', 'year'])[f'costshare4'].transform('median')
    
    # Generate the mu4_med_`m' variable
    mu_column_name = f'mu4_med_{m}'
    df[mu_column_name] = df[median_column_name] * (df['sale_D'] / df['kexp'])

# Step 4: Rename and label the variables
rename_map = {
    'mu1_med_2': 'mu_3',
    'mu1_med_3': 'mu_4',
    'mu1_med_4': 'mu_5',
    'mu2_med_2': 'mu_6',
    'mu2_med_3': 'mu_7',
    'mu2_med_4': 'mu_8'
}

# Rename the columns according to the rename_map
df.rename(columns=rename_map, inplace=True)

# Optional: Add labels (in pandas, you can store them in a dictionary)
labels = {
    'mu_3': "markup median costshare 2d (cogs+rk)",
    'mu_4': "markup median costshare 3d (cogs+rk)",
    'mu_5': "markup median costshare 4d (cogs+rk)",
    'mu_6': "markup median costshare 2d (cogs+rk+sga)",
    'mu_7': "markup median costshare 3d (cogs+rk+sga)",
    'mu_8': "markup median costshare 4d (cogs+rk+sga)"
}

# For future reference, you can store these labels or print them out
for col, label in labels.items():
    print(f"{col}: {label}")
# Calculate elasticities via PF estimation, pull parameters F(cogs, k) by period-indsutry

df = df.sort_values(by='ind2d')

# Step 1: Merge with theta_ALLsectors (costshare of cogs by two digit industry code, one value per year)
theta_cd = pd.read_stata("data/theta_ALLsectors.dta")
df = pd.merge(df, theta_cd[['ind2d', 'theta_c']], on='ind2d', how='left')

mu_3: markup median costshare 2d (cogs+rk)
mu_4: markup median costshare 3d (cogs+rk)
mu_5: markup median costshare 4d (cogs+rk)
mu_6: markup median costshare 2d (cogs+rk+sga)
mu_7: markup median costshare 3d (cogs+rk+sga)
mu_8: markup median costshare 4d (cogs+rk+sga)


In [3]:
# Step 2: Generate mu_9
df['mu_9'] = df['theta_c'] * (df['sale_D'] / df['cogs_D'])

# Step 3: Merge with theta_W_s_window.dta
df = df.sort_values(by=['ind2d', 'year'])
theta_w = pd.read_stata("data/theta_W_s_window.dta")


# Show df['year'] and df['ind2d'] types
print(df['year'].dtypes, df['ind2d'].dtypes)
print(theta_w['year'].dtypes, theta_w['ind2d'].dtypes)

# Transfer theta_w datetime64 to int and keeping only the year
theta_w['year'] = theta_w['year'].dt.year
theta_w['year'] = theta_w['year'].astype(int) 
df['year'] = df['year'].astype(int)
theta_w.head()

float64 int64
datetime64[ns] int8


Unnamed: 0,year,ind2d,theta_WI1_ct,theta_WI2_ct,theta_WI2_xt,theta_WI1_kt,theta_WI2_kt
0,1955,11,0.798882,0.636764,-0.058981,-0.09239,0.069324
1,1956,11,0.798882,0.636764,-0.058981,-0.09239,0.069324
2,1957,11,0.798882,0.636764,-0.058981,-0.09239,0.069324
3,1958,11,0.798882,0.636764,-0.058981,-0.09239,0.069324
4,1959,11,0.798882,0.636764,-0.058981,-0.09239,0.069324


In [6]:

# Merge df and theta_w based on 'ind2d' and 'year'
df = pd.merge(df, theta_w[['ind2d', 'year', 'theta_WI1_ct', 'theta_WI2_ct', 'theta_WI2_xt', 'theta_WI1_kt',  'theta_WI2_kt']], on=['ind2d', 'year'], how='left')
df.head()

Unnamed: 0,gvkey,year,naics,sale,cogs,xsga,xlr,xrd,xad,dvt,...,mu4_med_3,cs4_med_4dt,mu4_med_4,theta_c,mu_9,theta_WI1_ct,theta_WI2_ct,theta_WI2_xt,theta_WI1_kt,theta_WI2_kt
0,10971,1955,111,14900.0,8330.0,890.0,,,,1070.0,...,1.306162,0.191757,1.306162,0.86593,1.548903,0.798882,0.636764,-0.058981,-0.09239,0.069324
1,7675,1955,115210,389600.0,241960.0,98870.0,131100.0,,,14510.0,...,1.068598,0.065169,1.068598,0.86593,1.394306,0.798882,0.636764,-0.058981,-0.09239,0.069324
2,10971,1956,111,16300.0,9220.0,860.0,,,,1380.0,...,1.301528,0.174181,1.335401,0.86593,1.530874,0.798882,0.636764,-0.058981,-0.09239,0.069324
3,10884,1956,111339,343700.0,242310.0,28510.0,,,,26320.0,...,1.087573,0.165345,1.059268,0.86593,1.228262,0.798882,0.636764,-0.058981,-0.09239,0.069324
4,7675,1956,115210,410500.0,255300.0,102470.0,134160.0,,,14510.0,...,1.078489,0.060047,1.078489,0.86593,1.39234,0.798882,0.636764,-0.058981,-0.09239,0.069324


In [9]:
#  Generate mu_10, mu_11, mu_12
df['mu_10'] = df['theta_WI1_ct'] * (df['sale_D'] / df['cogs_D'])
df['mu_11'] = df['theta_WI2_ct'] * (df['sale_D'] / df['cogs_D'])
df['mu_12'] = df['theta_WI2_xt'] * (df['sale_D'] / df['xsga_D'])

# Rename and label variables
df.rename(columns={'mu3_med_2': 'mu_13'}, inplace=True)

# Generate additional variables
df['mu_14'] = df['costshare3'] * (df['sale_D'] / df['xsga_D'])
df['mu_cap'] = df['theta_WI1_kt'] * (df['sale_D'] / df['kexp'])

KeyError: 'theta_WI1_ct'

In [7]:


# Step 6: Merge with theta_ms_window.dta
df = df.sort_values(by=['ind2d', 'year'])
theta_ms = pd.read_stata("data/theta_ms_window.dta")
df = pd.merge(df, theta_ms[['ind2d', 'year', 'theta_CM1_ct', 'theta_CM2_ct']], on=['ind2d', 'year'], how='left')

# Generate mu_15 and mu_16
df['mu_15'] = df['theta_CM1_ct'] * (df['sale_D'] / df['cogs_D'])
df['mu_16'] = df['theta_CM2_ct'] * (df['sale_D'] / df['cogs_D'])

# Step 7: Calculate total costs and sales
df['totcost1'] = df['cogs_D'] + df['kexp']
df['totcost2'] = df['cogs_D'] + df['xsga_D'] + df['kexp']

# Step 8: Aggregate values by year
df['TOTSALES'] = df.groupby('year')['sale_D'].transform('sum')
df['TOTCOST1'] = df.groupby('year')['totcost1'].transform('sum')
df['TOTCOST2'] = df.groupby('year')['totcost2'].transform('sum')
df['TOTEMP'] = df.groupby('year')['emp'].transform('sum')

df['TOTCOGS'] = df.groupby('year')['cogs_D'].transform('sum')
df['TOTSGA'] = df.groupby('year')['xsga_D'].transform('sum')
df['TOTK'] = df.groupby('year')['capital_D'].transform('sum')
df['TOTrK'] = df.groupby('year')['kexp'].transform('sum')

# Step 9: Generate cost share ratios
df['cs_red_tot'] = df['TOTCOGS'] / df['TOTCOST1']
df['cs_blue_tot'] = df['TOTCOGS'] / df['TOTCOST2']
df['cs_red_k_tot'] = df['TOTK'] / df['TOTCOST1']
df['cs_red_rk_tot'] = df['TOTrK'] / df['TOTCOST1']
df['cs_blue_k_tot'] = df['TOTK'] / df['TOTCOST2']
df['cs_blue_rk_tot'] = df['TOTrK'] / df['TOTCOST2']
df['cs_blue_x_tot'] = df['TOTSGA'] / df['TOTCOST2']

# Step 10: Calculate the total cost ratio
df['m_totcost'] = df['totcost2'] / df['TOTCOST2']

# You can save the final DataFrame or proceed with additional analysis

ValueError: You are trying to merge on int64 and datetime64[ns] columns for key 'year'. If you wish to proceed you should use pd.concat