In [184]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import statsmodels.api as sm

df = pd.read_csv("data/imputed_dataset.csv")

In [185]:
df = df[(df['fyear'] >= 2000) & (df['fyear'] <= 2023)]
df = df.dropna(subset=['prev_inv'])

In [186]:
def best_model(df):
    X = df[['liquidity', 'leverage', 'roa', 'icapt', 'aqc', 'prev_inv', 'cpu_index', 'asset_growth']]
    y = df['y']

    X = X.applymap(lambda x: np.log(x) if x > 0 else x)
    y = np.log(df['y'])

    X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    print(model.summary())

In [187]:
df['sic'].unique()

array([4911, 3845, 3674, 2810, 9997, 1311, 3823, 2836, 3663, 3559, 2070,
       7370, 7374, 2670, 3411, 3841, 4812, 5731, 3826, 3721, 7363, 2834,
       2085, 4011, 2030, 2082, 5122, 3531, 2911, 2840, 7200, 2842, 2086,
       2844, 4841, 2000, 4931, 3851, 3679, 3510, 5331, 4888, 3585, 2820,
       7311, 4924, 7323, 4923, 4731, 2870, 4513, 2040, 5013, 5000, 1389,
       3812, 3944, 2060, 3570, 5211, 2011, 3640, 4213, 3560, 2860, 2631,
       3827, 2621, 5411, 5990, 3760, 3430, 2090, 5812, 8000, 1040, 3021,
       3569, 4932, 3312, 2851, 3490, 8721, 3561, 2080, 2111, 3620, 7340,
       5651, 3630, 2033, 3420, 4512, 3540, 5140, 3825, 7373, 4210, 3724,
       8062, 2300, 1400, 5912, 3060, 3572, 4922, 7372, 8700, 4400, 3843,
       4833, 3678, 7990, 4953, 1000, 8071, 3576, 3844, 3829, 5531, 2835,
       4991, 3577, 2273, 3672, 3714, 7011, 5399, 2800, 5200, 2650, 5010,
       3743, 5090, 8090, 5047, 5500, 5961, 1731, 4700], dtype=int64)

In [188]:
sic_divisions = {
    (100, 1000): "Agriculture, Forestry, and Fishing",
    range(1000, 1500): "Mining",
    range(1500, 1800): "Construction",
    range(2000, 4000): "Manufacturing",
    range(4000, 5000): "Transportation, Communications, Electric, Gas, and Sanitary Services",
    range(5000, 5200): "Wholesale Trade",
    range(5200, 6000): "Retail Trade",
    range(6000, 6800): "Finance, Insurance, and Real Estate",
    range(7000, 9000): "Services",
    range(9100, 9730): "Public Administration",
    range(9900, 10000): "Non-classifiable Establishments"
}

def map_sic_to_division(code):
    for range_, division in sic_divisions.items():
        if isinstance(range_, int):
            if code == range_:
                return division
        elif isinstance(range_, range):
            if code in range_:
                return division
        elif isinstance(range_, tuple):
            if range_[0] <= code <= range_[1]:
                return division
    return None

# Apply the function to create a new column indicating the group
df['division'] = df['sic'].apply(map_sic_to_division)


In [189]:
df['division'].unique()

array(['Transportation, Communications, Electric, Gas, and Sanitary Services',
       'Manufacturing', 'Non-classifiable Establishments', 'Mining',
       'Services', 'Retail Trade', 'Wholesale Trade',
       'Agriculture, Forestry, and Fishing', 'Construction'], dtype=object)

In [190]:
df.head()

Unnamed: 0.1,Unnamed: 0,GVKEY,datadate,fyear,indfmt,consol,popsrc,datafmt,conm,curcd,...,y,liquidity,leverage,size,roa,prev_inv,asset_growth,year,cpu_index,division
276,276,1075,31/12/2000,2000,INDL,C,D,STD,PINNACLE WEST CAPITAL CORP,USD,...,0.092124,0.666427,0.499975,7149.151,0.042289,0.130371,1.08181,2000,62.448747,"Transportation, Communications, Electric, Gas,..."
277,277,1078,31/12/2000,2000,INDL,C,D,STD,ABBOTT LABORATORIES,USD,...,0.087658,1.716387,0.50141,15283.254,0.18229,0.054773,1.056127,2000,62.448747,Manufacturing
278,278,1161,31/12/2000,2000,INDL,C,D,STD,ADVANCED MICRO DEVICES,USD,...,0.139652,2.171121,0.385951,5767.735,0.170435,0.120212,1.317527,2000,62.448747,Manufacturing
279,279,1209,30/09/2000,2000,INDL,C,D,STD,AIR PRODUCTS & CHEMICALS INC,USD,...,0.092824,1.312918,0.487293,8270.5,0.015017,0.087616,1.00425,2000,62.448747,Manufacturing
280,280,1300,31/12/2000,2000,INDL,C,D,STD,HONEYWELL INTERNATIONAL INC,USD,...,0.033883,1.477821,0.743175,25175.0,0.065899,0.036159,1.070047,2000,62.448747,Non-classifiable Establishments


In [191]:
services_df = df[df['division'] == 'Services']["conm"].unique()
print(services_df)

['AUTODESK INC' 'AUTOMATIC DATA PROCESSING' 'ROBERT HALF INC'
 'CINTAS CORP' 'OMNICOM GROUP INC' 'EQUIFAX INC'
 'INTL BUSINESS MACHINES CORP' 'INTERPUBLIC GROUP OF COS'
 'JACOBS SOLUTIONS INC' 'S&P GLOBAL INC' 'CVS HEALTH CORP' 'PAYCHEX INC'
 'ROLLINS INC' 'TYLER TECHNOLOGIES INC' 'UNIVERSAL HEALTH SVCS INC'
 'HENRY (JACK) & ASSOCIATES' 'MICROSOFT CORP' 'ORACLE CORP' 'GARTNER INC'
 'ADOBE INC' 'CADENCE DESIGN SYSTEMS INC' 'FAIR ISAAC CORP'
 'MGM RESORTS INTERNATIONAL' 'LABORATORY CP OF AMER HLDGS'
 'GEN DIGITAL INC' 'ELECTRONIC ARTS INC' 'PTC INC'
 'ROPER TECHNOLOGIES INC' 'SYNOPSYS INC' 'MATCH GROUP INC' 'INTUIT INC'
 'MARRIOTT INTL INC' 'DAVITA INC' 'ANSYS INC'
 'FACTSET RESEARCH SYSTEMS INC' 'QUEST DIAGNOSTICS INC'
 'TAKE-TWO INTERACTIVE SFTWR' 'VERISIGN INC' 'COGNIZANT TECH SOLUTIONS'
 'COSTAR GROUP INC' 'F5 INC' 'AKAMAI TECHNOLOGIES INC']


In [192]:
agri_df = df[df['division'] == 'Agriculture, Forestry, and Fishing']["conm"].unique()
print(agri_df)

['FREEPORT-MCMORAN INC']


In [193]:
# Group by 'division' column and apply best_model method to each group
results = df.groupby('division').apply(lambda x: best_model(x))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.786
Method:                 Least Squares   F-statistic:                     11.12
Date:                Fri, 08 Mar 2024   Prob (F-statistic):           7.06e-05
Time:                        13:27:03   Log-Likelihood:                 8.2408
No. Observations:                  23   AIC:                             1.518
Df Residuals:                      14   BIC:                             11.74
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -3.6524      1.154     -3.165   

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     3.171
Date:                Fri, 08 Mar 2024   Prob (F-statistic):             0.0285
Time:                        13:27:03   Log-Likelihood:                 16.639
No. Observations:                  23   AIC:                            -15.28
Df Residuals:                      14   BIC:                            -5.058
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -3.0622      1.045     -2.930   