In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

df = pd.read_csv("data/imputed_dataset.csv")

In [25]:
df = df[(df['fyear'] >= 2000) & (df['fyear'] <= 2023)]
df = df.dropna(subset=['prev_inv'])

In [26]:
X = df[['liquidity', 'leverage', 'roa', 'icapt', 'aqc', 'prev_inv', 'cpu_index', 'asset_growth']]
y = df['y']

In [27]:
X = X.applymap(lambda x: np.log(x) if x > 0 else x)
y = np.log(df['y'])

In [28]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [29]:
X.head()

Unnamed: 0,liquidity,leverage,roa,icapt,aqc,prev_inv,cpu_index,asset_growth
276,-0.405825,-0.693197,-3.163223,8.375122,0.0,-2.037374,4.134346,0.078636
277,0.540221,-0.690331,-1.702159,9.174431,0.0,-2.904557,4.134346,0.054608
278,0.775244,-0.952044,-1.769399,8.375547,0.0,-2.1185,4.134346,0.275756
279,0.272252,-0.71889,-4.198557,8.622022,5.138149,-2.434787,4.134346,0.004241
280,0.390569,-0.296824,-2.719636,9.521348,7.833204,-3.319816,4.134346,0.067703


In [30]:
X.isnull().sum()

liquidity       0
leverage        0
roa             0
icapt           0
aqc             0
prev_inv        0
cpu_index       0
asset_growth    0
dtype: int64

# Model 1 without rnd

In [31]:
import statsmodels.api as sm

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.728
Model:                            OLS   Adj. R-squared:                  0.727
Method:                 Least Squares   F-statistic:                     2116.
Date:                Sat, 09 Mar 2024   Prob (F-statistic):               0.00
Time:                        11:40:03   Log-Likelihood:                -3569.4
No. Observations:                6347   AIC:                             7157.
Df Residuals:                    6338   BIC:                             7218.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.4519      0.068      6.633   

In [32]:
# Check instrument relevance (F-statistic)
f_statistic = model.fvalue
if f_statistic < 10:
    print("Warning: Instruments may not be relevant.")
else: 
    print("Significant Instruments")

Significant Instruments


**t-values**:
   - t-values represent the ratio of the estimated coefficient to its standard error.
   - They indicate the significance of the estimated coefficient.
   - Larger absolute t-values (in either direction) suggest greater evidence against the null hypothesis that the coefficient is zero.
   
**P-values (P>|t|)**:
   - P-values represent the probability of observing the estimated coefficient (or more extreme) if the null hypothesis (that the coefficient is zero) is true.
   - Small p-values (typically less than a chosen significance level, such as 0.05) suggest that the coefficient is statistically significant.
   - In this output, all p-values are very close to zero (0.000), indicating that all coefficients are statistically significant at conventional significance levels (e.g., 0.05).
   
**Confidence Intervals ([0.025 0.975])**:
   - Confidence intervals provide a range of plausible values for the true population parameter (the coefficient) based on the sample data.
   - They are typically constructed around the estimated coefficient and provide a sense of the precision of the estimate.
   - The confidence interval is calculated as the estimated coefficient plus or minus a margin of error.
   - In this output, the confidence intervals indicate the range within which we are reasonably confident the true coefficients lie.

**R-squared (or Adjusted R-squared) Test:**
R-squared measures the proportion of variance in the dependent variable that is explained by the independent variables in the model.
Adjusted R-squared adjusts for the number of independent variables in the model, providing a more conservative measure of model fit.
Both R-squared and Adjusted R-squared range from 0 to 1, with higher values indicating better fit.

In [33]:
# Calculate R-squared
r_squared = model.rsquared
print("R-squared:", r_squared)

# Calculate Adjusted R-squared
adjusted_r_squared = model.rsquared_adj
print("Adjusted R-squared:", adjusted_r_squared)


R-squared: 0.7275620320910468
Adjusted R-squared: 0.7272181533054248


The F-test assesses the overall significance of the model by comparing the explained variance to the unexplained variance.
A significant F-test indicates that at least one independent variable has a nonzero coefficient, suggesting that the model as a whole is significant.

In [34]:
# Get F-statistic and associated p-value
f_statistic = model.fvalue
p_value = model.f_pvalue

print("F-statistic:", f_statistic)
print("p-value:", p_value)


F-statistic: 2115.7514290253553
p-value: 0.0


# Model 2 with change in investment

In [35]:
df_deltas = df[['liquidity', 'leverage', 'roa', 'icapt', 'aqc', 'prev_inv', 'asset_growth', 'y', 'xrd']].diff(axis=1)
df_deltas = df_deltas.add_suffix('_delta')
df_combined = pd.concat([df, df_deltas], axis=1)
print(df_combined)

      Unnamed: 0   GVKEY    datadate  fyear indfmt consol popsrc datafmt  \
276          276    1075  31/12/2000   2000   INDL      C      D     STD   
277          277    1078  31/12/2000   2000   INDL      C      D     STD   
278          278    1161  31/12/2000   2000   INDL      C      D     STD   
279          279    1209  30/09/2000   2000   INDL      C      D     STD   
280          280    1300  31/12/2000   2000   INDL      C      D     STD   
...          ...     ...         ...    ...    ...    ...    ...     ...   
6618        6618  117768  31/01/2023   2022   INDL      C      D     STD   
6619        6619  119314  31/12/2022   2022   INDL      C      D     STD   
6620        6620  121077  30/09/2022   2022   INDL      C      D     STD   
6621        6621  121718  31/12/2022   2022   INDL      C      D     STD   
6622        6622  125595  31/12/2022   2022   INDL      C      D     STD   

                              conm curcd  ...   cpu_index  liquidity_delta  \
276     P

In [36]:
X2 = df_combined[['leverage_delta', 'roa_delta', 'icapt_delta', 'aqc_delta', 'cpu_index', 'asset_growth_delta', 'prev_inv_delta']]
y = df_combined['y']

In [37]:
X2.isnull().sum()

leverage_delta        0
roa_delta             0
icapt_delta           0
aqc_delta             0
cpu_index             0
asset_growth_delta    0
prev_inv_delta        0
dtype: int64

In [38]:
X2 = sm.add_constant(X2)

model2 = sm.OLS(y, X2).fit()

print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.136
Model:                            OLS   Adj. R-squared:                  0.135
Method:                 Least Squares   F-statistic:                     143.0
Date:                Sat, 09 Mar 2024   Prob (F-statistic):          2.07e-196
Time:                        11:40:03   Log-Likelihood:                 11896.
No. Observations:                6347   AIC:                        -2.378e+04
Df Residuals:                    6339   BIC:                        -2.372e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.0705      0