<a href="https://colab.research.google.com/github/justinstayy/Two-Million-Regressions-Python/blob/main/Homework2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import files
import statsmodels.api as sm

# 1. Load the data set and print summary statistics.
# The Excel file is uploaded manually through Colab every time the code runs.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload result fails on the filename lookup
    # below with an IndexError that does not say what went wrong.
    raise ValueError("No file was uploaded; please select the Excel data file.")
filename = next(iter(uploaded))  # Name of the first uploaded file
xls = pd.ExcelFile(filename)  # Load the Excel file
df_data = pd.read_excel(xls, sheet_name=0)  # First sheet of the workbook
df_data.columns = df_data.columns.str.strip()  # Remove stray whitespace around column names

dependent_var = "gamma"  # Dependent variable

# Independent variables that are kept fixed in every regression.
fixed_vars = ["GDPSH60", "LIFEE060", "s60"]  # Level of Income, Life Expectancy, Primary School Enrollment Rate

# Additional independent variables (4 more variables per regression)
other_vars = ["human60", "ggcfd3", "dpop6090", "lly1"]

selected_vars = [dependent_var] + fixed_vars + other_vars
existing_vars = [var for var in selected_vars if var in df_data.columns]

# Report requested columns that the sheet does not contain instead of dropping
# them from the summary without any notice.
absent_vars = [var for var in selected_vars if var not in df_data.columns]
if absent_vars:
    print(f"Warning: columns not found in the data and skipped: {absent_vars}")

# Summary statistics for the variables that are present
summary_stats = df_data[existing_vars].describe()
print("\n=== Summary Statistics ===")
print(summary_stats)

# Number of missing values per variable
missing_values = df_data[existing_vars].isnull().sum()
print("\n=== Missing Values in Key Variables ===")
print(missing_values)

# 2. Running the fixed regression

# Convert every column to numeric first (entries that cannot be parsed become
# NaN), then drop the rows that lack the dependent variable or any fixed
# regressor. Coercing before dropping matters: a value that only turns into NaN
# during conversion would otherwise survive in a fixed regressor and reach the
# regression as missing data.
df_filtered = df_data.apply(pd.to_numeric, errors='coerce')
df_filtered = df_filtered.dropna(subset=[dependent_var] + fixed_vars)

# Prepare the data
X = df_filtered[fixed_vars]  # Independent variables (fixed)
y = df_filtered[dependent_var]  # Dependent variable (GDP growth)

# Add the intercept column and label it "Intercept" in the summary table.
X = sm.add_constant(X)
X = X.rename(columns={"const": "Intercept"})
model = sm.OLS(y, X)  # Ordinary Least Squares (OLS) regression
results = model.fit()  # Fit the model

# Print the results of the "fixed" regression
print(results.summary())

# Visual separator before the written interpretation
print("\n" + "="*80 + "\n")

print(" (2) Interpretation of the 'Fixed' Regression Results ".center(80, "="))  # Title centered with '=' padding

# NOTE: the figures quoted in this text are hard-coded, not read from `results`;
# re-check them against the summary above whenever the input data changes.
print("\nOur 'Fixed' regression shows that our three fixed variables (the same ones used in Sala-i-Martin's study) significantly impact economic growth.\n"
      "Initial GDP has a negative coefficient (-0.0180, p < 0.01), indicating that countries with lower starting GDP grow faster over time, but only when factors such as education and life expectancy are taken into account.\n"
      "The model explains 45.8% of the variation in economic growth (R² = 0.458), highlighting the importance of human capital in development.\n")

print("="*80 + "\n")  # Closing separator


Saving DUOMENYS.xlsx to DUOMENYS (8).xlsx

=== Summary Statistics ===
            gamma     GDPSH60     LIFEE060         s60     human60      ggcfd3    dpop6090        lly1
count  119.000000  117.000000   116.000000  117.000000  101.000000  111.000000  126.000000   72.000000
mean     1.680672  729.726496  5226.810345   20.974359  333.514851    5.468468    2.222222   32.444444
std      1.836230   90.263945  1242.666557   20.852297  241.841544    4.506197    1.301623   24.330407
min     -2.000000  552.000000  3150.000000    0.000000    7.000000    0.000000    0.000000    3.000000
25%      0.000000  665.000000  4172.500000    3.000000  134.000000    3.000000    1.250000   17.000000
50%      2.000000  720.000000  4830.000000   13.000000  299.000000    4.000000    2.000000   26.500000
75%      3.000000  794.000000  6390.000000   33.000000  460.000000    7.000000    3.000000   41.250000
max      7.000000  919.000000  7340.000000   86.000000  961.000000   25.000000   10.000000  160.000000

==

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import random

# Model specification, repeated here so this cell states its own inputs.
dependent_var = "gamma"
fixed_vars = ["GDPSH60", "LIFEE060", "s60"]
# The other 22 candidate variables from Table 1.
table_1_vars = [
    "EQINV", "YrsOpen", "CONFUC", "RULELAW", "MUSLIM",
    "prightsb", "laam", "safrica", "civlibb", "revcoup", "Mining",
    "bmp1", "PRIEXP70", "EcOrg", "wardum", "NONEQINV",
    "ABSLATIT", "RERD", "PROT", "BUDDHA", "CATH",
    "SPAIN"
]

# Storage for the results of the repeated regressions
num_regressions = 100
beta_values = {var: [] for var in fixed_vars + table_1_vars}  # Coefficient estimates per variable
significant_counts = {var: 0 for var in fixed_vars + table_1_vars}  # Times each variable was significant

# Cleaning data: every regression below uses the same complete-case sample,
# i.e. only rows with a finite value for the dependent variable and for all
# fixed and candidate regressors.
required_columns = [dependent_var] + fixed_vars + table_1_vars
absent_columns = [col for col in required_columns if col not in df_filtered.columns]
if absent_columns:
    # Same exception type the dropna call would raise, but naming the columns.
    raise KeyError(f"Columns missing from the data: {absent_columns}")

# Rebind the name rather than mutating the frame produced by the earlier cell
# in place; the resulting `df_filtered` is the same cleaned sample.
df_filtered = (
    df_filtered
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=required_columns)
)
if df_filtered.empty:
    raise ValueError("No complete observations remain after dropping rows with missing values.")

# Running the regressions
#
# All regressor sets are drawn up front from a seeded generator. This makes the
# experiment reproducible and lets the requirement that every candidate variable
# appears in at least two regressions be checked before anything is estimated
# (random sampling on its own gives no such guarantee).
RANDOM_SEED = 42
VARS_PER_REGRESSION = 4
MIN_APPEARANCES = 2
MAX_DRAW_ATTEMPTS = 1000
SIGNIFICANCE_LEVEL = 0.05

if num_regressions * VARS_PER_REGRESSION < MIN_APPEARANCES * len(table_1_vars):
    raise ValueError(
        f"{num_regressions} regressions of {VARS_PER_REGRESSION} variables cannot cover "
        f"{len(table_1_vars)} variables {MIN_APPEARANCES} times each."
    )

draw_rng = random.Random(RANDOM_SEED)
for _attempt in range(MAX_DRAW_ATTEMPTS):
    regression_draws = [
        draw_rng.sample(table_1_vars, VARS_PER_REGRESSION) for _ in range(num_regressions)
    ]
    appearance_counts = {var: 0 for var in table_1_vars}
    for draw in regression_draws:
        for var in draw:
            appearance_counts[var] += 1
    if min(appearance_counts.values()) >= MIN_APPEARANCES:
        break  # every variable is covered often enough
else:
    raise RuntimeError(
        f"Could not draw variable sets covering every variable at least "
        f"{MIN_APPEARANCES} times in {MAX_DRAW_ATTEMPTS} attempts."
    )

for i, selected_vars in enumerate(regression_draws):
    # Regressors for this run: the fixed variables plus the 4 drawn ones
    selected_X = fixed_vars + selected_vars
    X = df_filtered[selected_X]
    y = df_filtered[dependent_var]

    # Defensive check for missing values. A skipped run would reduce the
    # coverage verified above, so this should not trigger on a cleaned sample.
    if X.isnull().values.any() or y.isnull().values.any():
        print(f"⚠️ Skipping Regression {i+1} due to missing data.")
        continue

    # Adding a constant
    X = sm.add_constant(X)

    # Running the regression
    model = sm.OLS(y, X)
    results = model.fit()

    # Store the estimated coefficient of every regressor used in this run
    for var in selected_X:
        beta_values[var].append(results.params[var])

    # Variables significant at the chosen level, excluding the constant
    significant_vars = [
        var
        for var in results.pvalues[results.pvalues < SIGNIFICANCE_LEVEL].index.tolist()
        if var != 'const'
    ]

    for var in significant_vars:
        significant_counts[var] = significant_counts.get(var, 0) + 1

    # Printing results for this regression
    print(f"\n=== Regression {i+1} ===")
    print(f"Variables used: {selected_X}")
    print(results.summary())
    print(f"\n🚀 Significant Variables (p < {SIGNIFICANCE_LEVEL}): {significant_vars}")

# Mean and standard deviation of each variable's coefficient across all the
# regressions in which it was estimated, plus a sign-consistency share.
#
# `cdf_weighted` is the plain (unweighted) fraction of a variable's estimates
# that have the same sign as its mean coefficient.
beta_means = {}
beta_stddev = {}
cdf_weighted = {}
for var, estimates in beta_values.items():
    coefs = np.asarray(estimates, dtype=float)
    if coefs.size == 0:
        # No regression produced an estimate for this variable: record NaN
        # explicitly instead of relying on np.mean([]), which emits a warning.
        beta_means[var] = np.nan
        beta_stddev[var] = np.nan
        cdf_weighted[var] = np.nan
        continue

    beta_means[var] = np.mean(coefs)  # Mean β
    beta_stddev[var] = np.std(coefs)  # Standard deviation (NumPy default, ddof=0)
    if beta_means[var] > 0:
        cdf_weighted[var] = np.mean(coefs > 0)
    else:
        cdf_weighted[var] = np.mean(coefs < 0)


def _abs_mean_sort_key(name):
    """Order by |mean β| descending, placing variables without estimates last."""
    mean_beta = beta_means[name]
    if np.isnan(mean_beta):
        return (1, 0.0)
    return (0, -abs(mean_beta))


# Sorting variables by the absolute size of their mean coefficient. NaN means
# are kept out of the comparison because NaN keys make the ordering unreliable.
sorted_vars = sorted(beta_means.keys(), key=_abs_mean_sort_key)

# Human-readable names shown next to each variable code in the summary table.
# The three fixed regressors are included so they are no longer labelled
# "Unknown"; any code that is still missing here falls back to 'Unknown'.
variable_mapping = {
    "GDPSH60": "Level of Income",
    "LIFEE060": "Life Expectancy",
    "s60": "Primary School Enrollment Rate",
    "BUDDHA": "Fraction Buddhist",
    "EQINV": "Equipment Investing",
    "PRIEXP70": "Primary Exports in 1970",
    "PROT": "Fraction Protestant",
    "CONFUC": "Fraction Confucian",
    "YrsOpen": "Number of Years Open Economy",
    "RERD": "Exchange Rate Distortions",
    "safrica": "Sub-Saharan Africa Dummy",
    "NONEQINV": "Non-equipment Investment",
    "prightsb": "Political Rights",
    "civlibb": "Civil Liberties",
    "revcoup": "Revolutions and Coups",
    "RULELAW": "Rule of Law",
    "ABSLATIT": "Absolute Latitude",
    "laam": "Latin America Dummy",
    "EcOrg": "Economic Organization",
    "human60": "Human Capital",
    "wardum": "War Dummy",
    "ggcfd3": "Public Investment Share",
    "SPAIN": "Fraction Spanish Colony",
    "MUSLIM": "Fraction Muslim",
    "h60": "Higher Education Enrollment",
    "CATH": "Fraction Catholic",
    "Mining": "Fraction of GDP in Mining",
    "bmp1": "Black Market Premium"
}

# One row per variable, in the order given by `sorted_vars`.
summary_df = pd.DataFrame({
    "Variable": [f"{var} ({variable_mapping.get(var, 'Unknown')})" for var in sorted_vars],
    "Mean β": [beta_means[var] for var in sorted_vars],
    "Standard Deviation": [beta_stddev[var] for var in sorted_vars],
    "Weighted CDF": [cdf_weighted[var] for var in sorted_vars]
})





=== Regression 1 ===
Variables used: ['GDPSH60', 'LIFEE060', 's60', 'civlibb', 'Mining', 'MUSLIM', 'bmp1']
                            OLS Regression Results                            
Dep. Variable:                  gamma   R-squared:                       0.580
Model:                            OLS   Adj. R-squared:                  0.533
Method:                 Least Squares   F-statistic:                     12.42
Date:                Sun, 02 Mar 2025   Prob (F-statistic):           7.12e-10
Time:                        20:48:17   Log-Likelihood:                -113.29
No. Observations:                  71   AIC:                             242.6
Df Residuals:                      63   BIC:                             260.7
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

In [None]:
# Final summary table, printed as plain text without the index column
print("\n=== Final Summary: Aggregated Regression Results with Weighted CDF ===\n")
print(summary_df.to_string(index=False))

# Written interpretation, framed by two 80-character rules
rule_line = "=" * 80
interpretation_lines = [
    "Our results mainly confirm Sala-i-Martin's findings, at least in terms of coefficient sign and order of magnitude.",
    "However, some coefficients differ between our results and those of the author, such as equipment investment (EQINV),",
    "which is slightly larger in our model, whereas school enrollment (s60) shows a much weaker impact on growth.",
    "Differences in CDFs and order of magnitude might arise from sample variations or a different number of regression draws.",
    "I used the same three fixed variables as the original study’s author, and their effects remain largely consistent.",
    "We can state that rule of law, human capital, and equipment investment influence economic growth, while factors like",
    "revolutions tend to diminish it.",
]

print("\n" + rule_line + "\n")
print(" (4) Interpretation of Findings ".center(80, "="))
print("\n" + "\n".join(interpretation_lines) + "\n")
print(rule_line + "\n")





=== Final Summary: Aggregated Regression Results with Weighted CDF ===

                              Variable    Mean β  Standard Deviation  Weighted CDF
           EQINV (Equipment Investing)  0.241964            0.029966      1.000000
           CONFUC (Fraction Confucian)  0.074687            0.010777      1.000000
   NONEQINV (Non-equipment Investment)  0.063358            0.017869      1.000000
    Mining (Fraction of GDP in Mining)  0.041733            0.019956      0.947368
YrsOpen (Number of Years Open Economy)  0.023336            0.002557      1.000000
                 RULELAW (Rule of Law)  0.023140            0.005301      1.000000
                     GDPSH60 (Unknown) -0.021282            0.002989      1.000000
    PRIEXP70 (Primary Exports in 1970) -0.020239            0.004691      1.000000
            BUDDHA (Fraction Buddhist)  0.020189            0.005021      1.000000
            PROT (Fraction Protestant) -0.017575            0.003981      1.000000
    safrica (S