<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/DataRandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Mount Google Drive
# Colab-only step (relies on the google.colab package): exposes the user's Drive
# under /content/drive. The data-loading cell reads its CSV from
# /content/drive/MyDrive/..., so this has to run before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Random Forest Monolith

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns # For potentially nicer default styles if needed, though we customize

# --- LaTeX Style Plotting Setup ---
# Start from the white-grid base style, then override fonts, sizes and export
# settings in a single update so the whole configuration is visible at a glance.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    # Typography. To pin a specific serif face, additionally set 'font.serif',
    # e.g. ['DejaVu Serif', 'Bitstream Vera Serif', 'Liberation Serif', 'Times New Roman'].
    'font.family': 'serif',
    'axes.labelsize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.titlesize': 12,
    'axes.titlesize': 10,
    # Geometry and line weight (default size; the importance plots set their own height).
    'figure.figsize': (8, 6),
    'lines.linewidth': 1.5,
    # Export defaults used by plt.savefig.
    'savefig.dpi': 300,
    'savefig.format': 'pdf',
})

# --- 1. Load Data ---
# Reads the semicolon-separated panel from Drive. If the file is missing, a
# synthetic panel (10 firms x 14 years) is generated instead so the rest of the
# pipeline can be smoke-tested end to end.
file_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_transformed.csv'
try:
    df_original = pd.read_csv(file_path, sep=';')
except FileNotFoundError:
    print("ERROR: 'Final_Dataset_transformed.csv' not found. Please upload it or check the path.")
    # Dummy data for testing.
    # The column names MUST match the variable groups defined in section 2
    # (dependent variables, intangibles, firm/macro controls, is_imputed);
    # otherwise the lag creation and data preparation steps fail with KeyError.
    data_size = 140
    rng = np.random.default_rng(42)  # seeded so the smoke test is reproducible
    firms = [f'Comp{i % 10}' for i in range(data_size)]
    years = [2008 + i // 10 for i in range(data_size)]
    df_original = pd.DataFrame({
        'company': firms, 'year': years,
        # Dependent variables
        'ln(Tobin’s Q)': rng.random(data_size) * 2 + 0.5,
        'ln(PE)': rng.random(data_size) * 30 + 5,
        'ln(TSR)': rng.random(data_size) * 0.5 - 0.1,
        # Contemporaneous intangibles
        'brand_value': rng.random(data_size) * 1000,
        'patent_claims': rng.integers(0, 500, size=data_size),
        'employee_smoothed_rating': rng.random(data_size) * 2 + 3,
        'R&D_Intensity': rng.random(data_size) * 0.15,
        'SG&A_Intensity': rng.random(data_size) * 0.25,
        'YJ(Sentiment_PCR)': rng.standard_normal(data_size),
        'is_imputed': rng.choice([0, 1], size=data_size, p=[0.8, 0.2]),
        # Firm-level controls
        'ln_totalAssets-1': rng.random(data_size) * 5 + 10,
        'ROA-1': rng.random(data_size) * 0.2,
        'ln(Financial Leverage-1)': rng.random(data_size) * 1 + 0.5,
        # Macro controls
        'delta_ln_S5INFT-1': rng.standard_normal(data_size) * 0.1,
        'delta_ln_GDPWorld-1': rng.standard_normal(data_size) * 0.02,
    })
    print("Using DUMMY DATA for testing Random Forest.")

# Index the panel by (company, year); df is the working copy that later steps
# extend with lagged columns, df_original stays untouched.
df_original['year'] = df_original['year'].astype(int)
df_original.set_index(['company', 'year'], inplace=True)
df = df_original.copy()

# --- 2. Define Variable Groups (ensure these match your DataFrame column names) ---
# Every name below is used verbatim as a DataFrame column label, so spelling,
# punctuation and the typographic apostrophe in 'ln(Tobin’s Q)' must match the
# CSV header exactly.

# Outcome variables; one Random Forest is fitted per entry and scenario.
dependent_vars_list = ['ln(Tobin’s Q)', 'ln(PE)', 'ln(TSR)']
# Intangible-asset predictors measured in the same year as the outcome.
# A '<name>_lag1' copy of each is created in the lag step that follows.
contemporaneous_intangibles = [
    'brand_value', 'patent_claims', 'employee_smoothed_rating',
    'R&D_Intensity', 'SG&A_Intensity', 'YJ(Sentiment_PCR)'
]
# Using only 'ln_totalAssets-1' as the size control
# (the '-1' suffix presumably marks values already lagged by one period in the
# source file - TODO confirm against the dataset documentation).
firm_controls = [
    'ln_totalAssets-1', 'ROA-1', 'ln(Financial Leverage-1)'
]
# Macro-level controls, shared by both scenarios.
macro_controls = [
    'delta_ln_S5INFT-1', 'delta_ln_GDPWorld-1'
]
# Dummy column used alongside the contemporaneous intangibles.
dummy_controls_contemp = ["is_imputed"]

# Create lagged intangibles and lagged dummy
# groupby(...).shift(1) returns the value from the previous ROW of the same
# company, so it is only a "previous year" lag when each company's rows are in
# ascending year order. Verify that first and sort the panel only when the
# order is actually violated (already-ordered data keeps its row order).
# NOTE(review): gaps in a company's year sequence are not detected; with a
# missing year, shift(1) silently spans more than one year - confirm the panel
# has consecutive years per company.
_year_steps = (
    df.index.to_frame(index=False)
    .groupby('company', sort=False)['year']
    .diff()
)
if (_year_steps < 0).any():
    df = df.sort_index(level=['company', 'year'])

lagged_intangibles = [f'{col}_lag1' for col in contemporaneous_intangibles]
for col, lagged_col_name in zip(contemporaneous_intangibles, lagged_intangibles):
    # The first observation of every company has no predecessor and becomes NaN.
    df[lagged_col_name] = df.groupby(level='company')[col].shift(1)
df['is_imputed_lag1'] = df.groupby(level='company')['is_imputed'].shift(1)
dummy_controls_lagged = ["is_imputed_lag1"]

# --- 3. Data Preparation Function for RF ---
def prepare_data_for_ml(dataf, dep_var_name, predictor_vars_list):
    """Build the within-company demeaned, standardized sample for one model.

    Steps:
      1. Keep only the dependent variable and the requested predictors.
      2. Drop every row with a missing value in any of those columns.
      3. Subtract each company's mean (computed on the remaining rows) from
         every column, i.e. a fixed-effects "within" transformation.
      4. Standardize the predictors with ``StandardScaler``; the dependent
         variable is demeaned but not rescaled.

    Rows are dropped BEFORE demeaning on purpose: if the means were taken over
    rows that are later discarded (e.g. each company's first year in the
    lagged models), the surviving sample would no longer be centred per
    company and firm-level offsets would leak back into the data.
    NOTE(review): any sibling analysis that shares this preparation step
    (the original header mentions Lasso) should use the same ordering.

    Args:
        dataf: DataFrame whose index has a level named ``'company'``.
        dep_var_name: Column label of the dependent variable.
        predictor_vars_list: Sequence of predictor column labels.

    Returns:
        ``(y, X, columns)`` where ``y`` is the demeaned dependent variable
        (Series), ``X`` the demeaned and standardized predictors (DataFrame
        sharing ``y``'s index) and ``columns`` the predictor labels as a
        pandas Index. Returns ``(None, None, None)`` after printing a message
        when columns are missing, fewer than 10 complete rows remain, or no
        predictors were given.
    """
    predictor_vars_list = list(predictor_vars_list)  # accept tuples / Index objects too
    cols_to_use = [dep_var_name] + predictor_vars_list
    missing_cols_in_df = [col for col in cols_to_use if col not in dataf.columns]
    if missing_cols_in_df:
        print(f"ERROR in prepare_data_for_ml: Columns {missing_cols_in_df} not found.")
        return None, None, None

    # Complete cases first, so the company means below describe exactly the
    # rows that end up in the model.
    complete_cases = dataf[cols_to_use].dropna()
    if len(complete_cases) < 10:  # Min obs check (also covers the empty case)
        print(f"Warning: Dataframe empty/too small after demeaning and dropna for DV: {dep_var_name}.")
        return None, None, None

    company_means = complete_cases.groupby(level='company').transform('mean')
    demeaned_data = complete_cases - company_means

    y_final = demeaned_data[dep_var_name]
    X_final_demeaned = demeaned_data[predictor_vars_list]

    # Only reachable with an empty predictor list (rows are guaranteed above).
    if X_final_demeaned.empty:
        print(f"Warning: X_final_demeaned is empty for DV: {dep_var_name}. Cannot standardize.")
        return None, None, None

    scaler = StandardScaler()
    X_final_standardized_array = scaler.fit_transform(X_final_demeaned)
    X_final_standardized_df = pd.DataFrame(
        X_final_standardized_array,
        columns=X_final_demeaned.columns,
        index=X_final_demeaned.index,
    )
    return y_final, X_final_standardized_df, X_final_demeaned.columns  # Return column names

# --- 4. Random Forest Analysis Function ---
def _save_importance_barh(features, values, xlabel, title, filename, color, xerr=None):
    """Draw one horizontal importance bar chart, save it, show it and close it.

    ``features`` and ``values`` must already be in plotting order (``barh``
    draws the first entry at the bottom). ``xerr`` optionally adds gray error
    bars. The figure is closed afterwards so repeated calls do not pile up
    open figures.
    """
    fig = plt.figure(figsize=(8, 0.25 * len(features) + 2))  # Dynamic height
    if xerr is None:
        plt.barh(features, values, align='center', color=color)
    else:
        plt.barh(features, values, xerr=xerr, align='center', color=color, ecolor='gray')
    plt.xlabel(xlabel)
    plt.ylabel("Predictor")
    plt.title(title, fontsize=10)
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()
    plt.close(fig)


def run_rf_feature_importance(y, X, X_cols, model_name_display, dep_var_name, scenario_label, test_size=0.25, random_state=42):
    """Fit a Random Forest, report its fit and extract feature importances.

    The data is split randomly into train and test parts; the forest is fitted
    on the training part. Out-of-bag and test-set R-squared are printed, then
    two importance measures are tabulated, plotted and saved as PDF in the
    current working directory:
      * permutation importance, evaluated on the test set, and
      * mean decrease in impurity (MDI) from the fitted forest.

    Args:
        y: Dependent variable (Series) or None.
        X: Predictor DataFrame aligned with ``y``, or None.
        X_cols: Predictor labels in the column order of ``X`` (any sequence).
        model_name_display: Human-readable model label for messages and titles.
        dep_var_name: Name of the dependent variable for messages and titles.
        scenario_label: Scenario identifier used in titles and file names.
        test_size: Share of observations held out for testing.
        random_state: Seed for the split, the forest and the permutations.

    Returns:
        ``(model, permutation_importance_df, mdi_importance_df)``, both tables
        sorted by importance in descending order, or ``(None, None, None)``
        when there is too little data or fitting fails.
    """
    if y is None or X is None or X.empty or y.empty or len(y) < 20 : # Min obs for split
        print(f"Skipping RF for {model_name_display} (DV: {dep_var_name}) due to insufficient data after prep (Obs: {len(y) if y is not None else 0}).")
        return None, None, None

    print(f"\n--- Fitting Random Forest for: {model_name_display} (DV: {dep_var_name}) ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if len(X_train) < 10 or len(X_test) < 5: # Ensure splits are meaningful
        print(f"  Skipping {model_name_display}: Train/Test split resulted in too few samples (Train: {len(X_train)}, Test: {len(X_test)}).")
        return None, None, None

    rf_model = RandomForestRegressor(
        n_estimators=200,       # Reasonable number of trees
        max_depth=None,         # No depth cap; tree size is limited by the two minimums below
        min_samples_split=5,
        min_samples_leaf=2,
        oob_score=True,         # Enables the out-of-bag R-squared reported below
        random_state=random_state,
        n_jobs=-1               # Use all available cores
    )

    try:
        rf_model.fit(X_train, y_train)
    except Exception as e:
        # Deliberate best-effort: one failing model must not abort the whole batch.
        print(f"  ERROR fitting RandomForestRegressor for {model_name_display}: {e}")
        return None, None, None

    # Basic Performance Metrics
    oob_r2 = rf_model.oob_score_
    test_r2 = r2_score(y_test, rf_model.predict(X_test))
    print(f"  OOB R-squared: {oob_r2:.4f}")
    print(f"  Test Set R-squared: {test_r2:.4f}")

    # Array form so positional fancy-indexing works for lists as well as pandas Index.
    feature_names = np.asarray(X_cols)

    # Pieces shared by both plot titles and file names.
    clean_dep_var = dep_var_name.replace("(", "").replace(")", "").replace("’", "").replace(" ", "_")
    clean_model_name = model_name_display.replace(" ","_").replace("(","").replace(")","").replace("+","plus")

    # Feature Importance Assessment
    # 1. Permutation Importance (on test set)
    print("  Calculating Permutation Importance...")
    perm_importance = permutation_importance(
        rf_model, X_test, y_test, n_repeats=10, random_state=random_state, n_jobs=-1
    )
    perm_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std
    }).sort_values(by='importance_mean', ascending=False)
    print("\n  Permutation Importances (Test Set):")
    print(perm_importance_df)

    # Plot Permutation Importance (ascending order puts the largest bar on top)
    perm_order = perm_importance.importances_mean.argsort()
    plot_filename_perm = f"rf_perm_importance_{clean_dep_var}_{scenario_label}_{clean_model_name}.pdf"
    _save_importance_barh(
        feature_names[perm_order],
        perm_importance.importances_mean[perm_order],
        xlabel="Permutation Importance (Mean Decrease in R-squared)",
        title=f'RF Permutation Importance: {clean_dep_var}\n{scenario_label} - {clean_model_name}',
        filename=plot_filename_perm,
        color='skyblue',
        xerr=perm_importance.importances_std[perm_order],
    )
    print(f"  Saved Permutation Importance plot to: {plot_filename_perm}")

    # 2. Mean Decrease in Impurity (MDI; impurity-based importances of the fitted forest)
    mdi_importance = rf_model.feature_importances_
    mdi_importance_df = pd.DataFrame({
        'feature': feature_names,
        'mdi_importance': mdi_importance
    }).sort_values(by='mdi_importance', ascending=False)
    print("\n  Mean Decrease in Impurity (MDI) Importances:")
    print(mdi_importance_df)

    # Plot MDI Importance (same ascending ordering as above)
    mdi_order = np.argsort(mdi_importance)
    plot_filename_mdi = f"rf_mdi_importance_{clean_dep_var}_{scenario_label}_{clean_model_name}.pdf"
    _save_importance_barh(
        feature_names[mdi_order],
        mdi_importance[mdi_order],
        xlabel="Mean Decrease in Impurity (MDI)",
        title=f'RF MDI Feature Importance: {clean_dep_var}\n{scenario_label} - {clean_model_name}',
        filename=plot_filename_mdi,
        color='lightcoral',
    )
    print(f"  Saved MDI Importance plot to: {plot_filename_mdi}")

    return rf_model, perm_importance_df, mdi_importance_df

# --- 5. Define Model Specifications for RF (Full Models Only) ---
# Each scenario uses the full predictor set: intangibles + matching imputation
# dummy + firm controls + macro controls. The two scenarios differ only in
# whether the intangibles (and the dummy) are contemporaneous or lagged.

# Scenario RF1c (Full Contemporaneous Model)
s1_rf_model_c_preds = [
    *contemporaneous_intangibles,
    *dummy_controls_contemp,
    *firm_controls,
    *macro_controls,
]

# Scenario RF2c (Full Lagged Model)
s2_rf_model_c_preds = [
    *lagged_intangibles,
    *dummy_controls_lagged,
    *firm_controls,
    *macro_controls,
]

# scenario key -> model key -> specification
rf_scenarios = {
    "Scenario_RF1c_Contemp_Full": {
        "C_Full": {
            "label": "RF Model C (Contemporaneous Full)",
            "predictors": s1_rf_model_c_preds,
            "intangible_type": "Contemporaneous",
        },
    },
    "Scenario_RF2c_Lagged_Full": {
        "C_Full": {
            "label": "RF Model C (Lagged Full)",
            "predictors": s2_rf_model_c_preds,
            "intangible_type": "Lagged",
        },
    },
}

# --- 6. Execute Random Forest Analysis ---
# Result layout: dependent variable -> scenario key -> model label ->
# {"model", "permutation_importance", "mdi_importance"}.
all_rf_results = {}

for dv in dependent_vars_list:
    dv_results = {}
    all_rf_results[dv] = dv_results

    # Banner announcing the dependent variable.
    print(f"\n\n{'='*80}")
    print(f" RANDOM FOREST ANALYSIS FOR DEPENDENT VARIABLE: {dv} ".center(80, "="))
    print(f"{'='*80}")

    for scenario_key_rf, models_rf in rf_scenarios.items():
        scenario_results = {}
        dv_results[scenario_key_rf] = scenario_results
        print(f"\n--- {scenario_key_rf.replace('_', ' ')} ---")

        # Each scenario currently holds a single 'C_Full' specification.
        for model_key_rf, spec_rf in models_rf.items():
            model_label_rf = spec_rf["label"]
            predictors_rf = spec_rf["predictors"]

            y_prepared_rf, X_prepared_rf, X_col_names_rf = prepare_data_for_ml(df, dv, predictors_rf)

            # Nothing usable came back from data preparation: report and move on.
            if y_prepared_rf is None or X_prepared_rf is None or X_prepared_rf.empty:
                print(f"Skipping RF for {model_label_rf} (DV: {dv}) due to data preparation issues.")
                continue

            fitted_rf, perm_df, mdi_df = run_rf_feature_importance(
                y_prepared_rf,
                X_prepared_rf,
                X_col_names_rf,
                model_label_rf,
                dv,
                scenario_key_rf,
            )
            # Stored even when fitting was skipped (all three values are None then).
            scenario_results[model_label_rf] = {
                "model": fitted_rf,
                "permutation_importance": perm_df,
                "mdi_importance": mdi_df,
            }

print("\n\n" + "="*30 + " END OF RANDOM FOREST ANALYSIS " + "="*30)