<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/DataLasso.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Mount Google Drive
# Mounts the user's Google Drive at /content/drive; the dataset path read later
# in this notebook lives under /content/drive/MyDrive.
# NOTE: the google.colab module is supplied by the Colab runtime, so this cell
# is expected to fail with ImportError when run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Lasso Monolith

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import RepeatedKFold, train_test_split # train_test_split for bootstrap robustness
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# --- LaTeX Style Plotting Setup ---
# Start from the seaborn whitegrid theme, then layer the thesis-specific
# overrides on top in a single rcParams update.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.family': 'serif',
    # To pin concrete serif fonts, add an entry such as:
    # 'font.serif': ['DejaVu Serif', 'Bitstream Vera Serif', 'Liberation Serif', 'Times New Roman'],
    'axes.labelsize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.titlesize': 12,
    'axes.titlesize': 10,
    'figure.figsize': (7, 5),  # Slightly wider for some plots
    'lines.linewidth': 1.5,
    'savefig.dpi': 300,
    'savefig.format': 'pdf',
})

# --- 1. Load Data ---
# Reads the preprocessed, ';'-separated panel. If the file is missing, a
# synthetic panel is generated instead so the rest of the pipeline can be
# smoke-tested end to end.
file_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_transformed.csv' # Ensure this is your preprocessed file
try:
    df_original = pd.read_csv(file_path, sep=';')
except FileNotFoundError:
    print("ERROR: 'Final_Dataset_transformed.csv' not found. Please upload it or check the path.")
    # Dummy dataframe to test lasso: 10 firms x 14 years.
    # The column names MUST match the variable lists defined in section 2
    # (dependent variables, intangibles, firm and macro controls); otherwise
    # the lag construction fails with a KeyError.
    data_size = 140
    df_original = pd.DataFrame({
        'company': [f'Comp{i % 10}' for i in range(data_size)],
        'year': [2008 + i // 10 for i in range(data_size)],
        'ln(Tobin’s Q)': np.random.rand(data_size) * 2 + 0.5,
        'ln(PE)': np.random.rand(data_size) * 30 + 5,
        'ln(TSR)': np.random.rand(data_size) * 0.5 - 0.1,
        'brand_value': np.random.rand(data_size) * 1000,
        'patent_claims': np.random.randint(0, 500, size=data_size),
        'employee_smoothed_rating': np.random.rand(data_size) * 2 + 3,
        'R&D_Intensity': np.random.rand(data_size) * 0.15,
        'SG&A_Intensity': np.random.rand(data_size) * 0.25,
        'YJ(Sentiment_PCR)': np.random.randn(data_size),
        'is_imputed': np.random.choice([0, 1], size=data_size, p=[0.8, 0.2]),
        'ln_totalAssets-1': np.random.rand(data_size) * 5 + 10,
        'ROA-1': np.random.rand(data_size) * 0.2,
        'ln(Financial Leverage-1)': np.random.rand(data_size) * 1 + 0.5,
        'delta_ln_S5INFT-1': np.random.randn(data_size) * 0.1,
        'delta_ln_GDPWorld-1': np.random.randn(data_size) * 0.02,
    })
    print("Using DUMMY DATA for testing.")


# Index the panel by (company, year); `df` is the working copy that later
# steps extend with lagged columns, while `df_original` stays untouched.
df_original['year'] = df_original['year'].astype(int)
df_original.set_index(['company', 'year'], inplace=True)
df = df_original.copy()

# --- 2. Define Variable Groups (ensure these match your DataFrame column names) ---
dependent_vars_list = ['ln(Tobin’s Q)', 'ln(PE)', 'ln(TSR)']
contemporaneous_intangibles = [
    'brand_value', 'patent_claims', 'employee_smoothed_rating',
    'R&D_Intensity', 'SG&A_Intensity', 'YJ(Sentiment_PCR)'
]
# Firm-level controls, with 'ln_totalAssets-1' as the only size control.
# NOTE(review): the '-1' suffix presumably marks values already lagged by one
# period in the input file -- confirm against the dataset.
firm_controls = [
    'ln_totalAssets-1', 'ROA-1', 'ln(Financial Leverage-1)'
]
macro_controls = [
    'delta_ln_S5INFT-1', 'delta_ln_GDPWorld-1'
]
dummy_controls_contemp = ["is_imputed"]

# Create lagged intangibles and lagged dummy.
# shift(1) lags by ROW POSITION within each company, so the rows must be in
# chronological order per firm. Sort the (company, year) index first.
# NOTE(review): a gap in a firm's years would still make shift(1) reach back
# more than one year -- verify the panel has no missing firm-years.
df.sort_index(inplace=True)
lagged_intangibles = [f'{col}_lag1' for col in contemporaneous_intangibles]
for col, lagged_col_name in zip(contemporaneous_intangibles, lagged_intangibles):
    df[lagged_col_name] = df.groupby(level='company')[col].shift(1)
df['is_imputed_lag1'] = df.groupby(level='company')['is_imputed'].shift(1)
dummy_controls_lagged = ["is_imputed_lag1"]

# --- 3. Data Preparation Function for Lasso ---
def prepare_data_for_lasso(dataf, dep_var_name, predictor_vars_list):
    """Build the within-transformed, standardized estimation sample for Lasso.

    Steps, in order:
      1. select the dependent variable and the predictors,
      2. drop every row with a missing value in any of them,
      3. demean each variable within company (within-transformation),
      4. standardize the demeaned predictors with ``StandardScaler``.

    Args:
        dataf: DataFrame whose index has a level named ``'company'``.
        dep_var_name: Column name of the dependent variable.
        predictor_vars_list: List of predictor column names.

    Returns:
        ``(y, X)``: the demeaned dependent variable as a Series and the
        demeaned, standardized predictors as a DataFrame sharing ``y``'s
        index. ``(None, None)`` if a column is missing, no complete row
        remains, or there are no predictors.
    """
    cols_to_use = [dep_var_name, *predictor_vars_list]

    # Check for missing columns before proceeding
    missing_cols_in_df = [col for col in cols_to_use if col not in dataf.columns]
    if missing_cols_in_df:
        print(f"ERROR in prepare_data_for_lasso: Columns {missing_cols_in_df} not found in dataframe.")
        return None, None

    # Listwise deletion has to come BEFORE demeaning. Otherwise each column's
    # firm mean is computed on a different set of rows, and after the rows
    # with NaNs are removed the retained observations are no longer centered
    # within firm.
    panel_data = dataf[cols_to_use].dropna()
    if panel_data.empty:
        print(f"Warning: Dataframe empty after demeaning and dropna for DV: {dep_var_name} with predictors: {predictor_vars_list}")
        return None, None

    # Within-transformation: subtract each firm's mean over the estimation sample.
    firm_means = panel_data.groupby(level='company').transform('mean')
    demeaned_data = panel_data - firm_means

    y_final = demeaned_data[dep_var_name]
    X_final_demeaned = demeaned_data[predictor_vars_list]

    if X_final_demeaned.empty:
        print(f"Warning: X_final_demeaned is empty for DV: {dep_var_name}. Cannot standardize.")
        return None, None

    # Standardize the demeaned predictors so the L1 penalty treats all
    # variables on a common scale.
    scaler = StandardScaler()
    X_final_standardized = scaler.fit_transform(X_final_demeaned)
    X_final_standardized_df = pd.DataFrame(
        X_final_standardized,
        columns=X_final_demeaned.columns,
        index=X_final_demeaned.index,
    )
    return y_final, X_final_standardized_df

# --- 4. Lasso CV and Path Plotting Function ---
def run_lasso_cv_and_plot_paths(y, X, model_name_display, dep_var_name, scenario_label, n_alphas=100, eps=1e-3):
    """Pick the Lasso penalty by cross-validation, refit, and plot coefficient paths.

    Fits ``LassoCV`` with repeated 5-fold CV, refits a plain ``Lasso`` at the
    selected alpha, and prints the non-zero coefficients. When
    ``model_name_display`` contains ``"Model C"``, the coefficient paths are
    also plotted, saved as a PDF in the working directory, and shown.

    Args:
        y: Dependent variable (Series), or None.
        X: Predictor DataFrame, or None.
        model_name_display: Human-readable model label; also used in the
            plot title and file name.
        dep_var_name: Name of the dependent variable (for messages/file name).
        scenario_label: Scenario identifier (for plot title/file name).
        n_alphas: Number of alphas on the regularization grid.
        eps: Ratio ``alpha_min / alpha_max`` of the grid.

    Returns:
        ``(final_lasso, optimal_alpha)``, or ``(None, None)`` if the inputs
        are missing/empty or the CV fit raises.
    """
    if y is None or X is None or X.empty or y.empty:
        print(f"Skipping {model_name_display} for {dep_var_name} due to insufficient data.")
        return None, None

    print(f"\n--- Fitting Lasso for: {model_name_display} (DV: {dep_var_name}) ---")

    # Using RepeatedKFold for more stable lambda selection
    cv_method = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

    lasso_cv = LassoCV(alphas=None, cv=cv_method, random_state=42, n_alphas=n_alphas, eps=eps, max_iter=10000, tol=1e-3)
    try:
        lasso_cv.fit(X, y)
    except Exception as e:
        # Deliberately best-effort: one failing specification must not abort
        # the whole batch of models.
        print(f"  ERROR fitting LassoCV for {model_name_display}: {e}")
        return None, None

    optimal_alpha = lasso_cv.alpha_
    print(f"  Optimal alpha (lambda_min): {optimal_alpha:.6f}")

    # Refit with optimal alpha on the full (prepared) data
    final_lasso = Lasso(alpha=optimal_alpha, random_state=42, max_iter=10000, tol=1e-3)
    final_lasso.fit(X, y)

    coefficients = pd.Series(final_lasso.coef_, index=X.columns)
    selected_features = coefficients[coefficients != 0]
    print("  Selected Features and Coefficients:")
    if selected_features.empty:
        print("  No features selected.")
    else:
        print(selected_features)

    # Only plot paths for full models
    if "Model C" not in model_name_display:
        return final_lasso, optimal_alpha

    fig = plt.figure(figsize=(8, 6))
    alphas_lasso, coefs_lasso, _ = lasso_cv.path(X, y, alphas=None, n_alphas=n_alphas, eps=eps, tol=1e-3) # Get paths

    # Guarantee a (n_features, n_alphas) layout even for a 1-D result.
    coefs_lasso = np.atleast_2d(coefs_lasso)
    log_alphas = np.log10(alphas_lasso)
    show_legend = coefs_lasso.shape[0] < 15  # legend only if not too many features

    for feature_name, coef_path in zip(X.columns, coefs_lasso):
        plt.plot(log_alphas, coef_path, label=feature_name if show_legend else None)

    # Raw f-string: '\l' is not a valid escape sequence in a normal string
    # literal and triggers a SyntaxWarning on recent Python versions.
    plt.axvline(np.log10(optimal_alpha), linestyle='--', color='k', label=rf'Optimal $\lambda$ (log10={np.log10(optimal_alpha):.2f})')
    plt.xlabel(r'$\log_{10}(\lambda)$')
    plt.ylabel('Coefficients')
    clean_dep_var = dep_var_name.replace("(", "").replace(")", "").replace("’", "").replace(" ", "_")
    clean_model_name = model_name_display.replace(" ","_").replace("(","").replace(")","").replace("+","plus")
    plt.title(f'Lasso Coefficient Paths: {clean_dep_var}\n{scenario_label} - {clean_model_name}')
    if show_legend:
        plt.legend(loc='upper right', bbox_to_anchor=(1.25, 1))
    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout to make space for legend

    plot_filename = f"lasso_paths_{clean_dep_var}_{scenario_label}_{clean_model_name}.pdf"
    plt.savefig(plot_filename)
    plt.show()
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures
    print(f"  Saved Lasso Paths plot to: {plot_filename}")

    return final_lasso, optimal_alpha

# --- 5. Bootstrap Lasso Function ---
def bootstrap_lasso_feature_selection(y, X, n_bootstrap=100, alpha_cv=5, n_alphas=100, eps=1e-3, random_state=None):
    """Estimate Lasso selection stability with a firm-level block bootstrap.

    Each replication resamples whole companies with replacement, selects
    alpha with ``LassoCV`` on the resampled panel, refits ``Lasso`` at that
    alpha, and records which coefficients are non-zero.

    Args:
        y: Dependent variable (Series), or None.
        X: Predictor DataFrame whose index has a level named ``'company'``,
            or None.
        n_bootstrap: Number of bootstrap replications.
        alpha_cv: Number of CV folds used inside each replication.
        n_alphas: Number of alphas on the regularization grid.
        eps: Ratio ``alpha_min / alpha_max`` of the grid.
        random_state: Optional integer seed for the firm resampling. With the
            default ``None`` the global NumPy random state is used.

    Returns:
        DataFrame with columns ``feature``, ``selection_frequency`` and
        ``average_coefficient`` (mean over all successful replications,
        zeros included), or ``None`` when the input is missing/empty.
        Frequencies are NaN if no replication succeeded.
    """
    if y is None or X is None or X.empty or y.empty:
        print("Skipping bootstrap due to insufficient data.")
        # Single None (not a tuple) so callers can test `is not None`.
        return None

    # Rows are drawn by position below, so y must be ordered like X.
    if not y.index.equals(X.index):
        y = y.loc[X.index]

    n_features = X.shape[1]
    selected_features_freq = np.zeros(n_features)
    coef_sum = np.zeros(n_features)
    n_successful = 0

    # Row positions of every firm, computed once instead of per replication.
    firm_codes, unique_firms = pd.factorize(X.index.get_level_values('company'))
    n_firms = len(unique_firms)
    firm_positions = [np.flatnonzero(firm_codes == code) for code in range(n_firms)]

    # Seeded generator when requested; otherwise keep using the global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    print(f"\n--- Running Bootstrap Lasso ({n_bootstrap} replications) ---")

    # max(1, ...) avoids a zero modulus when n_bootstrap < 10.
    progress_step = max(1, n_bootstrap // 10)

    for i in range(n_bootstrap):
        if (i + 1) % progress_step == 0:
            print(f"  Bootstrap iteration: {i+1}/{n_bootstrap}")

        # Sample firms with replacement (block bootstrap) and stack their rows.
        sampled_firms = rng.choice(n_firms, size=n_firms, replace=True)
        row_positions = np.concatenate([firm_positions[code] for code in sampled_firms])

        X_boot = X.iloc[row_positions]
        y_boot = y.iloc[row_positions]

        if len(X_boot) < 10: # Basic check
            print(f"    Skipping bootstrap iter {i+1} due to empty/small data after resampling.")
            continue

        # LassoCV on bootstrap sample
        # NOTE(review): a firm drawn more than once contributes duplicated rows
        # that can land in both the training and validation folds -- consider
        # a group-aware splitter if that matters for alpha selection.
        cv_method_boot = RepeatedKFold(n_splits=alpha_cv, n_repeats=1, random_state=i) # Less repeats for speed
        lasso_cv_boot = LassoCV(alphas=None, cv=cv_method_boot, random_state=i, n_alphas=n_alphas, eps=eps, max_iter=5000, tol=1e-3) # Fewer max_iter
        try:
            lasso_cv_boot.fit(X_boot, y_boot)
            final_lasso_boot = Lasso(alpha=lasso_cv_boot.alpha_, random_state=i, max_iter=5000, tol=1e-3)
            final_lasso_boot.fit(X_boot, y_boot)
        except Exception as e:
            # Best-effort: a failed replication is reported and left out.
            print(f"    Error in bootstrap iteration {i+1}: {e}")
            continue

        selected_features_freq += final_lasso_boot.coef_ != 0
        coef_sum += final_lasso_boot.coef_
        n_successful += 1

    # Normalize by the replications that actually produced a fit; dividing by
    # n_bootstrap would bias frequencies downward when iterations are skipped.
    if n_successful == 0:
        print("  Warning: no bootstrap replication succeeded; frequencies are undefined (NaN).")
        denominator = np.nan
    else:
        denominator = n_successful

    results = pd.DataFrame({
        'feature': X.columns,
        'selection_frequency': selected_features_freq / denominator,
        # Average over ALL successful bootstraps, even if the coefficient was zero.
        'average_coefficient': coef_sum / denominator,
    })

    print("\n--- Bootstrap Lasso Results ---")
    print(results.sort_values(by='selection_frequency', ascending=False))
    return results

# --- 6. Define Model Specifications for Lasso (analogous to OLS) ---
# Each scenario nests three specifications: A (intangibles + imputation dummy),
# B (A + firm-level controls) and C (B + macro controls).

# Scenario 1 (Lasso): Contemporaneous Effects
s1_model_a_preds = [*contemporaneous_intangibles, *dummy_controls_contemp]
s1_model_b_preds = [*s1_model_a_preds, *firm_controls]
s1_model_c_preds = [*s1_model_b_preds, *macro_controls]

# Scenario 2 (Lasso): Lagged Effects
s2_model_a_preds = [*lagged_intangibles, *dummy_controls_lagged]
s2_model_b_preds = [*s2_model_a_preds, *firm_controls]
s2_model_c_preds = [*s2_model_b_preds, *macro_controls]

lasso_scenarios = {
    "Scenario1_Contemp": {
        "A": dict(label="Lasso Model A (Intangibles Only)", predictors=s1_model_a_preds, intangible_type="Contemporaneous"),
        "B": dict(label="Lasso Model B (+ Firm-Level Controls)", predictors=s1_model_b_preds, intangible_type="Contemporaneous"),
        "C": dict(label="Lasso Model C (Full Controls)", predictors=s1_model_c_preds, intangible_type="Contemporaneous"),
    },
    "Scenario2_Lagged": {
        "A": dict(label="Lasso Model A (Lagged Intangibles Only)", predictors=s2_model_a_preds, intangible_type="Lagged"),
        "B": dict(label="Lasso Model B (+ Firm-Level Controls)", predictors=s2_model_b_preds, intangible_type="Lagged"),
        "C": dict(label="Lasso Model C (Full Controls)", predictors=s2_model_c_preds, intangible_type="Lagged"),
    },
}

# --- 7. Execute Lasso Analysis ---
# For every dependent variable x scenario x model: prepare the data, fit the
# cross-validated Lasso and store the result. For the full model ("C") a
# bootstrap stability analysis is run as well and its selection frequencies
# are plotted and saved as a PDF.
all_lasso_results = {}
all_bootstrap_lasso_results = {}

for dv in dependent_vars_list:
    all_lasso_results[dv] = {}
    all_bootstrap_lasso_results[dv] = {}
    print(f"\n\n{'='*80}")
    print(f" LASSO ANALYSIS FOR DEPENDENT VARIABLE: {dv} ".center(80, "="))
    print(f"{'='*80}")

    for scenario_key, models in lasso_scenarios.items():
        all_lasso_results[dv][scenario_key] = {}
        all_bootstrap_lasso_results[dv][scenario_key] = {}
        print(f"\n--- {scenario_key.replace('_', ' ')} ---")

        for model_key, spec in models.items():
            model_label = spec["label"]
            predictors = spec["predictors"]

            # Prepare data for this specific model
            y_prepared, X_prepared = prepare_data_for_lasso(df, dv, predictors)

            if y_prepared is None or X_prepared is None or X_prepared.empty:
                print(f"Skipping Lasso for {model_label} (DV: {dv}) due to data preparation issues.")
                continue

            fitted_lasso, optimal_lambda = run_lasso_cv_and_plot_paths(y_prepared, X_prepared, model_label, dv, scenario_key)
            all_lasso_results[dv][scenario_key][model_label] = {
                "model": fitted_lasso,
                "optimal_lambda": optimal_lambda,
                "coefficients": pd.Series(fitted_lasso.coef_, index=X_prepared.columns) if fitted_lasso is not None else None
            }

            # Run Bootstrap Lasso for Model C (Full Controls) only
            if model_key != "C":
                continue

            bootstrap_results = bootstrap_lasso_feature_selection(y_prepared, X_prepared, n_bootstrap=100) # 100 reps for speed, increase for paper
            # The bootstrap helper may signal "no result" with something other
            # than a DataFrame (e.g. a tuple of Nones); a plain `is not None`
            # test would let that through and crash on column access below.
            if not isinstance(bootstrap_results, pd.DataFrame):
                bootstrap_results = None
            all_bootstrap_lasso_results[dv][scenario_key][model_label] = bootstrap_results

            if bootstrap_results is None:
                continue

            # Plot Bootstrap Lasso Selection Frequencies
            fig = plt.figure(figsize=(10, 0.3 * len(bootstrap_results['feature']))) # Dynamic height
            bootstrap_results_sorted = bootstrap_results.sort_values('selection_frequency', ascending=True)
            plt.barh(bootstrap_results_sorted['feature'], bootstrap_results_sorted['selection_frequency'], color='skyblue')
            plt.xlabel("Selection Frequency")
            plt.ylabel("Predictor")
            clean_dep_var = dv.replace("(", "").replace(")", "").replace("’", "").replace(" ", "_")
            clean_model_name = model_label.replace(" ","_").replace("(","").replace(")","").replace("+","plus")
            plt.title(f'Bootstrap Lasso: Feature Selection Frequency\n{clean_dep_var} ~ {scenario_key} - {clean_model_name}')
            plt.axvline(0.8, color='red', linestyle='--', label='80% Threshold') # Your >80% threshold
            plt.legend()
            plt.tight_layout()
            plot_filename_boot = f"bootstrap_lasso_freq_{clean_dep_var}_{scenario_key}_{clean_model_name}.pdf"
            plt.savefig(plot_filename_boot)
            plt.show()
            plt.close(fig)  # release the figure; the loop would otherwise pile up open figures
            print(f"  Saved Bootstrap Lasso Frequency plot to: {plot_filename_boot}")

print("\n\n" + "="*30 + " END OF LASSO ANALYSIS " + "="*30)