In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [2]:
# One CSV per group; the file names indicate Female/Male crossed with
# Older/Younger.
datasets = [
    "Grouping_Datasets/Female_Older.csv",
    "Grouping_Datasets/Female_Younger.csv",
    "Grouping_Datasets/Male_Older.csv",
    "Grouping_Datasets/Male_Younger.csv",
]

# Empty results table: one row per (dataset, significant variable) once filled.
all_significant_predictors = pd.DataFrame(
    columns=["Dataset", "Variable", "P-Value", "Coefficient"]
)

In [3]:
# Fit one OLS model per dataset (target: "phq_sum", predictors: every other
# column) on a 70% training split, and collect the terms whose p-value is
# below 0.05 together with their rounded p-values and coefficients.
#
# The per-dataset tables are gathered in a list and concatenated once after
# the loop. Growing `all_significant_predictors` with pd.concat inside the loop
# made this cell non-idempotent (re-running it appended the same rows again)
# and relied on pandas ignoring the empty seed frame when inferring dtypes,
# which is deprecated behaviour in pandas 2.x.
significant_frames = []

for dataset in datasets:
    data = pd.read_csv(dataset, index_col=0)

    X = data.drop(columns=["phq_sum"])
    y = data["phq_sum"]

    # Fixed seed so the same rows land in the training split on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Perform OLS for p-value calculation
    X_train_sm = sm.add_constant(X_train)  # Add constant term for intercept
    # Keep only the row labels present in both the target and the design matrix.
    y_train_aligned, X_train_sm_aligned = y_train.align(
        X_train_sm, join="inner", axis=0
    )
    sm_model = sm.OLS(y_train_aligned, X_train_sm_aligned).fit()

    # Filter significant predictors (p-value < 0.05).
    # NOTE: the intercept column added by add_constant is filtered like any
    # other term, so it appears in the results whenever it is significant.
    significant_vars = sm_model.pvalues[sm_model.pvalues < 0.05].index
    if len(significant_vars) > 0:
        significant_data = pd.DataFrame({
            "Dataset": [dataset] * len(significant_vars),
            "Variable": significant_vars,
            "P-Value": sm_model.pvalues[significant_vars].round(4).values,
            "Coefficient": sm_model.params[significant_vars].round(4).values
        })
        significant_frames.append(significant_data)

# Build the final table exactly once; fall back to an empty, correctly
# labelled frame when no dataset produced a significant term.
if significant_frames:
    all_significant_predictors = pd.concat(significant_frames, ignore_index=True)
else:
    all_significant_predictors = pd.DataFrame(
        columns=["Dataset", "Variable", "P-Value", "Coefficient"]
    )


In [4]:
# Attach a readable meaning to every significant variable, then report the
# five largest-magnitude coefficients for each dataset.
dictionary = pd.read_csv("dictionary.csv")

# The left join keeps every significant predictor even when the dictionary has
# no matching var_id; var_id itself is only the join key and is dropped.
merged_data = (
    all_significant_predictors
    .merge(
        dictionary[['var_id', 'var_english']],
        how='left',
        left_on='Variable',
        right_on='var_id',
    )
    .rename(columns={"var_english": "Meaning"})
    .drop(columns=["var_id"])
)

# Strip the directory prefix so the table shows bare file names.
merged_data["Dataset"] = merged_data["Dataset"].str.replace("Grouping_Datasets/", "", regex=False)

# For each dataset, take the row labels of the 5 coefficients with the largest
# absolute value, then flatten the per-group label lists into one Series.
top_row_labels = (
    merged_data.groupby("Dataset")["Coefficient"]
    .apply(lambda coefs: coefs.abs().nlargest(5).index)
    .explode()
)
top_predictors = merged_data.loc[top_row_labels]

# Print the table
print("Top 5 Significant Predictors P-Value < 0.05 (Highest Absolute Coefficients):")
print(top_predictors.to_string(index=False))

Top 5 Significant Predictors P-Value < 0.05 (Highest Absolute Coefficients):
           Dataset     Variable  P-Value  Coefficient                          Meaning
  Female_Older.csv healthcare04   0.0000       1.9621 healthcare_mentalhealth_provider
  Female_Older.csv       func05   0.0000       1.6227           difficulty_remembering
  Female_Older.csv       func06   0.0117       0.9795              difficulty_selfcare
  Female_Older.csv       func04   0.0045       0.9139         difficulty_communicating
  Female_Older.csv        com01   0.0000       0.8712                    health_status
Female_Younger.csv       func05   0.0000       2.0487           difficulty_remembering
Female_Younger.csv healthcare04   0.0000       1.6457 healthcare_mentalhealth_provider
Female_Younger.csv       func02   0.0001       1.4131               difficulty_hearing
Female_Younger.csv       func04   0.0313       0.8456         difficulty_communicating
Female_Younger.csv      habit08   0.0046       0.8020