In [33]:
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as smf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def pre_process(df, keep_cols=None):
    """Standardize numeric columns and one-hot encode all other columns.

    Rows containing a missing value in any of the kept columns are dropped
    before the transformers are fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    keep_cols : sequence of column labels, optional
        Columns to keep. When omitted, every column of ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The transformed data, indexed like the retained rows. Numeric
        columns come first under their original names (passed through
        ``StandardScaler``), followed by the encoded columns named by the
        fitted ``OneHotEncoder`` (configured with ``drop='first'``).
    """
    columns = df.columns if keep_cols is None else keep_cols
    complete_rows = df[columns].dropna().copy()

    # Split the kept columns by dtype: numbers are scaled, the rest encoded.
    num_names = complete_rows.select_dtypes(include=['number']).columns.tolist()
    cat_names = complete_rows.select_dtypes(exclude=['number']).columns.tolist()

    scaling_step = ('numeric', StandardScaler(), num_names)
    encoding_step = (
        'categorical',
        OneHotEncoder(drop='first', sparse_output=False),
        cat_names,
    )
    transformer = ColumnTransformer(transformers=[scaling_step, encoding_step])
    values = transformer.fit_transform(complete_rows)

    # Output columns follow the transformer order: numeric block, then encoded block.
    out_names = list(num_names)
    if cat_names:
        fitted_encoder = transformer.named_transformers_['categorical']
        out_names.extend(fitted_encoder.get_feature_names_out(cat_names))

    return pd.DataFrame(values, columns=out_names, index=complete_rows.index)

In [None]:
# ENGLISH
# ----------------------------------------------------------------------------------------------------
# Fits a baseline OLS model of log_rt, then, for each covariate, an expanded
# model (baseline + covariate) and reports a likelihood-ratio test, AIC and BIC.

# load df
df_EN = pd.read_csv('./data/regression_data_EN.csv')

# define baselines and covariates studied
BASELINE_EN = '1 + Length + LgSUBTLWF + OLD + LgSUBTLWF:Length + LgSUBTLWF:OLD'
BASE_COLS_EN = ['log_rt', 'Length', 'LgSUBTLWF', 'OLD']  # columns the baseline formula needs
COVARIATES = ['n_communities', 'entropy', 'max_dom', 'dissimilarity', 'ambiguity_binary', 'ambiguity']

# baseline
baseline_formula = 'log_rt ~ ' + BASELINE_EN
preprocessed_df = pre_process(df_EN, BASE_COLS_EN)
baseline_model = smf.ols(baseline_formula, data=preprocessed_df).fit()

report = f"""
--------------------------------------------------------------------
Covariate: None (Baseline model)
Likelihood Ratio Test Statistic (p-value)': N/A
AIC: {baseline_model.aic:.0f}
BIC: {baseline_model.bic:.0f}
--------------------------------------------------------------------
"""
print(report)
print(baseline_model.summary())
print('\n\n')

for covariate in COVARIATES:
    # pre_process drops rows with a missing value in ANY kept column and
    # re-standardizes, so this frame can differ from the baseline frame above.
    preprocessed_df = pre_process(df_EN, BASE_COLS_EN + [covariate])

    # restricted: baseline refit on exactly the data the expanded model sees.
    # A likelihood-ratio test is only valid when both nested models are fit
    # to the same observations; comparing against `baseline_model` would mix
    # log-likelihoods from different samples whenever the covariate has NaNs.
    nested_baseline_model = smf.ols(baseline_formula, data=preprocessed_df).fit()

    # expanded: baseline + covariate
    expanded_formula = 'log_rt ~ ' + BASELINE_EN + ' + ' + covariate
    expanded_model = smf.ols(expanded_formula, data=preprocessed_df).fit()

    lr_stat = 2 * (expanded_model.llf - nested_baseline_model.llf)
    # number of extra parameters in the expanded model (1 for a single numeric covariate)
    dof = expanded_model.df_model - nested_baseline_model.df_model
    # survival function: same value as 1 - cdf, without rounding tiny p-values to 0
    p_value = stats.chi2.sf(lr_stat, dof)

    report = f"""
--------------------------------------------------------------------
Covariate: {covariate}
Likelihood Ratio Test Statistic (p-value)': {lr_stat:.1f} ({p_value:.3f})
AIC: {expanded_model.aic:.0f}
BIC: {expanded_model.bic:.0f}
--------------------------------------------------------------------
        """
    print(report)
    print(expanded_model.summary())
    print('\n\n')



--------------------------------------------------------------------
Covariate: None (Baseline model)
Likelihood Ratio Test Statistic (p-value)': N/A
AIC: 15353
BIC: 15394
--------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:                 log_rt   R-squared:                       0.444
Model:                            OLS   Adj. R-squared:                  0.444
Method:                 Least Squares   F-statistic:                     1087.
Date:                Mon, 28 Jul 2025   Prob (F-statistic):               0.00
Time:                        12:18:25   Log-Likelihood:                -7670.4
No. Observations:                6815   AIC:                         1.535e+04
Df Residuals:                    6809   BIC:                         1.539e+04
Df Model:                           5                                         
Covariance Type:            nonrobust         

In [None]:
# RIOPLATENSE SPANISH
# ----------------------------------------------------------------------------------------------------
# Fits a baseline OLS model of log_rt, then, for each covariate, an expanded
# model (baseline + covariate) and reports a likelihood-ratio test, AIC and BIC.

# load df
df_RP = pd.read_csv('./data/regression_data_RP.csv')

# define baselines and covariates studied
BASELINE_RP = '1 + log_frq + num_letters + Lev_N + log_frq:num_letters + log_frq:Lev_N'
BASE_COLS_RP = ['log_rt', 'log_frq', 'num_letters', 'Lev_N']  # columns the baseline formula needs
COVARIATES = ['n_communities', 'entropy', 'max_dom', 'dissimilarity', 'ambiguity_binary', 'ambiguity']

# baseline
baseline_formula = 'log_rt ~ ' + BASELINE_RP
preprocessed_df = pre_process(df_RP, BASE_COLS_RP)
baseline_model = smf.ols(baseline_formula, data=preprocessed_df).fit()

report = f"""
--------------------------------------------------------------------
Covariate: None (Baseline model)
Likelihood Ratio Test Statistic (p-value)': N/A
AIC: {baseline_model.aic:.0f}
BIC: {baseline_model.bic:.0f}
--------------------------------------------------------------------
"""
print(report)
print(baseline_model.summary())
print('\n\n')

for covariate in COVARIATES:
    # pre_process drops rows with a missing value in ANY kept column and
    # re-standardizes, so this frame can differ from the baseline frame above.
    preprocessed_df = pre_process(df_RP, BASE_COLS_RP + [covariate])

    # restricted: baseline refit on exactly the data the expanded model sees.
    # A likelihood-ratio test is only valid when both nested models are fit
    # to the same observations; comparing against `baseline_model` would mix
    # log-likelihoods from different samples whenever the covariate has NaNs.
    nested_baseline_model = smf.ols(baseline_formula, data=preprocessed_df).fit()

    # expanded: baseline + covariate
    expanded_formula = 'log_rt ~ ' + BASELINE_RP + ' + ' + covariate
    expanded_model = smf.ols(expanded_formula, data=preprocessed_df).fit()

    lr_stat = 2 * (expanded_model.llf - nested_baseline_model.llf)
    # number of extra parameters in the expanded model (1 for a single numeric covariate)
    dof = expanded_model.df_model - nested_baseline_model.df_model
    # survival function: same value as 1 - cdf, without rounding tiny p-values to 0
    p_value = stats.chi2.sf(lr_stat, dof)

    report = f"""
--------------------------------------------------------------------
Covariate: {covariate}
Likelihood Ratio Test Statistic (p-value)': {lr_stat:.1f} ({p_value:.3f})
AIC: {expanded_model.aic:.0f}
BIC: {expanded_model.bic:.0f}
--------------------------------------------------------------------
        """
    print(report)
    print(expanded_model.summary())
    print('\n\n')



--------------------------------------------------------------------
Covariate: None (Baseline model)
Likelihood Ratio Test Statistic (p-value)': N/A
AIC: 16546
BIC: 16587
--------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:                 log_rt   R-squared:                       0.417
Model:                            OLS   Adj. R-squared:                  0.417
Method:                 Least Squares   F-statistic:                     1030.
Date:                Mon, 28 Jul 2025   Prob (F-statistic):               0.00
Time:                        12:23:28   Log-Likelihood:                -8266.9
No. Observations:                7196   AIC:                         1.655e+04
Df Residuals:                    7190   BIC:                         1.659e+04
Df Model:                           5                                         
Covariance Type:            nonrobust         