In [5]:
import pandas as pd
import statsmodels.api as sm

In [3]:
df = pd.read_csv('data_Mazancieux_2018.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28960 entries, 0 to 28959
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Subj_idx       28960 non-null  int64  
 1   Stimulus       28960 non-null  int64  
 2   Response       28960 non-null  int64  
 3   Confidence     28960 non-null  int64  
 4   RT_decision    28960 non-null  float64
 5   RT_confidence  28960 non-null  float64
 6   Task           28960 non-null  object 
dtypes: float64(2), int64(4), object(1)
memory usage: 1.5+ MB


In [10]:
len(df.Subj_idx.unique())

181

## Run a linear regression model per task, predict Confidence (i) with Confidence (-i)

In [6]:
df['Confidence_lag'] = df.groupby('Task')['Confidence'].shift(1)

results = {}

# iterate on each task
for task in df['Task'].unique():

    task_df = df[df['Task'] == task].dropna(subset=['Confidence_lag'])
    
    X = task_df['Confidence_lag']
    y = task_df['Confidence']

    X = sm.add_constant(X)
    
    model = sm.OLS(y, X).fit()
    
    results[task] = model.summary()

# Show results
for task, result in results.items():
    print(f"Resultados para Task = {task}")
    print(result)
    print("\n")

Resultados para Task = VP
                            OLS Regression Results                            
Dep. Variable:             Confidence   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.263
Method:                 Least Squares   F-statistic:                     2590.
Date:                Fri, 19 Jul 2024   Prob (F-statistic):               0.00
Time:                        23:14:02   Log-Likelihood:                -17243.
No. Observations:                7239   AIC:                         3.449e+04
Df Residuals:                    7237   BIC:                         3.450e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             

### Do the same but with lag from 1 to 11 as Xs

In [12]:
lags = [1, 2, 4, 5, 6, 7,8,9,10,11]
for lag in lags:
    df[f'Confidence_lag_{lag}'] = df.groupby('Task')['Confidence'].shift(lag)

results = {}

for task in df['Task'].unique():
    task_df = df[df['Task'] == task].dropna(subset=[f'Confidence_lag_{lag}' for lag in lags])
    
    y = task_df['Confidence']
    
    X = task_df[[f'Confidence_lag_{lag}' for lag in lags]]
    
    X = sm.add_constant(X)
    
    model = sm.OLS(y, X).fit()
    
    results[task] = model.summary()

for task, result in results.items():
    print(f"Resultados para Task = {task}")
    print(result)
    print("\n")

Resultados para Task = VP
                            OLS Regression Results                            
Dep. Variable:             Confidence   R-squared:                       0.345
Model:                            OLS   Adj. R-squared:                  0.345
Method:                 Least Squares   F-statistic:                     380.9
Date:                Fri, 19 Jul 2024   Prob (F-statistic):               0.00
Time:                        23:28:05   Log-Likelihood:                -16798.
No. Observations:                7229   AIC:                         3.362e+04
Df Residuals:                    7218   BIC:                         3.369e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const       

### Do the same with the RT of confidence reports

In [14]:
lags = [1, 2, 4, 5, 6, 7,8,9,10,11]
for lag in lags:
    df[f'RT_confidence_lag_{lag}'] = df.groupby('Task')['RT_confidence'].shift(lag)

results = {}

for task in df['Task'].unique():
    task_df = df[df['Task'] == task].dropna(subset=[f'RT_confidence_lag_{lag}' for lag in lags])
    
    y = task_df['RT_confidence']
    
    X = task_df[[f'RT_confidence_lag_{lag}' for lag in lags]]
    
    X = sm.add_constant(X)
    
    model = sm.OLS(y, X).fit()
    
    results[task] = model.summary()

for task, result in results.items():
    print(f"Resultados para Task = {task}")
    print(result)
    print("\n")

Resultados para Task = VP
                            OLS Regression Results                            
Dep. Variable:          RT_confidence   R-squared:                       0.136
Model:                            OLS   Adj. R-squared:                  0.135
Method:                 Least Squares   F-statistic:                     113.5
Date:                Fri, 19 Jul 2024   Prob (F-statistic):          3.20e-220
Time:                        23:32:34   Log-Likelihood:                -11032.
No. Observations:                7229   AIC:                         2.209e+04
Df Residuals:                    7218   BIC:                         2.216e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const 

### Too much good, something could be wrong. I will try the null model 

In [16]:

df_copy = df.copy()

df_copy['Confidence_lag_1'] = df_copy.groupby('Task')['Confidence'].shift(1)

results = {}

for task in df_copy['Task'].unique():
    task_df = df_copy[df_copy['Task'] == task].dropna(subset=['Confidence_lag_1'])
    
    X_shift_1 = task_df['Confidence_lag_1']
    y = task_df['Confidence']
    
    X_shift_1 = sm.add_constant(X_shift_1)
    
    model_shift_1 = sm.OLS(y, X_shift_1).fit()
    
    X_intercept = pd.DataFrame({'const': [1] * len(y)}, index=y.index)
    model_intercept = sm.OLS(y, X_intercept).fit()
    
    results[task] = {
        'shift_1': model_shift_1.summary(),
        'intercept_only': model_intercept.summary()
    }

for task, result in results.items():
    print(f"Resultados para Task = {task} - Modelo con shift(1)")
    print(result['shift_1'])
    print(f"Resultados para Task = {task} - Modelo solo con intercepta")
    print(result['intercept_only'])
    print("\n")


Resultados para Task = VP - Modelo con shift(1)
                            OLS Regression Results                            
Dep. Variable:             Confidence   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.263
Method:                 Least Squares   F-statistic:                     2590.
Date:                Fri, 19 Jul 2024   Prob (F-statistic):               0.00
Time:                        23:38:08   Log-Likelihood:                -17243.
No. Observations:                7239   AIC:                         3.449e+04
Df Residuals:                    7237   BIC:                         3.450e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

### The first models are better than the null model. I'm starting to trust in my results

## Now a mixed linear regression model with a random intercept per participant

In [19]:
from statsmodels.regression.mixed_linear_model import MixedLM

df_copy = df.copy()

lags = [1, 2, 3]
for lag in lags:
    df_copy[f'Confidence_lag_{lag}'] = df_copy.groupby('Subj_idx')['Confidence'].shift(lag)

results = {}

for task in df_copy['Task'].unique():
    task_df = df_copy[df_copy['Task'] == task].dropna(subset=[f'Confidence_lag_{lag}' for lag in lags])
    
    y = task_df['Confidence']
    
    X = task_df[[f'Confidence_lag_{lag}' for lag in lags]]
    X = sm.add_constant(X)
    
    model = MixedLM(y, X, groups=task_df['Subj_idx'])
    
    result = model.fit()
    
    results[task] = result.summary()

for task, result in results.items():
    print(f"Resultados para Task = {task} - Modelo mixto con lags 1, 2 y 3")
    print(result)
    print("\n")




Resultados para Task = VP - Modelo mixto con lags 1, 2 y 3
          Mixed Linear Model Regression Results
Model:             MixedLM Dependent Variable: Confidence 
No. Observations:  7084    Method:             REML       
No. Groups:        181     Scale:              5.4273     
Min. group size:   37      Log-Likelihood:     -16296.5107
Max. group size:   40      Converged:          Yes        
Mean group size:   39.1                                   
----------------------------------------------------------
                 Coef. Std.Err.   z    P>|z| [0.025 0.975]
----------------------------------------------------------
const            3.855    0.146 26.486 0.000  3.570  4.140
Confidence_lag_1 0.186    0.012 15.439 0.000  0.162  0.210
Confidence_lag_2 0.056    0.012  4.678 0.000  0.033  0.080
Confidence_lag_3 0.042    0.012  3.620 0.000  0.019  0.065
Group Var        1.911    0.104                           



Resultados para Task = EF - Modelo mixto con lags 1, 2 y 3
     