# Age-Related Differences in Delay Discounting: Income Matters

This notebook replicates the analyses from the publication:

> Wan, H., Myerson, J., Green, L., Strube, M. J., & Hale, S. (2024). Age-related differences in delay discounting: Income matters. *Psychology and Aging*. Advance online publication. https://doi.org/10.1037/pag0000818

The original analyses were conducted in R and have been translated into a Python workflow using `pandas`, `numpy`, `scipy`, `scikit-learn`, `statsmodels`, `pymc`, and `arviz`. The analyses test the "buffering hypothesis" by examining the interaction between age and income on delay discounting, using a series of focused Bayesian multilevel regressions.

The data from this study are available at <https://osf.io/um68t/>.

In [None]:
# Install necessary packages with the interpreter that backs the running
# kernel (sys.executable), so they land in the environment the notebook uses.
# The leading "!" is IPython shell-escape syntax: this cell only runs inside a
# notebook/IPython session, not as a plain Python script.
# NOTE(review): openpyxl is never imported directly; presumably it is the
# engine pandas needs for the .xlsx workbook read below — confirm.
import sys
!{sys.executable} -m pip install pandas numpy openpyxl scipy statsmodels pymc arviz scikit-learn

In [None]:
# --- 1. Setup: Load Packages, Functions, and Process Data ---

# --- Load Packages ---
import pandas as pd
import numpy as np
import warnings
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
import pymc as pm
import arviz as az

# Suppress only deprecation-style chatter for cleaner output. A blanket
# filterwarnings('ignore') would also hide RuntimeWarning / optimizer warnings
# raised while fitting (overflow, failed covariance estimates, ...), which
# signal real problems with the analysis rather than noise.
for _noisy_category in (DeprecationWarning, PendingDeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Show floats with three decimals in every printed DataFrame.
pd.options.display.float_format = '{:.3f}'.format

# --- Custom Functions ---
def binomial_pymc(formula, data, progressbar=False, random_seed=None):
    """Fit a single-predictor Bayesian binomial (logit-link) regression with PyMC.

    Parameters
    ----------
    formula : str
        ``"outcome ~ predictor"`` naming exactly one outcome column and one
        predictor column of ``data``. No richer formula syntax is parsed.
    data : pandas.DataFrame
        Must contain the outcome column (success counts), the predictor
        column, and a ``'trials'`` column holding the binomial ``n`` per row.
    progressbar : bool, default False
        Forwarded to ``pm.sample``.
    random_seed : int or None, default None
        Forwarded to ``pm.sample``; pass a value to make the draws
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The object returned by ``pm.sample``; the sampled variables are named
    ``"Intercept"`` and ``<predictor>``.

    Raises
    ------
    ValueError
        If ``formula`` is not of the form ``"outcome ~ predictor"``.
    KeyError
        If the outcome, predictor, or ``'trials'`` column is missing.

    NOTE(review): the model contains no group-level (e.g. per-participant)
    terms; confirm this matches the intended specification.
    """
    parts = [v.strip() for v in formula.split('~')]
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"formula must look like 'outcome ~ predictor', got {formula!r}"
        )
    outcome, predictor = parts

    missing = [col for col in (outcome, predictor, 'trials') if col not in data]
    if missing:
        raise KeyError(f"columns missing from data: {missing}")

    # Hand PyMC plain arrays for every input so no pandas index is carried
    # into the model graph (the predictor used to be passed as a Series).
    x = np.asarray(data[predictor], dtype=float)
    successes = np.asarray(data[outcome])
    trials = np.asarray(data['trials'])

    with pm.Model():
        # Priors (described in the original notebook as following the brms setup)
        intercept = pm.Normal("Intercept", mu=0, sigma=10)
        slope = pm.Normal(predictor, mu=0, sigma=2.5)

        # Linear predictor on the logit scale, mapped back to a probability
        eta = intercept + slope * x
        p = pm.math.invlogit(eta)

        # Likelihood: successes out of `trials` attempts per row
        pm.Binomial(outcome, n=trials, p=p, observed=successes)

        idata = pm.sample(draws=2000, tune=2000, chains=4, cores=4,
                          target_accept=0.95, progressbar=progressbar,
                          random_seed=random_seed)
    return idata

def beta_pymc(formula, data, progressbar=False, random_seed=None):
    """Fit a single-predictor Bayesian beta (logit-link) regression with PyMC.

    Parameters
    ----------
    formula : str
        ``"outcome ~ predictor"`` naming exactly one outcome column and one
        predictor column of ``data``. No richer formula syntax is parsed.
    data : pandas.DataFrame
        Must contain the outcome column (values strictly between 0 and 1)
        and the predictor column.
    progressbar : bool, default False
        Forwarded to ``pm.sample``.
    random_seed : int or None, default None
        Forwarded to ``pm.sample``; pass a value to make the draws
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The object returned by ``pm.sample``; the sampled variables are named
    ``"Intercept"``, ``<predictor>`` and ``"nu"``.

    Raises
    ------
    ValueError
        If ``formula`` is malformed, or if any non-missing outcome value
        lies outside the open interval (0, 1).
    KeyError
        If the outcome or predictor column is missing.

    NOTE(review): the model contains no group-level (e.g. per-participant)
    terms; confirm this matches the intended specification.
    """
    parts = [v.strip() for v in formula.split('~')]
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"formula must look like 'outcome ~ predictor', got {formula!r}"
        )
    outcome, predictor = parts

    missing = [col for col in (outcome, predictor) if col not in data]
    if missing:
        raise KeyError(f"columns missing from data: {missing}")

    # Hand PyMC plain arrays so no pandas index is carried into the model
    # graph (the predictor used to be passed as a Series).
    x = np.asarray(data[predictor], dtype=float)
    y = np.asarray(data[outcome], dtype=float)

    # A beta likelihood has no mass at exactly 0 or 1; fail early with a clear
    # message instead of an opaque sampler initialisation error. Missing
    # values are left for PyMC to deal with, as before.
    observed_y = y[~np.isnan(y)]
    if np.any((observed_y <= 0) | (observed_y >= 1)):
        raise ValueError(
            f"beta regression needs {outcome!r} strictly inside (0, 1)"
        )

    with pm.Model():
        # Priors
        intercept = pm.Normal("Intercept", mu=0, sigma=10)
        slope = pm.Normal(predictor, mu=0, sigma=2.5)
        nu = pm.HalfCauchy("nu", beta=10)

        # Linear predictor for the mean (mu), using a logit link
        eta = intercept + slope * x
        mu = pm.math.invlogit(eta)

        # Likelihood, parameterised through mu and nu rather than alpha/beta
        pm.Beta(outcome, mu=mu, nu=nu, observed=y)

        idata = pm.sample(draws=2000, tune=2000, chains=4, cores=4,
                          target_accept=0.95, progressbar=progressbar,
                          random_seed=random_seed)
    return idata

def pymc_summary(idata, var_names=None):
    """Summarise posterior draws as a small, string-formatted table.

    Parameters
    ----------
    idata
        Sampling result accepted by ``az.summary`` / ``az.extract``.
    var_names : str, list of str, or None
        Variables to report; forwarded unchanged to both ArviZ calls.

    Returns
    -------
    pandas.DataFrame
        One row per summarised variable with columns ``b`` (posterior mean),
        ``95% CI_l`` / ``95% CI_u`` (95% HDI bounds), ``pd`` (probability of
        direction) and ``signif`` (``'*'`` when ``pd >= 0.975``). The four
        numeric columns are returned as strings with three decimals.

    NOTE(review): ``pd`` is looked up by variable name, so rows whose index
    label is not a plain variable name (presumably non-scalar parameters)
    would get no ``pd`` value — confirm only scalar parameters are passed.
    """
    summary_df = az.summary(idata, hdi_prob=0.95, kind='stats', var_names=var_names)

    # keep_dataset=True: when a single variable is requested az.extract would
    # otherwise hand back a bare DataArray, which has no `.data_vars`.
    posterior = az.extract(idata, var_names=var_names, keep_dataset=True)

    # Probability of direction: share of draws on the dominant side of zero.
    # (The local is deliberately not called `pd`, to avoid shadowing pandas.)
    pd_values = {}
    for var in posterior.data_vars:
        samples = posterior[var].values.ravel()
        share_positive = np.mean(samples > 0)
        pd_values[var] = max(share_positive, 1 - share_positive)

    summary_df['pd'] = summary_df.index.map(pd_values)
    summary_df['signif'] = np.where(summary_df['pd'] >= 0.975, '*', ' ')
    summary_df = summary_df[['mean', 'hdi_2.5%', 'hdi_97.5%', 'pd', 'signif']]
    summary_df.columns = ['b', '95% CI_l', '95% CI_u', 'pd', 'signif']

    numeric_cols = ['b', '95% CI_l', '95% CI_u', 'pd']
    return summary_df.apply(
        lambda col: col.map('{:.3f}'.format) if col.name in numeric_cols else col
    )
    
# --- Data Processing ---
# Load raw data
# Both sheets live in the same workbook, so open and parse it once: passing a
# list of sheet names makes read_excel return a {sheet_name: DataFrame} dict.
_raw_sheets = pd.read_excel(
    "Manuscript/Resubmission2/Analysis/Data_AgeIncome.xlsx",
    sheet_name=["Adj-Amt", "MCQ"],
)
dat_adj_amt_raw = _raw_sheets["Adj-Amt"]
dat_mcq_raw = _raw_sheets["MCQ"]

# Attach to every MCQ row the discount rate (k) and the amount range of its
# item, looked up by Q_ID (1..27). Position i in the two lists below belongs
# to Q_ID == i + 1; rows with any other Q_ID get NaN / a missing category.
k_values = [
    0.00016, 0.006, 0.006, 0.25, 0.041, 0.0004, 0.1, 0.1, 0.00016,
    0.006, 0.25, 0.001, 0.00016, 0.041, 0.0025, 0.0025, 0.0004, 0.016,
    0.1, 0.0004, 0.016, 0.0025, 0.041, 0.001, 0.016, 0.001, 0.25,
]
amt_values = [
    "$50-$60", "$75-$85", "$25-$35", "$75-$85", "$25-$35", "$50-$60", "$25-$35", "$50-$60", "$75-$85",
    "$50-$60", "$25-$35", "$75-$85", "$25-$35", "$50-$60", "$75-$85", "$50-$60", "$75-$85", "$25-$35",
    "$75-$85", "$25-$35", "$50-$60", "$25-$35", "$75-$85", "$50-$60", "$75-$85", "$25-$35", "$50-$60",
]

# One boolean mask per questionnaire item, shared by both lookups
k_conditions = [dat_mcq_raw['Q_ID'].eq(q_id) for q_id in range(1, 28)]

dat_mcq_raw['k'] = np.select(k_conditions, k_values, default=np.nan)

mcq_amount_labels = np.select(k_conditions, amt_values, default=None)
dat_mcq_raw['Amount'] = pd.Categorical(
    mcq_amount_labels,
    categories=["$75-$85", "$50-$60", "$25-$35"],
    ordered=True,
)

# Process individual-level data
# np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid; use whichever
# this NumPy build provides so the cell keeps working across versions.
_trapezoid = getattr(np, "trapezoid", None) or np.trapz


def _discounting_auc(group):
    """Return the area under RSV against max-normalised Delay for one group."""
    # Trapezoidal integration is only meaningful along increasing x, so do not
    # rely on the row order of the spreadsheet.
    ordered = group.sort_values('Delay')
    scaled_delay = ordered['Delay'] / ordered['Delay'].max()
    return pd.Series({'AuC': _trapezoid(ordered['RSV'], scaled_delay)})


# Rows with Delay == 730 are left out before integrating (the reason is not
# stated in this notebook). Only the two columns the integral needs are passed
# to apply, so it no longer operates on the grouping columns.
adj_amt_processed = (
    dat_adj_amt_raw[dat_adj_amt_raw['Delay'] != 730]
    .groupby(['ID', 'Amount'])[['Delay', 'RSV']]
    .apply(_discounting_auc)
    .reset_index()
)

# Sum of 'Choice' per participant and amount range
mcq_processed = (
    dat_mcq_raw
    .groupby(['ID', 'Amount'], observed=True)
    .agg(Num_Choice=('Choice', 'sum'))
    .reset_index()
)

# Combine and create final analysis dataframes
# Demographics are taken from the Adj-Amt sheet, where they repeat on every
# row of a participant; drop_duplicates collapses the repeats.
demographics = dat_adj_amt_raw[['ID', 'Group', 'Age', 'Gender', 'Education', 'HADS']].drop_duplicates()
demographics['Age_grp'] = np.where(demographics['Group'].isin([1, 2]), 0, 1)    # 0=Younger
demographics['Income_grp'] = np.where(demographics['Group'].isin([1, 3]), 0, 1) # 0=Lower
demographics['Education_grp'] = np.where(demographics['Education'] <= 3, 0, 1)
demographics['HADS'] = pd.to_numeric(demographics['HADS'], errors='coerce')

# validate='many_to_one' makes the merge fail loudly if demographics ever holds
# more than one row per ID (e.g. inconsistent entries across a participant's
# rows), instead of silently duplicating that participant's observations.
MCQ_ID_df = pd.merge(mcq_processed, demographics, on='ID', validate='many_to_one')
AdjAmt_ID_df = pd.merge(adj_amt_processed, demographics, on='ID', validate='many_to_one')

# Create Model-Specific Dataframes with Centered Variables
def center_vars(df, cols):
    """Add a mean-centred copy of each listed column, named ``<col>_c``.

    The new columns are written into ``df`` itself, and that same object is
    returned so the call can be used inline.
    """
    for name in cols:
        column = df[name]
        df[name + '_c'] = column - column.mean()
    return df

# Model frames leave out the "$50-$60" MCQ rows and the 500 Adj-Amt rows, then
# gain centred copies of the two group codes (centred over the kept rows).
mcq_keep = MCQ_ID_df['Amount'] != "$50-$60"
adj_keep = AdjAmt_ID_df['Amount'] != 500

MCQ_mod1_df = center_vars(MCQ_ID_df.loc[mcq_keep].copy(), ['Age_grp', 'Income_grp'])
MCQ_mod1_df['trials'] = 9  # binomial n used by binomial_pymc for every row

AdjAmt_mod1_df = center_vars(AdjAmt_ID_df.loc[adj_keep].copy(), ['Age_grp', 'Income_grp'])

# Create the Composite z-Score DataFrame
# MCQ rows: by participant, amount descending. 'Amount' is an ordered
# categorical whose highest category is "$25-$35", so within each ID the
# "$25-$35" row comes first and the "$75-$85" row second.
mcq_z = MCQ_ID_df[MCQ_ID_df['Amount'] != "$50-$60"].sort_values(['ID', 'Amount'], ascending=[True, False])
# Adj-Amt rows: by participant, amount ascending, sorted explicitly so the
# row-by-row pairing below does not depend on upstream row order.
adj_z = AdjAmt_ID_df[AdjAmt_ID_df['Amount'] != 500].sort_values(['ID', 'Amount'])

# AuC is attached positionally, so refuse to continue if the two frames do not
# describe the same participants row for row (otherwise scores from different
# people would be averaged together without any error).
if len(mcq_z) != len(adj_z) or not np.array_equal(mcq_z['ID'].to_numpy(), adj_z['ID'].to_numpy()):
    raise ValueError(
        "MCQ and Adj-Amt rows do not line up by participant ID; "
        "cannot pair Num_Choice with AuC."
    )

z_df = mcq_z[['ID', 'Age_grp', 'Income_grp', 'Age', 'Num_Choice']].copy()
z_df['AuC'] = adj_z['AuC'].to_numpy()

# Standardise each measure over all retained rows, then average the two
z_df['MCQ_z'] = (z_df['Num_Choice'] - z_df['Num_Choice'].mean()) / z_df['Num_Choice'].std()
z_df['AdjAmt_z'] = (z_df['AuC'] - z_df['AuC'].mean()) / z_df['AuC'].std()
z_df['z_score'] = z_df[['MCQ_z', 'AdjAmt_z']].mean(axis=1)
z_df = z_df.groupby('ID').mean().reset_index() # Get mean z-score per participant
z_df = center_vars(z_df, ['Age_grp', 'Income_grp'])
# Note: despite the "_c" suffix, Age_c is standardised (centred and divided by the SD)
z_df['Age_c'] = (z_df['Age'] - z_df['Age'].mean()) / z_df['Age'].std()

## 2. Data Quality & Reliability Checks

This section replicates the initial analyses from the paper that establish the quality of the discounting data, including group-level model fits and within-procedure correlations.

In [7]:
# --- Group-Level Data Frames ---
group_labels = {1: "Younger, Lower-Income", 2: "Younger, Higher-Income", 3: "Older, Lower-Income", 4: "Older, Higher-Income"}
dat_mcq_raw['Group_Label'] = dat_mcq_raw['Group'].map(group_labels)
dat_adj_amt_raw['Group_Label'] = dat_adj_amt_raw['Group'].map(group_labels)

# Give the amount labels an explicit order. An unordered categorical sorts its
# labels as text, which places "$500" between "$30" and "$80" in every grouped
# or unstacked table built from this column.
adj_amount_labels = {30: "$30", 80: "$80", 500: "$500"}
dat_adj_amt_raw['Amount_Label'] = pd.Categorical(
    dat_adj_amt_raw['Amount'].map(adj_amount_labels),
    categories=list(adj_amount_labels.values()),
    ordered=True,
)

# Proportion of 'Choice' per group, amount range and k, plus log(k) for fitting
MCQ_grp_df = dat_mcq_raw.groupby(['Group_Label', 'Amount', 'k'], observed=True).agg(Prop=('Choice', 'mean')).reset_index()
MCQ_grp_df['log_k'] = np.log(MCQ_grp_df['k'])
# Mean RSV per group, amount and delay
AdjAmt_grp_df = dat_adj_amt_raw.groupby(['Group_Label', 'Amount_Label', 'Delay'], observed=True).agg(Mean_RSV=('RSV', 'mean')).reset_index()

# --- Model Definitions ---
def logistic_growth(log_k, a, r):
    """Logistic curve in ``log_k``: equals 0.5 at ``log_k == a``, with rate ``r``."""
    exponent = (log_k - a) * r
    return 1 / (1 + np.exp(-exponent))

def hyperboloid(delay, k, s):
    """Hyperboloid discounting function: ``1 / (1 + k * delay) ** s``."""
    denominator = (1 + k * delay) ** s
    return 1 / denominator

def _group_fit_r2(model, frame, x_col, y_col, start):
    """Fit ``model`` to one group's (x, y) columns and return the fit's R-squared."""
    params, _ = curve_fit(model, frame[x_col], frame[y_col], p0=start)
    return r2_score(frame[y_col], model(frame[x_col], *params))


# --- MCQ: Group-Level Logistic Growth Model Fit ---
print("--- MCQ: R-squared for Group-Level Logistic Growth Fits ---")
r2_mcq = (
    MCQ_grp_df
    .groupby(['Group_Label', 'Amount'], observed=True)
    .apply(lambda grp: _group_fit_r2(logistic_growth, grp, 'log_k', 'Prop', [-4, 1]))
    .unstack()
)
print(r2_mcq)

# --- Adj-Amt: Group-Level Hyperboloid Model Fit ---
print("\n--- Adj-Amt: R-squared for Group-Level Hyperboloid Fits ---")
r2_adj = (
    AdjAmt_grp_df
    .groupby(['Group_Label', 'Amount_Label'], observed=True)
    .apply(lambda grp: _group_fit_r2(hyperboloid, grp, 'Delay', 'Mean_RSV', [0.1, 1]))
    .unstack()
)
print(r2_adj)

# --- Within-Procedure Reliability (Correlations) ---
# One row per participant, one column per amount; .corr() then gives the
# pairwise correlations between amounts within each procedure.
print("\n--- MCQ: Within-Procedure Correlations (Number of Delayed Choices) ---")
mcq_pivot = MCQ_ID_df.pivot(index='ID', columns='Amount', values='Num_Choice')
mcq_corr = mcq_pivot.corr()
print(mcq_corr)

print("\n--- Adj-Amt: Within-Procedure Correlations (AuC) ---")
adj_pivot = AdjAmt_ID_df.pivot(index='ID', columns='Amount', values='AuC')
# Relabel the amount columns with a leading dollar sign for display
adj_pivot.columns = ["$" + str(amount) for amount in adj_pivot.columns]
adj_corr = adj_pivot.corr()
print(adj_corr)

--- MCQ: R-squared for Group-Level Logistic Growth Fits ---
Amount                  $75-$85  $50-$60  $25-$35
Group_Label                                      
Older, Higher-Income      0.983    0.991    0.996
Older, Lower-Income       0.994    0.996    0.996
Younger, Higher-Income    0.993    0.987    0.994
Younger, Lower-Income     0.995    0.991    0.992

--- Adj-Amt: R-squared for Group-Level Hyperboloid Fits ---
Amount_Label             $30  $500   $80
Group_Label                             
Older, Higher-Income   0.967 0.943 0.951
Older, Lower-Income    0.944 0.978 0.985
Younger, Higher-Income 0.997 0.959 0.988
Younger, Lower-Income  0.997 0.974 0.990

--- MCQ: Within-Procedure Correlations (Number of Delayed Choices) ---
Amount   $75-$85  $50-$60  $25-$35
Amount                            
$75-$85    1.000    0.940    0.903
$50-$60    0.940    1.000    0.918
$25-$35    0.903    0.918    1.000

--- Adj-Amt: Within-Procedure Correlations (AuC) ---
       $30   $80  $500
$30  1.00

## 3. Hypothesis Testing: Effects of Age on Discounting

This section replicates the focused contrasts that test the buffering hypothesis by examining the effect of age at each level of income. The results correspond to **Table 2** in the publication.

In [14]:
# --- Model 1: Discounting = fn(Age) ---
print("--- Model 1: Age Effects ---")

# Split each model frame by income group (0 = lower, 1 = higher)
mcq_by_income = {level: MCQ_mod1_df[MCQ_mod1_df['Income_grp'] == level] for level in (0, 1)}
adj_by_income = {level: AdjAmt_mod1_df[AdjAmt_mod1_df['Income_grp'] == level] for level in (0, 1)}

# Lower Income
mcq_low_inc_mod1_idata = binomial_pymc("Num_Choice ~ Age_grp_c", mcq_by_income[0])
adj_low_inc_mod1_idata = beta_pymc("AuC ~ Age_grp_c", adj_by_income[0])
# Higher Income
mcq_high_inc_mod1_idata = binomial_pymc("Num_Choice ~ Age_grp_c", mcq_by_income[1])
adj_high_inc_mod1_idata = beta_pymc("AuC ~ Age_grp_c", adj_by_income[1])

# Report the four fits in a fixed order, each under its own heading
age_effect_reports = [
    ("\nMCQ Lower Income:", mcq_low_inc_mod1_idata),
    ("\nAdj-Amt Lower Income:", adj_low_inc_mod1_idata),
    ("\nMCQ Higher Income:", mcq_high_inc_mod1_idata),
    ("\nAdj-Amt Higher Income:", adj_high_inc_mod1_idata),
]
for heading, fit in age_effect_reports:
    print(heading)
    print(pymc_summary(fit, ['Intercept', 'Age_grp_c']))

Initializing NUTS using jitter+adapt_diag...


--- Model 1: Age Effects ---


Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, Age_grp_c]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 351 seconds.
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, Age_grp_c, nu]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 1637 seconds.
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, Age_grp_c]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 672 seconds.
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, Age_grp_c, nu]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 970 seconds.



MCQ Lower Income:
                b 95% CI_l 95% CI_u     pd signif
Intercept  -0.091   -0.161   -0.022  0.995      *
Age_grp_c   0.344    0.216    0.498  1.000      *

Adj-Amt Lower Income:
               b 95% CI_l 95% CI_u     pd signif
Intercept  0.393    0.291    0.493  1.000      *
Age_grp_c  0.367    0.164    0.559  1.000      *

MCQ Higher Income:
               b 95% CI_l 95% CI_u     pd signif
Intercept  0.034   -0.040    0.098  0.835       
Age_grp_c  0.050   -0.086    0.181  0.766       

Adj-Amt Higher Income:
               b 95% CI_l 95% CI_u     pd signif
Intercept  0.589    0.486    0.689  1.000      *
Age_grp_c  0.047   -0.137    0.243  0.683       


## 4. Hypothesis Testing: Effects of Income on Discounting

This section replicates the complementary focused contrasts that test the effect of income at each level of age. The results correspond to **Table 3** in the publication.

In [None]:
# --- Model 1: Discounting = fn(Income) ---
print("--- Model 1: Income Effects ---")

# Split each model frame by age group (0 = younger, 1 = older)
mcq_by_age = {level: MCQ_mod1_df[MCQ_mod1_df['Age_grp'] == level] for level in (0, 1)}
adj_by_age = {level: AdjAmt_mod1_df[AdjAmt_mod1_df['Age_grp'] == level] for level in (0, 1)}

# Younger
mcq_young_mod1_idata = binomial_pymc("Num_Choice ~ Income_grp_c", mcq_by_age[0])
adj_young_mod1_idata = beta_pymc("AuC ~ Income_grp_c", adj_by_age[0])
# Older
mcq_old_mod1_idata = binomial_pymc("Num_Choice ~ Income_grp_c", mcq_by_age[1])
adj_old_mod1_idata = beta_pymc("AuC ~ Income_grp_c", adj_by_age[1])

# Report the four fits in a fixed order, each under its own heading
income_effect_reports = [
    ("\nMCQ Younger:", mcq_young_mod1_idata),
    ("\nAdj-Amt Younger:", adj_young_mod1_idata),
    ("\nMCQ Older:", mcq_old_mod1_idata),
    ("\nAdj-Amt Older:", adj_old_mod1_idata),
]
for heading, fit in income_effect_reports:
    print(heading)
    print(pymc_summary(fit, ['Intercept', 'Income_grp_c']))

## 5. Association with Composite Discounting Measure

The following analyses evaluate the correlations between Age Group and the composite z-score for each Income Group, and between Income Group and the z-score for each Age Group.

In [16]:
def print_corr(df, x_col, y_col):
    """Print the Pearson r and p-value between two columns of ``df``."""
    result = pearsonr(df[x_col], df[y_col])
    r_value, p_value = result.statistic, result.pvalue
    print(f"Correlation between {x_col} and {y_col}: r = {r_value:.3f}, p = {p_value:.3f}")

# Correlation between Age Group and z-score for each Income Group
print("--- Age Effect on z-score within each Income Group ---")
for income_level in (0, 1):  # 0 = Lower Income, then 1 = Higher Income
    print_corr(z_df[z_df['Income_grp'] == income_level], 'z_score', 'Age_grp')

# Correlation between Income Group and z-score for each Age Group
print("\n--- Income Effect on z-score within each Age Group ---")
for age_level in (0, 1):  # 0 = Younger, then 1 = Older
    print_corr(z_df[z_df['Age_grp'] == age_level], 'z_score', 'Income_grp')

--- Age Effect on z-score within each Income Group ---
Correlation between z_score and Age_grp: r = 0.202, p = 0.007
Correlation between z_score and Age_grp: r = 0.041, p = 0.581

--- Income Effect on z-score within each Age Group ---
Correlation between z_score and Income_grp: r = 0.163, p = 0.026
Correlation between z_score and Income_grp: r = 0.003, p = 0.964


## 6. The Magnitude of the Age Difference

These final analyses use the composite discounting measure to examine the magnitude of the effects of Age and Income on discounting via linear regression.

In [19]:
# --- Mean z-score for each group ---
print("--- Mean Composite z-Score by Group ---")
mean_z_scores = z_df.groupby(['Age_grp', 'Income_grp'])['z_score'].mean().reset_index()
# Swap the 0/1 group codes for readable labels before printing
age_names = np.where(mean_z_scores['Age_grp'] == 0, "Younger", "Older")
income_names = np.where(mean_z_scores['Income_grp'] == 0, "Lower-Income", "Higher-Income")
mean_z_scores['Age_grp'] = age_names
mean_z_scores['Income_grp'] = income_names
print(mean_z_scores)

# --- z-score = fn[Age, Income, Age x Income] ---
print("\n--- Full Model: z-score ~ Age * Income ---")
full_model = smf.ols('z_score ~ Age_grp_c * Income_grp_c', data=z_df).fit()
print(full_model.summary())

# --- Age effect in each income group using continuous age ---
print("\n--- Age Effect on z-score within each Income Group (Continuous Age) ---")
lower_income_rows = z_df[z_df['Income_grp'] == 0]
higher_income_rows = z_df[z_df['Income_grp'] == 1]

print("\nLower Income Group:")
low_inc_model = smf.ols('z_score ~ Age', data=lower_income_rows).fit()
print(low_inc_model.summary())

print("\nHigher Income Group:")
high_inc_model = smf.ols('z_score ~ Age', data=higher_income_rows).fit()
print(high_inc_model.summary())

--- Mean Composite z-Score by Group ---
   Age_grp     Income_grp  z_score
0  Younger   Lower-Income   -0.266
1  Younger  Higher-Income    0.043
2    Older   Lower-Income    0.112
3    Older  Higher-Income    0.118

--- Full Model: z-score ~ Age * Income ---
                            OLS Regression Results                            
Dep. Variable:                z_score   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     3.461
Date:                Thu, 25 Sep 2025   Prob (F-statistic):             0.0166
Time:                        13:12:09   Log-Likelihood:                -477.72
No. Observations:                 359   AIC:                             963.4
Df Residuals:                     355   BIC:                             979.0
Df Model:                           3                                         
Covariance Type:            no