In [1]:
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
import re
from pingouin import cronbach_alpha, mediation_analysis
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.mediation import Mediation
from pyprocessmacro import Process

In [2]:
raw_df = pd.read_csv("../results/pilot5_survey1_normal.csv")

In [3]:
short_col_df = raw_df.rename(lambda s: s[0:3], axis='columns')
short_col_df

Unnamed: 0,Tri,Sce,001,002,003,004,005,006,007,008,...,029,030,031,032,033,034,035,036,037,038
0,,,00 Demographics,00 Demographics,00 Demographics,00 Demographics,00 Demographics,01 Distributive Price Fairness Perception,01 Distributive Price Fairness Perception,01 Distributive Price Fairness Perception,...,18 Perceived Information Sensitivity,18 Perceived Information Sensitivity,18 Perceived Information Sensitivity,18 Perceived Information Sensitivity,18 Perceived Information Sensitivity,19 Willingness to Reveal Information,19 Willingness to Reveal Information,19 Willingness to Reveal Information,19 Willingness to Reveal Information,19 Willingness to Reveal Information
1,0.0,01 Disadvantaged Customer,b,62,e,a,d,6,4,3,...,4,2,6,5,3,5,4,6,4,2
2,1.0,01 Same Price,b,62,a,e,d,4,6,5,...,4,3,6,4,3,5,4,5,6,4
3,2.0,01 Same Price,b,52,c,b,d,6,5,4,...,4,5,6,5,3,4,5,3,4,2
4,3.0,01 Disadvantaged Customer,b,32,d,a,e,5,4,3,...,6,4,6,5,3,6,5,4,5,3
5,4.0,01 Favored Customer,b,42,c,e,d),6,4,3,...,6,3,6,5,4,3,2,4,3,2
6,5.0,01 Same Price,a,42,f,a,d,4,6,5,...,7,4,6,5,3,2,3,4,5,2
7,6.0,01 Same Price,b,48,d,c,d,5,3,6,...,6,3,5,4,3,2,4,3,5,2
8,7.0,01 Same Price,a,43,e,f,d,5,4,6,...,3,2,4,6,5,3,2,4,6,3
9,8.0,01 Disadvantaged Customer,b,62,c,a,e,6,4,5,...,6,2,4,5,3,4,3,4,5,2


In [4]:
first_row = short_col_df.loc[[0]].rename(index={0: "groups"}).T

def column_group_index(group_name):
    return first_row.index[first_row["groups"] == group_name]

column_group_index("01 Distributive Price Fairness Perception")

Index(['006', '007', '008'], dtype='object')

In [5]:
# Remove first row
df = short_col_df.loc[1:len(short_col_df)].copy()

def to_number_if_number(s):
    if s.isnumeric():
        return int(s)
    return s

for col in ["{0:03}".format(i) for i in range(1, 29)]:
    df[col] = (df[col]
        .str.extract(r'^([a-zA-Z0-9]+).*', expand=False)
        .map(to_number_if_number)
    )

# Fix scenario error
df["Sce"] = df["Sce"].str.extract(r'^[0-9]*\s?(.*)$', expand=False)

df

Unnamed: 0,Tri,Sce,001,002,003,004,005,006,007,008,...,029,030,031,032,033,034,035,036,037,038
1,0.0,Disadvantaged Customer,b,62,e,a,d,6,4,3,...,4,2,6,5,3,5,4,6,4,2
2,1.0,Same Price,b,62,a,e,d,4,6,5,...,4,3,6,4,3,5,4,5,6,4
3,2.0,Same Price,b,52,c,b,d,6,5,4,...,4,5,6,5,3,4,5,3,4,2
4,3.0,Disadvantaged Customer,b,32,d,a,e,5,4,3,...,6,4,6,5,3,6,5,4,5,3
5,4.0,Favored Customer,b,42,c,e,d,6,4,3,...,6,3,6,5,4,3,2,4,3,2
6,5.0,Same Price,a,42,f,a,d,4,6,5,...,7,4,6,5,3,2,3,4,5,2
7,6.0,Same Price,b,48,d,c,d,5,3,6,...,6,3,5,4,3,2,4,3,5,2
8,7.0,Same Price,a,43,e,f,d,5,4,6,...,3,2,4,6,5,3,2,4,6,3
9,8.0,Disadvantaged Customer,b,62,c,a,e,6,4,5,...,6,2,4,5,3,4,3,4,5,2
10,9.0,Same Price,b,53,a,c,c,5,6,4,...,6,4,6,7,5,3,2,4,6,3


In [6]:
def mean(s):
    return sum(s)/len(s)

df["price_fairness"] = mean([df[i].astype('int') for i in column_group_index("01 Distributive Price Fairness Perception")])
df["purchase_intention"] = mean([df[i].astype('int') for i in column_group_index("03 Purchase Intention")])
df["future_search_intention"] = mean([df[i].astype('int') for i in column_group_index("04 Future Search Intention")])
df["product_involvement"] = mean([df[i].astype('int') for i in column_group_index("08 Product Involvement")])
df

Unnamed: 0,Tri,Sce,001,002,003,004,005,006,007,008,...,033,034,035,036,037,038,price_fairness,purchase_intention,future_search_intention,product_involvement
1,0.0,Disadvantaged Customer,b,62,e,a,d,6,4,3,...,3,5,4,6,4,2,4.333333,5.666667,6.0,5.571429
2,1.0,Same Price,b,62,a,e,d,4,6,5,...,3,5,4,5,6,4,5.0,6.666667,5.0,5.714286
3,2.0,Same Price,b,52,c,b,d,6,5,4,...,3,4,5,3,4,2,5.0,6.666667,5.666667,5.285714
4,3.0,Disadvantaged Customer,b,32,d,a,e,5,4,3,...,3,6,5,4,5,3,4.0,5.666667,5.666667,5.142857
5,4.0,Favored Customer,b,42,c,e,d,6,4,3,...,4,3,2,4,3,2,4.333333,5.666667,5.666667,5.285714
6,5.0,Same Price,a,42,f,a,d,4,6,5,...,3,2,3,4,5,2,5.0,6.333333,4.666667,5.714286
7,6.0,Same Price,b,48,d,c,d,5,3,6,...,3,2,4,3,5,2,4.666667,6.333333,5.0,5.285714
8,7.0,Same Price,a,43,e,f,d,5,4,6,...,5,3,2,4,6,3,5.0,6.666667,5.333333,5.428571
9,8.0,Disadvantaged Customer,b,62,c,a,e,6,4,5,...,3,4,3,4,5,2,5.0,6.333333,6.333333,5.571429
10,9.0,Same Price,b,53,a,c,c,5,6,4,...,5,3,2,4,6,3,5.0,6.666667,5.666667,5.428571


In [7]:
variablen = [
    "price_fairness",
    "purchase_intention",
    "future_search_intention",
    "product_involvement"
]


n = len(variablen)
# Correlation matrix
cor_matrix = pd.DataFrame(index=variablen, columns=variablen)

# Significance
p_values = pd.DataFrame(index=variablen, columns=variablen)

for i in range(n):
    for j in range(n):
        if i == j:
            cor_matrix.iloc[i, j] = "1.000"
        else:
            r, p = pearsonr(df[variablen[i]], df[variablen[j]])
            stars = "***" if p < 0.01 else "**" if p < 0.05 else "*" if p < 0.10 else ""
            cor_matrix.iloc[i, j] = f"{r:.3f}{stars}"

means = df[variablen].mean().round(2).tolist()
stds = df[variablen].std().round(2).tolist()

cor_matrix.loc["Mean"] = means
cor_matrix.loc["Standard Deviation"] = stds


In [8]:
cor_matrix

Unnamed: 0,price_fairness,purchase_intention,future_search_intention,product_involvement
price_fairness,1.0,0.419,-0.243,0.22
purchase_intention,0.419,1.000,-0.603**,0.438
future_search_intention,-0.243,-0.603**,1.000,-0.422
product_involvement,0.22,0.438,-0.422,1.0
Mean,4.78,6.04,5.53,5.44
Standard Deviation,0.35,0.62,0.56,0.34


In [9]:
# Cronbach's alpha

for cat in ["01 Distributive Price Fairness Perception",
            "03 Purchase Intention",
            "04 Future Search Intention",
            "08 Product Involvement"]:
    
    alpha,_ = cronbach_alpha(data=df[column_group_index(cat)])

    print(f"{cat}: {alpha}")

01 Distributive Price Fairness Perception: -2.8173913043478245
03 Purchase Intention: 0.6284916201117318
04 Future Search Intention: 0.277777777777778
08 Product Involvement: 0.1655405405405404


In [10]:
# Dummy variables

df["age"] = df["002"]
df["gender"] = df["001"]
# Male=0, Female=1
df["dummy_gender"] = (df['gender'] == 'b').astype(int)
df["income"] = df["005"]
# More or equal than 50k
df["dummy_income"] = (df['income'].isin(["d", "e", "f"])).astype(int)
df["frequency"] = df["023"]
# At least weekly
df["dummy_frequency"] = (df['frequency'] == "f").astype(int)

df["favored"] = (df['Sce'] == 'Favored Customer').astype(int)
df["disadvantaged"] = (df['Sce'] == 'Disadvantaged Customer').astype(int)
df

Unnamed: 0,Tri,Sce,001,002,003,004,005,006,007,008,...,product_involvement,age,gender,dummy_gender,income,dummy_income,frequency,dummy_frequency,favored,disadvantaged
1,0.0,Disadvantaged Customer,b,62,e,a,d,6,4,3,...,5.571429,62,b,1,d,1,f,1,0,1
2,1.0,Same Price,b,62,a,e,d,4,6,5,...,5.714286,62,b,1,d,1,f,1,0,0
3,2.0,Same Price,b,52,c,b,d,6,5,4,...,5.285714,52,b,1,d,1,c,0,0,0
4,3.0,Disadvantaged Customer,b,32,d,a,e,5,4,3,...,5.142857,32,b,1,e,1,f,1,0,1
5,4.0,Favored Customer,b,42,c,e,d,6,4,3,...,5.285714,42,b,1,d,1,f,1,1,0
6,5.0,Same Price,a,42,f,a,d,4,6,5,...,5.714286,42,a,0,d,1,f,1,0,0
7,6.0,Same Price,b,48,d,c,d,5,3,6,...,5.285714,48,b,1,d,1,f,1,0,0
8,7.0,Same Price,a,43,e,f,d,5,4,6,...,5.428571,43,a,0,d,1,f,1,0,0
9,8.0,Disadvantaged Customer,b,62,c,a,e,6,4,5,...,5.571429,62,b,1,e,1,f,1,0,1
10,9.0,Same Price,b,53,a,c,c,5,6,4,...,5.428571,53,b,1,c,0,f,1,0,0


In [11]:
# Model 1

model1_formula = 'price_fairness ~ favored + disadvantaged + age + dummy_gender + dummy_income + dummy_frequency'
model1 = smf.ols(model1_formula, data=df).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:         price_fairness   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.595
Method:                 Least Squares   F-statistic:                     4.427
Date:                Sat, 12 Apr 2025   Prob (F-statistic):             0.0286
Time:                        14:35:13   Log-Likelihood:                 6.0056
No. Observations:                  15   AIC:                             1.989
Df Residuals:                       8   BIC:                             6.945
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           4.7718      0.369     

  res = hypotest_fun_out(*samples, **kwds)


In [19]:
p = Process(data=df, model=4, mcx="Sce", y="purchase_intention", m=["price_fairness"], controls=["age", "dummy_gender", "dummy_income", "dummy_frequency"])
p.summary()

ValueError: The variables supplied do not match the definition of Model 4
            Expected variable(s) not supplied: x

In [None]:
# Model 2

model2_formula = 'purchase_intention ~ favored + disadvantaged + price_fairness + age + dummy_gender + dummy_income + dummy_frequency'
model2 = smf.ols(model2_formula, data=df).fit()
print(model2.summary())

In [25]:
# Model 4
model4 = smf.ols('future_search_intention ~ favored + disadvantaged + price_fairness + age + dummy_gender + dummy_income + dummy_frequency', data=df).fit()
print(model4.summary())

                               OLS Regression Results                              
Dep. Variable:     future_search_intention   R-squared:                       0.695
Model:                                 OLS   Adj. R-squared:                  0.389
Method:                      Least Squares   F-statistic:                     2.274
Date:                     Sat, 12 Apr 2025   Prob (F-statistic):              0.150
Time:                             13:34:22   Log-Likelihood:                -3.1907
No. Observations:                       15   AIC:                             22.38
Df Residuals:                            7   BIC:                             28.05
Df Model:                                7                                         
Covariance Type:                 nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------

  res = hypotest_fun_out(*samples, **kwds)


In [None]:
# Model 3

m1 = sm.OLS.from_formula(model1_formula, data=df)
m2 = sm.OLS.from_formula(model2_formula, data=df)

med = Mediation(m2, m1, exposure='favored', mediator='price_fairness')
med_result = med.fit(n_rep=500, method='bootstrap')
print(med_result.summary())

In [42]:
mediation_analysis(data=df, x='favored', m='price_fairness', y='purchase_intention', alpha=0.05)

IndexError: index 1 is out of bounds for axis 0 with size 1

In [34]:
def bootstrap_indirect_effect_two_groups(data, n_boot=5000, seed=42):
    np.random.seed(seed)
    groups = ['favored', 'disadvantaged']
    
    rows = []
    for group in groups:
        effects = []
        for _ in range(n_boot):
            sample = data.sample(n=len(data), replace=True)
            
            # 1. Effekt von Gruppe auf Mediator (a-Pfad)
            a = sample.loc[sample[group] == 1, 'price_fairness'].mean()
            
            # 2. Effekt von Mediator auf Outcome (b-Pfad, Gesamtmodell)
            X = sample['price_fairness']
            Y = sample['purchase_intention']
            b = np.cov(X, Y, ddof=1)[0, 1] / np.var(X, ddof=1)
            
            # Indirekter Effekt: a * b
            effects.append(a * b)
        
        effects = np.array(effects)
        rows.append({
            'Kundentyp': f'{group.capitalize()} customer',
            'Indirekter Effekt': np.mean(effects),
            'Bootstrapping SE': np.std(effects, ddof=1),
            '95% CI Untergrenze': np.percentile(effects, 2.5),
            '95% CI Obergrenze': np.percentile(effects, 97.5),
        })
    
    return pd.DataFrame(rows)

# Analyse ausführen
result_df = bootstrap_indirect_effect_two_groups(df)
print(result_df)

  b = np.cov(X, Y, ddof=1)[0, 1] / np.var(X, ddof=1)


                Kundentyp  Indirekter Effekt  Bootstrapping SE  \
0        Favored customer                NaN               NaN   
1  Disadvantaged customer                NaN               NaN   

   95% CI Untergrenze  95% CI Obergrenze  
0                 NaN                NaN  
1                 NaN                NaN  
