# 1. Generate the data

This Python code generates synthetic data for a Structural Equation Model (SEM) that simulates relationships between demographic variables and three latent constructs: Perceived Ease of Use (PEoU), Perceived Usefulness (PU), and Behavioral Intention (BI). The code creates a dataset with demographic variables and observable indicators for each latent construct, standardizing the variables and incorporating both linear and non-linear relationships with controlled random noise.

In [104]:
import numpy as np
import pandas as pd
from semopy import Model, calc_stats

def generate_sem_data(n_samples=1000, seed=42):
   np.random.seed(seed)

   # Demographics
   age = np.random.randint(20, 70, n_samples)
   gender = np.random.binomial(1, 0.5, n_samples)
   income = np.random.normal(100000, 30000, n_samples)
   education = np.random.randint(1, 9, n_samples)  # 1 to 8 inclusive
   credit_score = np.random.randint(300, 850, n_samples)
   political_ideology = np.random.randint(1, 8, n_samples)
   political_party = np.random.choice([0, 1, 2], n_samples)  # 0=D, 1=R, 2=I

   # Standardizing variables
   age_std = (age - np.mean(age)) / np.std(age)
   income_std = (income - np.mean(income)) / np.std(income)
   education_std = (education - np.mean(education)) / np.std(education)
   credit_score_std = (credit_score - np.mean(credit_score)) / np.std(credit_score)
   political_ideology_std = (political_ideology - np.mean(political_ideology)) / np.std(political_ideology)

   # Latent variables
   PEoU_latent = (0.35 * age_std +
                  0.25 * gender +
                  0.25 * income_std +
                  0.25 * education_std +
                  0.10 * credit_score_std +
                  0.05 * political_ideology_std +
                  0.05 * np.cos(np.pi * political_party) +
                  np.random.normal(0, 0.3, n_samples))

   PU_latent = (0.75 * PEoU_latent +
                0.35 * age_std +
                0.25 * gender +
                0.25 * income_std +
                0.35 * education_std +
                0.15 * credit_score_std +
                0.10 * political_ideology_std +
                0.10 * np.sin(np.pi * political_party) +
                np.random.normal(0, 0.2, n_samples))

   BI_latent = (0.6 * PU_latent +
                0.4 * PEoU_latent +
                0.15 * age_std +
                0.10 * gender +
                0.15 * income_std +
                0.14 * education_std +
                0.10 * credit_score_std +
                0.05 * political_ideology_std +
                0.05 * political_party +
                np.random.normal(0, 0.2, n_samples))

   # Generate indicators as integers 1-7
   PEoU_indicators = np.column_stack([
       np.clip(np.round(0.95 * PEoU_latent + 4), 1, 7),
       np.clip(np.round(0.91 * PEoU_latent + 4), 1, 7),
       np.clip(np.round(0.89 * PEoU_latent + 4), 1, 7)
   ])

   PU_indicators = np.column_stack([
       np.clip(np.round(0.94 * PU_latent + 4), 1, 7),
       np.clip(np.round(0.97 * PU_latent + 4), 1, 7),
       np.clip(np.round(0.91 * PU_latent + 4), 1, 7)
   ])

   BI_indicators = np.column_stack([
       np.clip(np.round(0.96 * BI_latent + 4), 1, 7),
       np.clip(np.round(0.94 * BI_latent + 4), 1, 7),
       np.clip(np.round(0.88 * BI_latent + 4), 1, 7)
   ])

   # Create DataFrame
   df = pd.DataFrame({
       'Age': age,
       'Gender': gender,
       'Income': income,
       'Education': education,
       'Credit_Score': credit_score,
       'Political_Ideology': political_ideology,
       'Political_Party': political_party,
       'PEoU1': PEoU_indicators[:, 0].astype(int),
       'PEoU2': PEoU_indicators[:, 1].astype(int),
       'PEoU3': PEoU_indicators[:, 2].astype(int),
       'PU1': PU_indicators[:, 0].astype(int),
       'PU2': PU_indicators[:, 1].astype(int),
       'PU3': PU_indicators[:, 2].astype(int),
       'BI1': BI_indicators[:, 0].astype(int),
       'BI2': BI_indicators[:, 1].astype(int),
       'BI3': BI_indicators[:, 2].astype(int)
   })

   return df

# Generate the data
df = generate_sem_data()
print(df.head(20))

    Age  Gender         Income  Education  Credit_Score  Political_Ideology  \
0    58       1  110959.029791          8           575                   5   
1    48       0  142062.164417          3           347                   3   
2    34       0   40182.074912          2           510                   1   
3    62       0  136550.119468          7           839                   4   
4    27       0   24548.846424          6           490                   6   
5    40       1  100846.081346          1           614                   5   
6    58       0  165757.220642          1           412                   5   
7    38       1  120419.738186          3           469                   1   
8    42       0  111191.768060          8           767                   7   
9    30       1  160058.379004          2           313                   7   
10   30       0   78908.113060          6           587                   7   
11   43       0  113019.022674          1           

# 2. Specitify the SEM model

This code defines a Structural Equation Model (SEM) specification using semopy's syntax, establishing measurement relationships between latent variables (PU, PEoU, BI) and their indicators, plus structural paths showing how demographic variables influence these constructs.

In [105]:
model_desc = """
# Measurement model
PU =~ PU1 + PU2 + PU3
PEoU =~ PEoU1 + PEoU2 + PEoU3
BI =~ BI1 + BI2 + BI3

# Structural paths
PU ~ PEoU + Age + Gender + Income + Education + Credit_Score + Political_Ideology + Political_Party
BI ~ PU + PEoU + Age + Gender + Income + Education + Credit_Score + Political_Ideology + Political_Party
"""

We will fits the previously defined SEM model to the data using semopy, calculating model fit statistics and parameter estimates, with the results showing how well the hypothesized relationships match the observed data patterns. The output includes overall model fit metrics and detailed parameter estimates with their statistical significance (p-values).

In [106]:
from semopy import Model, calc_stats

# Fit the model on original data
model = Model(model_desc)
model.fit(df)

orig_stats = calc_stats(model)
print("=== Original Model Fit Summary ===")
print(orig_stats)

print("\n=== Original Parameter Estimates ===")
orig_params = model.inspect()
print(orig_params[['lval','op','rval','Estimate','p-value']])

=== Original Model Fit Summary ===
       DoF  DoF Baseline         chi2  chi2 p-value  chi2 Baseline       CFI  \
Value  101           127  1265.002409           0.0   26822.465796  0.956397   

            GFI      AGFI       NFI       TLI     RMSEA        AIC        BIC  \
Value  0.952838  0.940697  0.952838  0.945172  0.107407  67.469995  239.24143   

         LogLik  
Value  1.265002  

=== Original Parameter Estimates ===
     lval  op                rval  Estimate   p-value
0      PU   ~                PEoU  0.343504       0.0
1      PU   ~                 Age  0.032672       0.0
2      PU   ~              Gender  0.317637       0.0
3      PU   ~              Income  0.000011       0.0
4      PU   ~           Education  0.183835       0.0
5      PU   ~        Credit_Score  0.001181       0.0
6      PU   ~  Political_Ideology  0.056284       0.0
7      PU   ~     Political_Party  0.033760  0.019815
8      BI   ~                  PU  0.479664       0.0
9      BI   ~              

In [107]:
#pip install diffprivlib



# 3. Apply the Laplace noise to the sensitive columns.

This code implements differential privacy by adding calibrated Laplace noise to specified columns in the dataset, using a privacy parameter epsilon and sensitivity value to control the level of noise injection for privacy protection.

In [108]:
from diffprivlib.mechanisms import Laplace
import pandas as pd
# Function to apply differential privacy
def add_dp_noise(df, epsilon, sensitivity, target_cols):
    noisy_df = df.copy()
    for col in target_cols:
        laplace_mech = Laplace(epsilon=epsilon, sensitivity=sensitivity)
        noisy_df[col] = df[col].apply(lambda x: x + laplace_mech.randomise(0))
    return noisy_df

Tests how different privacy levels (controlled by epsilon values) affect the model fit statistics, running the SEM analysis multiple times with varying amounts of Laplace noise and tracking key fit indices (CFI, TLI, RMSEA) to assess the privacy-utility tradeoff.

In [109]:
epsilons = [0.1, 0.5, 1,3, 5, 8, 10]
dp_results = []

for eps in epsilons:
    dp_df = add_dp_noise(df, epsilon=0.1, sensitivity=1.0, target_cols=['Age', 'Income', 'Gender', 'Education', 'Credit_Score','Political_Ideology', 'Political_Party'])
    dp_model = Model(model_desc)
    dp_model.fit(dp_df)
    dp_stats = calc_stats(dp_model)

    dp_results.append({
        'epsilon': eps,
        'CFI': dp_stats.loc['Value','CFI'],
        'TLI': dp_stats.loc['Value','TLI'],
        'RMSEA': dp_stats.loc['Value','RMSEA']
    })

Compares the model fit statistics (CFI, TLI, RMSEA) between the original model and versions with different levels of differential privacy noise, printing out all metrics to evaluate how increasing privacy protection impacts model performance.

In [110]:
# Original fit metrics
orig_cfi = orig_stats.loc['Value','CFI']
orig_tli = orig_stats.loc['Value','TLI']
orig_rmsea = orig_stats.loc['Value','RMSEA']

print("\n=== Comparison of Fit Indices ===")
print(f"Original:   CFI={orig_cfi:.3f}, TLI={orig_tli:.3f}, "
      f"RMSEA={orig_rmsea:.3f}")
for res in dp_results:
    print(f"Epsilon={res['epsilon']}: "
          f"CFI={res['CFI']:.3f}, "
          f"TLI={res['TLI']:.3f}, "
          f"RMSEA={res['RMSEA']:.3f}")


=== Comparison of Fit Indices ===
Original:   CFI=0.956, TLI=0.945, RMSEA=0.107
Epsilon=0.1: CFI=0.977, TLI=0.972, RMSEA=0.075
Epsilon=0.5: CFI=0.556, TLI=0.442, RMSEA=0.332
Epsilon=1: CFI=0.525, TLI=0.402, RMSEA=0.344
Epsilon=3: CFI=0.977, TLI=0.971, RMSEA=0.075
Epsilon=5: CFI=0.978, TLI=0.972, RMSEA=0.074
Epsilon=8: CFI=0.584, TLI=0.477, RMSEA=0.321
Epsilon=10: CFI=0.979, TLI=0.974, RMSEA=0.072


Performs an  comparison of SEM parameter estimates between the original model and versions with differential privacy, focusing on key structural paths and printing detailed statistics to evaluate how privacy protection affects the strength and significance of relationships in the model.

In [111]:
# Store original parameter estimates
key_paths = [
    ('PU', '~', 'PEoU'),
    ('BI', '~', 'PU'),
    ('BI', '~', 'PEoU'),
    ('BI', '~', 'Age'),
    ('BI', '~', 'Income'),
    ('BI', '~', 'Gender'),
    ('BI', '~', 'Education'),
    ('BI', '~', 'Credit_Score'),
    ('BI', '~', 'Political_Ideology'),
    ('BI', '~', 'Political_Party')
]

orig_params = model.inspect()[['lval','op','rval','Estimate','p-value']]
param_comparisons = {}

for eps in epsilons:
    dp_df = add_dp_noise(df, epsilon=eps, sensitivity=1.0,
                        target_cols=['Age', 'Income', 'Gender', 'Education',
                                   'Credit_Score', 'Political_Ideology', 'Political_Party'])
    dp_model = Model(model_desc)
    dp_model.fit(dp_df)
    dp_params = dp_model.inspect()[['lval','op','rval','Estimate','p-value']]
    param_comparisons[eps] = dp_params

print("\n=== Key Parameter Estimates Comparison ===")
for lval, op, rval in key_paths:
    path_mask = (orig_params['lval'] == lval) & (orig_params['op'] == op) & (orig_params['rval'] == rval)
    if not path_mask.any():
        continue

    row = orig_params[path_mask].iloc[0]
    print(f"\nPath: {lval} {op} {rval}")
    print(f"Original: Est={row['Estimate']:.6f}, p={row['p-value']:.6f}")

    for eps in epsilons:
        dp_row = param_comparisons[eps][path_mask].iloc[0]
        print(f"Epsilon={eps}: Est={dp_row['Estimate']:.6f}, p={dp_row['p-value']:.6f}")


=== Key Parameter Estimates Comparison ===

Path: PU ~ PEoU
Original: Est=0.343504, p=0.000000
Epsilon=0.1: Est=1.026438, p=0.000000
Epsilon=0.5: Est=0.759675, p=0.000000
Epsilon=1: Est=0.577568, p=0.000000
Epsilon=3: Est=0.418365, p=0.000000
Epsilon=5: Est=0.362214, p=0.000000
Epsilon=8: Est=0.363012, p=0.000000
Epsilon=10: Est=0.352474, p=0.000000

Path: BI ~ PU
Original: Est=0.479664, p=0.000000
Epsilon=0.1: Est=0.800557, p=0.000000
Epsilon=0.5: Est=0.727285, p=0.000000
Epsilon=1: Est=0.656941, p=0.000000
Epsilon=3: Est=0.553110, p=0.000000
Epsilon=5: Est=0.508011, p=0.000000
Epsilon=8: Est=0.497155, p=0.000000
Epsilon=10: Est=0.494128, p=0.000000

Path: BI ~ PEoU
Original: Est=0.301767, p=0.000000
Epsilon=0.1: Est=0.351070, p=0.000000
Epsilon=0.5: Est=0.320807, p=0.000000
Epsilon=1: Est=0.324638, p=0.000000
Epsilon=3: Est=0.316472, p=0.000000
Epsilon=5: Est=0.302191, p=0.000000
Epsilon=8: Est=0.303476, p=0.000000
Epsilon=10: Est=0.305600, p=0.000000

Path: BI ~ Age
Original: Est=0

In [119]:
import matplotlib.pyplot as plt

In [121]:
df

Unnamed: 0,Age,Gender,Income,Education,Credit_Score,Political_Ideology,Political_Party,PEoU1,PEoU2,PEoU3,PU1,PU2,PU3,BI1,BI2,BI3
0,58,1,110959.029791,8,575,5,1,5,5,5,6,6,6,6,6,6
1,48,0,142062.164417,3,347,3,2,4,4,4,4,4,4,5,5,5
2,34,0,40182.074912,2,510,1,0,3,3,3,2,2,2,2,2,2
3,62,0,136550.119468,7,839,4,1,6,6,5,7,7,7,7,7,7
4,27,0,24548.846424,6,490,6,2,4,4,4,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,45,1,147068.034870,7,639,6,0,5,5,5,6,6,6,6,6,6
996,53,0,89395.136614,3,474,2,2,4,4,4,4,4,4,4,4,4
997,64,1,68854.925587,3,750,5,1,5,5,5,5,5,5,5,5,5
998,25,1,144921.069128,2,435,7,0,4,4,4,4,4,4,4,4,4
