<a href="https://colab.research.google.com/github/jefernandezec/s2s/blob/main/S2S.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ─── Toy survey dataset generation (R) ─────────────────────────────────────

set.seed(0)
n <- 1000

# 1. Simulate raw variables
age             <- sample(18:79, n, replace = TRUE)
age_sq          <- age^2
house_ownership <- rbinom(n, 1, 0.6)   # 1 = owns house
labor_status    <- rbinom(n, 1, 0.7)   # 1 = employed
electricity     <- rbinom(n, 1, 0.8)   # 1 = has electricity

walls  <- sample(c("brick","mud"),          n, replace = TRUE, prob = c(0.7, 0.3))
roof   <- sample(c("metal","thatch"),       n, replace = TRUE, prob = c(0.8, 0.2))
floor  <- sample(c("dirt","concrete","tile"), n, replace = TRUE, prob = c(0.2,0.6,0.2))
area   <- sample(c("urban","rural"),        n, replace = TRUE, prob = c(0.5,0.5))

df <- data.frame(
  age, age_sq, house_ownership, labor_status, electricity,
  walls = factor(walls), roof = factor(roof),
  floor = factor(floor), area = factor(area)
)

# 2. Create dummy indicators (one column per level)
dummies <- model.matrix(~ walls + roof + floor + area - 1, data = df)
print(colnames(dummies))    # inspect which dummy columns you got
df <- cbind(df, dummies)

# 3. Define “base” coefficients
base_coeffs <- c(
  age             = 0.3,
  age_sq          = -0.002,
  house_ownership = 5,
  labor_status    = 10,
  electricity     = 8
)

# 4. Define dummy‐variable coefficients
#    (only for the dummy columns you actually have)
dummy_coeffs <- c(
  wallsbrick    =  3,
  wallsmud      = -3,
  roofmetal     =  4,
  roofthatch    = -4,
  floordirt     = -5,
  floorconcrete =  5,
  floortile     = 10,
  areaurban     =  7,
  arearural     = -7
)

# 5. Build the linear predictor
# 5.1 Start with the “base” terms
linear_part <- with(df,
  age * base_coeffs["age"] +
  age_sq * base_coeffs["age_sq"] +
  house_ownership * base_coeffs["house_ownership"] +
  labor_status * base_coeffs["labor_status"] +
  electricity * base_coeffs["electricity"]
)

# 5.2 Add in each dummy term that exists in df
for (nm in intersect(names(dummy_coeffs), colnames(df))) {
  linear_part <- linear_part + df[[nm]] * dummy_coeffs[nm]
}

# 6. Add non‐normal (exponential) noise
noise <- rexp(n, rate = 1/10) - 10

# 7. Final consumption per capita
df$consumption_per_capita <- linear_part + noise

# 8. Quick check
head(df)


[1] "wallsbrick" "wallsmud"   "roofthatch" "floordirt"  "floortile" 
[6] "areaurban" 


Unnamed: 0_level_0,age,age_sq,house_ownership,labor_status,electricity,walls,roof,floor,area,wallsbrick,wallsmud,roofthatch,floordirt,floortile,areaurban,consumption_per_capita
Unnamed: 0_level_1,<int>,<dbl>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,31,961,0,1,1,mud,metal,dirt,urban,0,1,0,1,0,1,14.7274878
2,74,5476,1,1,1,mud,thatch,concrete,rural,0,1,1,0,0,0,25.5128822
3,21,441,1,1,1,mud,metal,concrete,rural,0,1,0,0,0,0,16.099514
4,56,3136,1,1,1,brick,metal,concrete,urban,1,0,0,0,0,1,41.742291
5,18,324,0,0,1,mud,metal,concrete,rural,0,1,0,0,0,0,0.0273705
6,51,2601,0,1,0,brick,metal,concrete,rural,1,0,0,0,0,0,13.3279851
