In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

safi = pd.read_csv("/content/safi_data.csv")

In [13]:
safi

Unnamed: 0,key_ID,village,interview_date,no_membrs,years_liv,respondent_wall_type,rooms,memb_assoc,affect_conflicts,liv_count,...,has_radio,has_cow_plough,has_solar_panel,has_bicycle,has_solar_torch,has_table,has_motorcycle,has_television,has_cow_cart,segment
0,1,God,2016-11-17T00:00:00Z,3,4,muddaub,1,missing,missing,1,...,0,0,1,1,0,1,0,1,0,2
1,1,God,2016-11-17T00:00:00Z,7,9,muddaub,1,yes,once,3,...,1,1,1,1,1,1,0,0,1,2
2,3,God,2016-11-17T00:00:00Z,10,15,burntbricks,1,missing,missing,1,...,0,0,0,0,1,0,0,0,0,1
3,4,God,2016-11-17T00:00:00Z,7,6,burntbricks,1,missing,missing,2,...,1,1,1,1,0,0,0,0,0,1
4,5,God,2016-11-17T00:00:00Z,7,40,burntbricks,1,missing,missing,4,...,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,126,Ruaca,2017-05-18T00:00:00Z,3,7,burntbricks,1,no,more_once,3,...,1,0,1,0,0,0,0,0,0,1
127,193,Ruaca,2017-06-04T00:00:00Z,7,10,cement,3,no,more_once,3,...,1,1,0,0,1,1,0,1,0,2
128,194,Ruaca,2017-06-04T00:00:00Z,4,5,muddaub,1,no,more_once,1,...,1,0,1,0,1,0,0,0,0,2
129,199,Chirodzo,2017-06-04T00:00:00Z,7,17,burntbricks,2,yes,more_once,2,...,1,1,1,0,1,0,0,1,1,2


In [None]:
#M1 - previous stage
tmp = safi[["village","items_owned"]].copy()
tmp["items"] = tmp["items_owned"].fillna("no_listed_items").replace("", "no_listed_items")
tmp["items"] = tmp["items"].str.split(";")
long = tmp.explode("items")
long["items"] = long["items"].str.strip()

counts = long.groupby(["village","items"]).size().reset_index(name="n")
pct = counts.assign(pct=counts["n"] / counts.groupby("village")["n"].transform("sum") * 100)

heat = pct.pivot(index="items", columns="village", values="pct").fillna(0)

plt.figure(figsize=(6,8))
plt.imshow(heat.values, aspect="auto")
plt.xticks(range(heat.shape[1]), heat.columns, rotation=45)
plt.yticks(range(heat.shape[0]), heat.index)
plt.colorbar(label="% of respondents")
plt.title("Asset ownership by village")
plt.tight_layout()
plt.show()

In [None]:
safi["number_items"] = safi["items_owned"].fillna("").apply(
    lambda x: 0 if x == "" else len([i for i in x.split(";") if i.strip()])
)

order = sorted(safi["village"].dropna().unique())
data = [safi.loc[safi["village"]==v, "number_items"].dropna() for v in order]

plt.figure(figsize=(6,4))
plt.boxplot(data, labels=order)
plt.ylabel("number_items")
plt.title("Distribution of items owned by village")
plt.tight_layout()
plt.show()

In [25]:
# M2 - Current Stage
# EDA & Preprocessing
# Text conversion on assets
import re

def split_sc(x):
    if pd.isna(x) or str(x).strip() == "":
        return []
    return [t.strip() for t in str(x).split(";") if t.strip() != ""]

items_series = safi["items_owned"].fillna("").astype(str)
items_list = items_series.apply(split_sc)

items_exploded = items_list.explode()
items_exploded = items_exploded[items_exploded != ""]
top_items = items_exploded.value_counts().head(10).index.tolist()

for it in top_items:
    safe = re.sub(r"[^A-Za-z0-9_]+", "_", it).strip("_")
    safi[f"has_{safe}"] = items_list.apply(lambda lst: int(it in lst))

In [27]:
# Text conversion on food insecurity
safi["food_shortage_month_count"] = safi["months_lack_food"].apply(split_sc).apply(len)

safi

Unnamed: 0,key_ID,village,interview_date,no_membrs,years_liv,respondent_wall_type,rooms,memb_assoc,affect_conflicts,liv_count,...,has_cow_plough,has_solar_panel,has_bicycle,has_solar_torch,has_table,has_motorcycle,has_television,has_cow_cart,segment,has_motorcyle
0,1,God,2016-11-17T00:00:00Z,3,4,muddaub,1,missing,missing,1,...,0,1,1,0,1,0,1,0,2,0
1,1,God,2016-11-17T00:00:00Z,7,9,muddaub,1,yes,once,3,...,1,1,1,1,1,0,0,1,1,0
2,3,God,2016-11-17T00:00:00Z,10,15,burntbricks,1,missing,missing,1,...,0,0,0,1,0,0,0,0,2,0
3,4,God,2016-11-17T00:00:00Z,7,6,burntbricks,1,missing,missing,2,...,1,1,1,0,0,0,0,0,1,0
4,5,God,2016-11-17T00:00:00Z,7,40,burntbricks,1,missing,missing,4,...,1,0,0,0,0,1,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,126,Ruaca,2017-05-18T00:00:00Z,3,7,burntbricks,1,no,more_once,3,...,0,1,0,0,0,1,0,0,2,1
127,193,Ruaca,2017-06-04T00:00:00Z,7,10,cement,3,no,more_once,3,...,1,0,0,1,1,0,1,0,1,0
128,194,Ruaca,2017-06-04T00:00:00Z,4,5,muddaub,1,no,more_once,1,...,0,1,0,1,0,0,0,0,2,0
129,199,Chirodzo,2017-06-04T00:00:00Z,7,17,burntbricks,2,yes,more_once,2,...,1,1,0,1,0,1,1,1,1,1


In [31]:
# Categorical cleaning
safi["memb_assoc"] = safi["memb_assoc"].fillna("missing").astype(str)
safi["affect_conflicts"] = safi["affect_conflicts"].fillna("missing").astype(str)

safi

Unnamed: 0,key_ID,village,interview_date,no_membrs,years_liv,respondent_wall_type,rooms,memb_assoc,affect_conflicts,liv_count,...,has_cow_plough,has_solar_panel,has_bicycle,has_solar_torch,has_table,has_motorcycle,has_television,has_cow_cart,segment,has_motorcyle
0,1,God,2016-11-17T00:00:00Z,3,4,muddaub,1,missing,missing,1,...,0,1,1,0,1,0,1,0,2,0
1,1,God,2016-11-17T00:00:00Z,7,9,muddaub,1,yes,once,3,...,1,1,1,1,1,0,0,1,1,0
2,3,God,2016-11-17T00:00:00Z,10,15,burntbricks,1,missing,missing,1,...,0,0,0,1,0,0,0,0,2,0
3,4,God,2016-11-17T00:00:00Z,7,6,burntbricks,1,missing,missing,2,...,1,1,1,0,0,0,0,0,1,0
4,5,God,2016-11-17T00:00:00Z,7,40,burntbricks,1,missing,missing,4,...,1,0,0,0,0,1,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,126,Ruaca,2017-05-18T00:00:00Z,3,7,burntbricks,1,no,more_once,3,...,0,1,0,0,0,1,0,0,2,1
127,193,Ruaca,2017-06-04T00:00:00Z,7,10,cement,3,no,more_once,3,...,1,0,0,1,1,0,1,0,1,0
128,194,Ruaca,2017-06-04T00:00:00Z,4,5,muddaub,1,no,more_once,1,...,0,1,0,1,0,0,0,0,2,0
129,199,Chirodzo,2017-06-04T00:00:00Z,7,17,burntbricks,2,yes,more_once,2,...,1,1,0,1,0,1,1,1,1,1


In [35]:
# Table for clustering
binary_cols = [c for c in safi.columns if c.startswith("has_")]

feature_cols = [
    "no_meals",
    "liv_count",
    "number_items",
    "food_shortage_month_count",
    "memb_assoc",
    "affect_conflicts",
] + binary_cols

X = safi[feature_cols].copy()

num_cols = ["no_meals","liv_count","number_items","food_shortage_month_count"]
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")
    X[c] = X[c].fillna(X[c].median())

X

Unnamed: 0,no_meals,liv_count,number_items,food_shortage_month_count,memb_assoc,affect_conflicts,has_mobile_phone,has_radio,has_cow_plough,has_solar_panel,has_bicycle,has_solar_torch,has_table,has_motorcycle,has_television,has_cow_cart,has_motorcyle
0,2,1,4,1,missing,missing,0,0,0,1,1,0,1,0,1,0,0
1,2,3,8,5,yes,once,1,1,1,1,1,1,1,0,0,1,0
2,2,1,1,6,missing,missing,0,0,0,0,0,1,0,0,0,0,0
3,2,2,5,4,missing,missing,1,1,1,1,1,0,0,0,0,0,0
4,2,4,4,4,missing,missing,1,1,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,3,3,3,3,no,more_once,0,1,0,1,0,0,0,1,0,0,1
127,3,3,12,1,no,more_once,1,1,1,0,0,1,1,0,1,0,0
128,3,1,4,3,no,more_once,1,1,0,1,0,1,0,0,0,0,0
129,3,2,12,2,yes,more_once,1,1,1,1,0,1,0,1,1,1,1


In [38]:
# Analysis & Experiments
!pip -q install gower

import gower
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score
import pandas as pd


In [41]:
# Compute Gower distance
D = gower.gower_matrix(X)
D_condensed = squareform(D, checks=False)
D_condensed


array([0.5392157 , 0.33529413, 0.33039215, ..., 0.40490195, 0.2784314 ,
       0.45      ], dtype=float32)

In [48]:
# clustering
Z_avg = linkage(D_condensed, method="average")
Z_avg

array([[3.00000000e+01, 3.80000000e+01, 0.00000000e+00, 2.00000000e+00],
       [6.00000000e+01, 9.00000000e+01, 0.00000000e+00, 2.00000000e+00],
       [5.00000000e+00, 7.40000000e+01, 5.88235306e-03, 2.00000000e+00],
       [1.60000000e+01, 7.10000000e+01, 5.88235306e-03, 2.00000000e+00],
       [6.20000000e+01, 1.31000000e+02, 1.76470596e-02, 3.00000000e+00],
       [7.50000000e+01, 1.33000000e+02, 1.86274517e-02, 3.00000000e+00],
       [1.80000000e+01, 2.60000000e+01, 2.64705885e-02, 2.00000000e+00],
       [4.10000000e+01, 1.15000000e+02, 2.94117648e-02, 2.00000000e+00],
       [3.70000000e+01, 3.90000000e+01, 4.11764719e-02, 2.00000000e+00],
       [5.80000000e+01, 1.36000000e+02, 4.44444455e-02, 4.00000000e+00],
       [1.02000000e+02, 1.05000000e+02, 4.50980403e-02, 2.00000000e+00],
       [2.00000000e+01, 5.30000000e+01, 4.70588244e-02, 2.00000000e+00],
       [4.70000000e+01, 5.00000000e+01, 5.29411770e-02, 2.00000000e+00],
       [6.50000000e+01, 8.20000000e+01, 5.88235296e

In [45]:
# evaluate k = 2..8 with silhouette
rows = []
for k in range(2, 9):
    labels = fcluster(Z_avg, t=k, criterion="maxclust")
    sil = silhouette_score(D, labels, metric="precomputed")
    rows.append({"k": k, "silhouette": sil})

sil_avg = pd.DataFrame(rows).sort_values("silhouette", ascending=False)
display(sil_avg)

Unnamed: 0,k,silhouette
0,2,0.288989
2,4,0.225067
3,5,0.203501
4,6,0.202721
5,7,0.195743
6,8,0.18595
1,3,0.176724


In [47]:
# Choose k and assign segments
best_k = int(sil_avg.iloc[0]["k"])
safi["segment"] = fcluster(Z_avg, t=best_k, criterion="maxclust").astype(int)

print("Chosen k =", best_k)
display(safi["segment"].value_counts().sort_index())

Chosen k = 2


Unnamed: 0_level_0,count
segment,Unnamed: 1_level_1
1,94
2,37
