In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw Stata file. convert_categoricals=False keeps the stored numeric
# codes instead of converting value-labelled columns to pandas Categoricals.
df = pd.read_stata("../data/raw/MWI_2010_individual.dta", convert_categoricals=False)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Display the column labels of the frame as loaded.
df.columns

In [None]:
# Per-column count of missing values in the loaded frame.
# (The original referenced `df3`, which is never defined in this notebook and
# raised NameError on a fresh kernel; `df` is the only frame available here.)
df.isnull().sum()

In [None]:
import pandas as pd

# -------------------------------
# 1. Drop very sparse columns (>90% missing)
# -------------------------------
# The >90% figure is the threshold recorded by the original analysis; it is not
# recomputed in this cell.
drop_cols = [
    "ind_health2",
    "ind_health4",
    "ind_health6",
    "ind_health7",
    "ind_health8",
    "ind_birthplace",
    "ind_birthattend",
    "ind_work5",
    "ind_language",  # optional: remove this entry to keep the column
]

# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone, and an unguarded drop would raise KeyError.
df = df.drop(columns=drop_cols, errors="ignore")

# -------------------------------
# 2. Impute / encode categorical missing values
# -------------------------------
# columns where missing means "unknown"
unknown_cols = [
    "ind_educfath",
    "ind_educmoth",
    "ind_religion",
    "ind_marital",
    "ind_rwchichewa",
    "ind_rwenglish",
    "ind_educ01",
    "ind_breakfast",
    "ind_work1",
    "ind_work2",
    "ind_work6",
]

# -------------------------------
# 3. Education conditional columns → "not_applicable"
# -------------------------------
educ_conditional = [
    "ind_educ02",
    "ind_educ03",
    "ind_educ04",
    "ind_educ05",
    "ind_educ06",
    "ind_educ07",
    "ind_educ08",
    "ind_educ09",
    "ind_educ10",
    "ind_educ11",
    "ind_educ12",
]

# -------------------------------
# 4. Health variables with smaller missingness
# -------------------------------
health_cols = ["ind_health1", "ind_health3", "ind_health5"]

# -------------------------------
# 5. Work variable with high NA but keep → "not_applicable"
# -------------------------------
# Placeholder label -> columns that receive it. The column groups are disjoint,
# so the order in which the rules are applied does not matter.
fill_rules = {
    "unknown": unknown_cols,
    "not_applicable": [*educ_conditional, "ind_work3"],
    "no_response": health_cols,
}

for fill_label, fill_cols in fill_rules.items():
    for col in fill_cols:
        # Skip absent columns (same guard as the later cleaning cell) and columns
        # with nothing to fill. The second check matters on re-runs: once a column
        # has been cast to category below, fillna with a label that is not one of
        # its categories is rejected even when no value is missing.
        if col in df.columns and df[col].isna().any():
            df[col] = df[col].fillna(fill_label)

# -------------------------------
# 6. Convert to categorical where appropriate
# -------------------------------
# Everything except the two numeric columns is cast to category.
categorical_cols = [col for col in df.columns if col not in ["ind_age", "wta_hh"]]
df[categorical_cols] = df[categorical_cols].astype("category")

In [None]:
# Display the column labels that remain after the cleaning cell above.
df.columns

In [None]:
import pandas as pd

# ---------- Load raw ----------
raw_path = "../data/raw/MWI_2010_individual.dta"
df_raw = pd.read_stata(raw_path, convert_categoricals=False)

# ---------- Missingness summary BEFORE ----------
# Per-column missing counts, largest first.
missing_counts = df_raw.isna().sum().sort_values(ascending=False)
print("\nMissing values BEFORE cleaning (top 30):")
print(missing_counts.head(30))

# ---------- Missingness heatmap (BEFORE) ----------
# Cap the number of plotted rows so the figure stays readable and quick to draw.
sample_n = min(5000, len(df_raw))
df_samp = df_raw.sample(n=sample_n, random_state=42)

# 1 = missing, 0 = present; columns are ordered by their share of missing values.
miss_flags = df_samp.isna().astype(int)
col_order = miss_flags.mean().sort_values(ascending=False).index
miss_mat = miss_flags[col_order].to_numpy()

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(miss_mat, aspect="auto", interpolation="nearest")
ax.set_title(f"Missingness Heatmap (before cleaning) — {sample_n} sampled rows")
ax.set_xlabel("Columns (sorted by % missing)")
ax.set_ylabel("Rows (sample)")
fig.tight_layout()
plt.show()

# ---------- CLEANING RULES ----------
# 1) Columns removed outright (>~90% missing or per earlier decision).
#    Labels that are not present in the raw frame are ignored.
drop_cols = [
    "ind_health2",
    "ind_health4",
    "ind_health6",
    "ind_health7",
    "ind_health8",
    "ind_birthplace",
    "ind_birthattend",
    "ind_work5",
    "ind_language",  # optional drop (high missingness); remove if you prefer to keep
]
df = df_raw.drop(columns=drop_cols, errors="ignore")

# 2) Columns whose missing values are labelled "unknown"
unknown_cols = [
    "ind_educfath",
    "ind_educmoth",
    "ind_religion",
    "ind_marital",
    "ind_rwchichewa",
    "ind_rwenglish",
    "ind_educ01",
    "ind_breakfast",
    "ind_work1",
    "ind_work2",
    "ind_work6",
]

# 3) Conditional education columns, labelled "not_applicable"
educ_conditional = [f"ind_educ{i:02d}" for i in range(2, 13)]

# 4) Health columns with smaller missingness get "no_response";
# 5) ind_work3 has high NA but is kept and gets "not_applicable".
fill_plan = [
    ("unknown", unknown_cols),
    ("not_applicable", educ_conditional),
    ("no_response", ["ind_health1", "ind_health3", "ind_health5"]),
    ("not_applicable", ["ind_work3"]),
]
for fill_label, targets in fill_plan:
    for name in targets:
        if name not in df.columns:
            continue
        df[name] = df[name].fillna(fill_label)

# 6) Dtypes: every column except the numeric ones becomes a category.
numeric_keep = ["ind_age", "wta_hh"]
categorical_cols = [name for name in df.columns if name not in numeric_keep]
df[categorical_cols] = df[categorical_cols].astype("category")

# ---------- EDA AFTER ----------
print("\nMissing values AFTER cleaning:")
print(df.isna().sum())

# One bar chart of value counts for every column that received a placeholder label.
imputed_cols = unknown_cols + educ_conditional + [
    "ind_work3",
    "ind_health1",
    "ind_health3",
    "ind_health5",
]
for col in imputed_cols:
    if col not in df.columns:
        continue
    fig, ax = plt.subplots(figsize=(6, 3))
    df[col].value_counts(dropna=False).plot(kind="bar", ax=ax)
    ax.set_title(f"Distribution after imputation: {col}")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    plt.show()

In [None]:
# Class counts of the target column `poor` (original note: "balanced dataset").
df["poor"].value_counts()

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler


# ------------------------------
# Helper: build preprocessor
# ------------------------------
def _cast_block_to_str(A):
    """Return ``A`` with every value converted to ``str``.

    Kept as a module-level function rather than a lambda: the standard
    ``pickle`` module cannot serialise lambdas, so a pipeline containing a
    lambda-based ``FunctionTransformer`` could not be pickled.
    """
    return A.astype(str)


def build_preprocessor(X, model_type="tree", numeric_features=("ind_age", "wta_hh")):
    """
    Build a ColumnTransformer for the columns of ``X``.

    Categorical columns are coerced to string before encoding to avoid
    mixed-type errors like 'Encoders require uniformly strings or numbers'.

    Parameters
    ----------
    X : object exposing ``.columns``
        Only the column labels are read; every column not listed in
        ``numeric_features`` is treated as categorical.
    model_type : {'tree', 'linear', 'knn', 'svm'}
        'tree' passes numeric columns through and ordinal-encodes the
        categorical ones (unseen categories map to -1). The other three scale
        numeric columns with StandardScaler and one-hot encode the categorical
        ones (unseen categories are ignored).
    numeric_features : iterable of str
        Column labels handled as numeric.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a "num" and a "cat" branch; any column not
        covered by either branch is dropped.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    # Fail fast, before any transformer is constructed.
    if model_type not in ("tree", "linear", "knn", "svm"):
        raise ValueError("model_type must be 'tree', 'linear', 'knn', or 'svm'")

    numeric_features = list(numeric_features)
    categorical_features = [c for c in X.columns if c not in numeric_features]

    # Step to coerce the categorical block to string
    to_str = FunctionTransformer(_cast_block_to_str, feature_names_out="one-to-one")

    if model_type == "tree":
        # Trees don't need scaling; OrdinalEncoder gives compact features
        num_pipe = "passthrough"
        encoder_step = (
            "ord",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        )
    else:
        # linear / knn / svm: scale numeric columns, one-hot the categorical ones
        num_pipe = StandardScaler()
        encoder_step = ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))

    cat_pipe = Pipeline([("to_str", to_str), encoder_step])

    return ColumnTransformer(
        transformers=[
            ("num", num_pipe, numeric_features),
            ("cat", cat_pipe, categorical_features),
        ],
        remainder="drop",
    )

In [None]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# New imports
from xgboost import XGBClassifier

# ------------------------------
# Features & target
# ------------------------------
X = df.drop(columns=["poor"])
y = df["poor"]

# ------------------------------
# Hold-out split: 20% test, stratified on the target
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ------------------------------
# Candidate models (no SVM)
# Each entry maps a display name to (estimator, preprocessing flavour).
# ------------------------------
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), "linear"),
    "Random Forest": (RandomForestClassifier(n_estimators=200, random_state=42), "tree"),
    "KNN": (KNeighborsClassifier(n_neighbors=5), "knn"),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        "tree",
    ),
    "LightGBM": (LGBMClassifier(random_state=42), "tree"),
}

# ------------------------------
# Fit each pipeline on the training split and report test-set metrics
# ------------------------------
for name, (model, model_type) in models.items():
    print(f"\n🔹 Training {name}...")
    preproc = build_preprocessor(X, model_type=model_type)
    pipe = Pipeline(steps=[("preprocess", preproc), ("model", model)])

    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    print(f"✅ {name} Results")
    for heading, metric in (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    ):
        print(heading, metric(y_test, y_pred))