In [33]:
import os

import numpy as np
import pandas as pd

In [34]:
# Column names of the input CSV that the rest of the notebook refers to.
# NOTE(review): GEN_015 / SDC_015 / GEODGHR4 look like survey variable codes;
# their exact meaning should be confirmed against the data dictionary.
NDVI = "NDVI"
SDC_015 = "SDC_015"
GEN_015 = "GEN_015"
GEO = "GEODGHR4"

In [35]:
# Location of the input CSV. Set the YOUREKA_DATA_CSV environment variable to
# point at the file on another machine; the fallback is the original author's
# local path, so existing runs behave exactly as before.
DATA_CSV = os.environ.get(
    "YOUREKA_DATA_CSV",
    "C:\\Users\\xia_t\\Desktop\\Projects\\youreka\\dataset\\all_subsetted_group.csv",
)

data = pd.read_csv(DATA_CSV)

# Filter on indigenous status (SDC_015) and mental health (GEN_015): keep only
# rows with SDC_015 <= 2 and GEN_015 <= 5. Rows with a missing value in either
# column fail the comparison and are dropped as well.
# NOTE(review): presumably the higher codes are non-response categories, and
# the column-to-concept mapping is assumed from the variable names — confirm.
keep = (data[SDC_015] <= 2) & (data[GEN_015] <= 5)

# Later cells add and overwrite columns on `data`; take an explicit copy so
# those writes never target a slice of the frame returned by read_csv.
data = data.loc[keep].copy()

In [36]:
# Min-max scale NDVI: the smallest observed value becomes 0, the largest 1.
n = data[NDVI]
ndvi_low, ndvi_high = n.min(), n.max()
data[NDVI] = n.sub(ndvi_low).div(ndvi_high - ndvi_low)

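# Reference values on the raw (un-normalized) NDVI scale. These look like the
# observed min/max and the two cut points that split that range into thirds:
#   min      0.40255
#   1/3 cut  0.53163
#   2/3 cut  0.660716
#   max      0.7898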

In [37]:
# unique respondent identifier
import uuid

# Attach one freshly generated random UUID (uuid4), as a string, to every row.
# uuid4 values are random, so the UUID column differs on every run.
sample_n = data.shape[0]
uuids = list(map(str, (uuid.uuid4() for _ in range(sample_n))))
data["UUID"] = uuids

In [38]:
# Categorize NDVI into N_BINS equal-width intervals over [0, 1].
N_BINS = 3
bins = np.linspace(0, 1, N_BINS + 1)

# This cell overwrites NDVI with the bin index, so running it a second time
# would bin the indices themselves and silently produce wrong categories.
# Fail loudly instead: anything outside [0, 1] cannot be a normalized NDVI.
if not data[NDVI].dropna().between(0, 1).all():
    raise ValueError(
        f"{NDVI} contains values outside [0, 1]; it has probably already been "
        "binned. Re-run the notebook from the data-loading cell."
    )

# Right-closed intervals with the lowest edge included, i.e. [0, e1], (e1, e2],
# ..., (e_{k-1}, 1]. labels=False yields the 0-based bin index; values that
# fall in no interval (including missing values) become NaN. The cast keeps
# the column float-valued, as it was when filled bin by bin.
data["NDVI_BINS"] = pd.cut(
    data[NDVI], bins=bins, labels=False, right=True, include_lowest=True
).astype(float)

# From here on NDVI holds the bin index rather than the continuous value.
data[NDVI] = data["NDVI_BINS"]

In [39]:
from scipy.stats.contingency import crosstab
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

class TestResult:
    """Plain container pairing a chi-square result with an association value."""

    def __init__(self, chi2, cramer) -> None:
        # chi2:   whatever the caller passes as the chi-square test result
        # cramer: the association measure, or None when it was not computed
        self.chi2, self.cramer = chi2, cramer


def chi2_test(contingency_table, a = True):
    """Run a chi-square test of independence on a contingency table.

    Parameters
    ----------
    contingency_table
        Table of observed counts, passed unchanged to scipy.
    a
        When true, also compute Cramér's V with ``association``; when false
        the ``cramer`` field of the result is ``None``.

    Returns
    -------
    TestResult
        ``chi2`` holds the value returned by ``chi2_contingency`` and
        ``cramer`` the association measure (or ``None``).
    """
    chi2_result = chi2_contingency(contingency_table, correction=True)

    cramer_v = None
    if a:
        cramer_v = association(contingency_table, method="cramer", correction=True)

    return TestResult(chi2_result, cramer_v)

def gen_table(cols, data):
    """Cross-tabulate the columns named in ``cols`` and return the count array.

    ``data`` is indexed with each name in ``cols`` (``data[col]``); the
    resulting sequences are handed to ``crosstab`` in the same order, and the
    ``count`` attribute of its result is returned.
    """
    factors = tuple(data[name] for name in cols)
    table = crosstab(*factors)
    return table.count


In [40]:
from tqdm.notebook import trange

# Number of resampling rounds; the round number doubles as the random seed.
N_ITERATIONS = 250

# Rows drawn from every group in each round, one size per grouping.
# NOTE(review): presumably chosen so that the smallest group can supply this
# many rows (sampling is without replacement) — confirm.
SDC_015_SAMPLE_SIZE = 67
NDVI_SAMPLE_SIZE = 65
INTER_SAMPLE_SIZE = 47


def sample_per_group(grouped, size, seed):
    """Draw ``size`` rows from every group and stack them into one frame.

    Each group is sampled independently with ``random_state=seed``, and the
    groups are concatenated in the order the groupby yields them. This selects
    the same rows as ``grouped.apply(lambda x: x.sample(size, random_state=seed))``
    but does not rely on ``apply`` passing the grouping columns to the
    function, which pandas has deprecated (since 2.2). The grouping columns
    therefore stay available as ordinary columns of the result.

    Raises ``ValueError`` (from ``DataFrame.sample``) if a group has fewer
    than ``size`` rows.
    """
    return pd.concat(
        [group.sample(size, random_state=seed) for _, group in grouped],
        ignore_index=True,
    )


grouped_sdc_015 = data.groupby([SDC_015])
grouped_ndvi = data.groupby([NDVI])
grouped_inter = data.groupby([SDC_015, NDVI])

# One tuple per round: (SDC_015 test, NDVI test, interaction test, seed).
results_list = []

for i in trange(N_ITERATIONS):
    # SDC_015 x GEN_015 on a sample balanced across SDC_015 groups
    data_sdc_015 = sample_per_group(grouped_sdc_015, SDC_015_SAMPLE_SIZE, i)
    contingency_sdc_015 = gen_table([SDC_015, GEN_015], data_sdc_015)
    sdc_015 = chi2_test(contingency_sdc_015)

    # NDVI bin x GEN_015 on a sample balanced across NDVI bins
    data_ndvi = sample_per_group(grouped_ndvi, NDVI_SAMPLE_SIZE, i)
    contingency_ndvi = gen_table([NDVI, GEN_015], data_ndvi)
    ndvi = chi2_test(contingency_ndvi)

    # Three-factor table balanced across (SDC_015, NDVI bin) cells; the
    # association measure is skipped for this table (a=False).
    data_inter = sample_per_group(grouped_inter, INTER_SAMPLE_SIZE, i)
    contingency_inter = gen_table([SDC_015, NDVI, GEN_015], data_inter)
    inter = chi2_test(contingency_inter, a=False)

    results_list.append((sdc_015, ndvi, inter, i))


  0%|          | 0/250 [00:00<?, ?it/s]

In [41]:
# Name prefix for each test stored in a results_list tuple, in tuple order.
TESTS = [
    SDC_015,
    NDVI,
    "INTER"
]

# Column suffix -> function extracting that value from a TestResult.
PROPERTIES = {
    "STATISTIC": lambda x: x.chi2.statistic,
    "PVALUE": lambda x: x.chi2.pvalue,
    "DOF": lambda x: x.chi2.dof,
    "CRAMERV": lambda x: x.cramer
}

# One column per (test, property) pair, followed by the seed of the round.
cols = [
    f"{test}_{property_}"
    for test in TESTS
    for property_ in PROPERTIES
]
cols.append("SEED")

# Flatten every (test, ..., seed) tuple into one row and build the frame in a
# single step. Filling a pre-allocated empty frame row by row with .loc is
# slow and leaves every column with dtype object; building from rows lets
# pandas infer numeric dtypes for the numeric columns.
rows = [
    [extract(test) for test in tests for extract in PROPERTIES.values()] + [seed]
    for *tests, seed in results_list
]
results = pd.DataFrame(rows, columns=cols)


In [42]:
# Write the per-seed test results to "tests.csv" (relative path, so it lands
# in the notebook's current working directory).
results.to_csv("tests.csv")