# Antibody levels in buildings with fixed households

Check whether antibody levels within a building are more similar than one would expect by chance, under the constraint that people stay in the same household.

Here, we implement an approximate permutation by only permuting households of the same size.

## Data preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from copy import deepcopy
import multiprocessing as mp
from jabbar import jabbar
from pyprojroot import here
import os

import sys
base_path = str(here("", project_files=[".here"]))
perm_path = os.path.join(base_path, "PermutationStudies")
if perm_path not in sys.path:
    sys.path.insert(0, perm_path)
from src.functions import *

# render matplotlib figures inline in the notebook output
%matplotlib inline

# seed both the stdlib and the NumPy global random number generators
random.seed(0)
np.random.seed(0)

# control variable of interest
# NOTE(review): `var` is not referenced by the later cells visible here, which
# use `address_id` directly -- confirm whether it is still needed
var = 'address_id'
# measurements to study (name of the result column in the data frame)
data_key = 'R2_Result'
# number of permutations
# NOTE(review): `%%n_perm%%` looks like a template placeholder that has to be
# replaced by an integer before this cell can run -- confirm with the runner
n_perm = %%n_perm%%

# identifier combining the measurement column and the permutation count;
# it is passed as `id_` to the save / load / plot helpers further down
id_ = f"bd2_{data_key}_{n_perm}"

In [None]:
# load the two input tables via the project helpers
blab = read_blab(base_path)

geo = read_geo(base_path)

# restrict to study households
geo = geo[geo.hht_ID.isin(blab.hh_id.unique())]

# merge: after the rename both frames carry `hh_id`; with no `on=` argument
# pd.merge joins on all columns the two frames have in common
geo = geo.rename(columns={'hht_ID': 'hh_id'})
data = pd.merge(blab, geo)
print(blab.shape, geo.shape, data.shape, "initially")

# remove duplicate rows: keep only the first record per individual
data = data.drop_duplicates(subset=['ind_id'], keep='first')
print(data.shape, "after remove duplicates")

# remove nans; take an explicit copy so that the column assignment below
# writes to an independent frame instead of a slice of the previous one
# (avoids pandas' SettingWithCopyWarning)
data = data[data[data_key].notnull()].copy()
print(data.shape, "after remove nans")

# translate results: "Positive" -> 1.0, every other non-null value -> 0.0
data[data_key] = (data[data_key] == "Positive").astype(float)

# data plot: histogram of the translated measurements
fig, ax = plt.subplots(figsize=(3, 3))
ax.hist(data[data_key], color='C0', bins=100)
ax.set_xlabel(data_key)
ax.set_ylabel("Frequency")
fig.tight_layout()

## Household aware permutation function

In [None]:
%%time
hh_ids_uq = np.asarray(data.hh_id.unique())
vals_by_hh = []
for hh_id in hh_ids_uq:
    vals_by_hh.append(np.asarray(data[data.hh_id==hh_id][data_key]))
    
bd_ids_uq = np.asarray(data.address_id.unique())
bds_by_hh = []
for hh_id in hh_ids_uq:
    bds_by_hh.append(float(data[data.hh_id==hh_id].address_id.unique()))
bds_by_hh = np.array(bds_by_hh)

## Define statistics

Completely vectorized NumPy implementation, applied to the observed data:

In [None]:
%%time
# this matches other implementations
# statistic on the observed data: the household values and household-to-building
# assignment are passed exactly as built from `data`, without any permutation
real_variance = mean_variance(vals_by_hh, bds_by_hh, bd_ids_uq)
# bare expression so that the notebook displays the value
real_variance

## Permutation test

In [None]:
%%time

# set random seed for reproducibility
np.random.seed(0)

# for results
variances = []

# loop over all permutations
for _ in jabbar(range(n_perm), symbols='🦄'):
    variances.append(permuted_mean_variance(
        vals_by_hh, bds_by_hh, bd_ids_uq))

# to numpy arrays
variances = np.array(variances)

In [None]:
# save data: hand the permutation results and the observed statistic to the
# project helper, keyed by the run identifier `id_`
# NOTE(review): file location and format are decided inside save_data
# (presumably from src.functions) -- not visible from this cell
save_data(id_, variances=variances,
          real_variance=real_variance,
          perm_path=perm_path)

## Analysis

In [None]:
# load data: read back the two objects stored for this run identifier;
# the result is unpacked in the same order as the names in `obj_keys`
# NOTE(review): assumes load_data returns one object per requested key,
# in that order -- confirm against its definition
variances, real_variance = load_data(
    id_=id_, obj_keys=['variances', 'real_variance'],
    perm_path=perm_path)

In [None]:
# plot for variances: both helpers receive exactly the same keyword
# arguments, so they are collected once and passed to each call
plot_kwargs = dict(
    samples=variances,
    obj_key='variances',
    real_sample=real_variance,
    data_key=data_key,
    id_=id_,
    suptitle="Average variance over buildings",
    perm_path=perm_path,
)
plot_kde(**plot_kwargs)
plot_hist(**plot_kwargs)

In [None]:
print("Percentiles:")
# fraction of permuted variances that are <= the observed variance;
# np.mean on the boolean mask is a single vectorized reduction, whereas the
# builtin sum() would iterate over the array element by element in Python
print("Variance", data_key, np.mean(variances <= real_variance))