In [1]:
''' This script creates results/distances.pkl - calculating distance between source and target data attributes '''

' This script creates results/distances.pkl - calculating distance between source and target data attributes '

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# global imports
import time
import pickle
from scipy.stats import wasserstein_distance
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# local imports
import utils
from decision_tree_classifier import epmf

In [3]:
%%time

### parameters
subset = utils.get_subset('all')
# load data
X_train_s, X_test_s, y_train_s, y_test_s, X_train_t, X_test_t, y_train_t, y_test_t = utils.load_ACSPublicCoverage(subset)

AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA PR RI SC SD TN TX UT VA VT WA WI WV WY CPU times: total: 1min 48s
Wall time: 1min 48s


In [4]:
# calculate distances on an attribute between source and domain
def distances(source, target, att):
    is_cat = att in utils.cat_atts # categorical?
    xs = X_train_s[source][att] # source att
    xt = X_train_t[target][att] # target att
    ys = y_train_s[source] # source y
    yt = y_train_t[target] # target y
    values = sorted(list(set(xs.unique()) | set(xt.unique()))) # distinct values in source or target
    classes = sorted(list(set(ys.unique()) | set(yt.unique()))) # distinct classes in source or target
    ns, nt = len(xs), len(xt)
    # pmf at source and target
    pmf_s, pmf_t = [], []
    # cumulative at source and target
    cdf_s = cdf_t = 0
    # sum_v P_s(Y|X=v)*P_t(X=v)
    y_cond_est = 0
    # sum_v d(P_s(I(X=v)), P_t(I(X=v))*P_t(X=v)
    d_splits = 0 
    for value in values:
        p_s = sum(xs==value)/ns # P_s(value)
        pmf_s.append(p_s) 
        cdf_s += p_s # cumulative at source
        p_t = sum(xt==value)/nt # P_t(value)
        pmf_t.append(p_t)
        cdf_t += p_t # cumulative at target
        y_cond_est += epmf(ys[xs==value], classes)*p_t
        if is_cat:
            d_splits += wasserstein_distance([p_s, 1-p_s], [p_t, 1-p_t])*p_t
        else:
            d_splits += wasserstein_distance([cdf_s, 1-cdf_s], [cdf_t, 1-cdf_t])*p_t
    # d(P_s(X), P_t(X))
    d_att = wasserstein_distance(pmf_s, pmf_t) 
    # d(P_s(X), P_t(Y))
    d_y_cond = wasserstein_distance(y_cond_est, epmf(yt, classes))
    return {'d_att':d_att, 'd_y_cond':d_y_cond, 'd_splits':d_splits}

In [5]:
%%time

# compute distances between source and target
dists = dict()
for  sr in utils.states:
    for tg in utils.states:
        dists[(sr, tg)] = dict()
        for att in X_train_s[sr].columns.to_list():
            dists[(sr, tg)][att] = distances(sr, tg, att)
            #print(sr, tg, att, dists[(sr, tg)][att])
pickle.dump(dists, open("results/distances.pkl", "wb" ))

CPU times: total: 2h 40min 22s
Wall time: 2h 40min 23s
