In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

from sklearn.cluster import KMeans 
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering

In [3]:
import sys

# Add the source directory to the import search path. The path is relative,
# so it resolves against the directory the kernel was started in.
# NOTE(review): presumably this is where base_exp_gen, clustering_algos and
# evaluations live — confirm against the repository layout.
sys.path.append('../../src')



In [4]:
from base_exp_gen import generate_experiment as GEN_EXP
from clustering_algos import agglomerative_clustering, kmeans_clustering, box_clustering
from sklearn.model_selection import train_test_split as datasplit
from sklearn.ensemble import GradientBoostingRegressor as GDBR
from evaluations import computeATT_per_cluster, predict_cf, calculate_ite, get_homogeneity



In [5]:
# Seed NumPy's global random state so every fresh run starts from the same state.
np.random.seed(0)

In [6]:
# Remove the column cap so displayed DataFrames show every column.
pd.options.display.max_columns = None


In [7]:
# One row per configuration: [partitions per axis, clusters, sample size N, repetitions].
# Every row starts with 50000 repetitions; `its` rescales all rows after the first.
x_partition = [[side, side**2, 2*side**4, 50000] for side in (2, 4, 5, 10, 12, 20, 25, 33)]


def its(x):
    """Rescale the repetition count (column 3) of every row after the first.

    Each row's count is derived from the already-updated count of the row
    before it: previous count divided by twice the square root of the growth
    in column 2 (the sample size), truncated to an int.

    The rows are modified in place and the same list object is returned.
    """
    for prev_row, cur_row in zip(x, x[1:]):
        growth = np.sqrt(cur_row[2] / prev_row[2])
        cur_row[3] = int(prev_row[3] / (2 * growth))
    return x

In [8]:
# Rescale the repetition counts (in place on x_partition) and tabulate the
# configurations, one row per experiment.
a = its(x_partition)
exp_columns = ['X-parts', 'Clusters', 'N', 'iters']
exps = pd.DataFrame(data=a, columns=exp_columns)

In [9]:
# Label of this experiment series: used as the index name of `exps` and,
# further down, as the stem of the exported CSV file.
name = 'box'
exps.index = exps.index.rename(name)

In [10]:
# Placeholder result columns; the experiment loop overwrites them row by row.
# NOTE(review): these are integer zeros, and the experiment loop depends on that.
# It reads `iters` through iterrows() and passes it to range(), which works only
# while every column of `exps` is integer-typed. The float results written later
# change the dtype of these columns.
exps['mean-hom'] = 0
exps['std-hom'] = 0

In [11]:
exps

Unnamed: 0_level_0,X-parts,Clusters,N,iters,mean-hom,std-hom
box,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2,4,32,50000,0,0
1,4,16,512,6250,0,0
2,5,25,1250,2000,0,0
3,10,100,20000,250,0,0
4,12,144,41472,86,0,0
5,20,400,320000,15,0,0
6,25,625,781250,4,0,0
7,33,1089,2371842,1,0,0


In [12]:
import time

In [13]:
%%time
col_select = ['x0', 'x1']
gen = []
for i,row in exps.iterrows():
    c, n, iters = row.Clusters, row.N, row.iters
    homc = []
    print('running:',c,n)
    ss = time.time()
    for j in range(iters):
        ###gneerate data
        N,D = n,2
        kw_generate =  {'N':n, 'D':D,'f_gen_name': 'uniform_gen','low':0, 'high': 1}
        kw_cluster =  {'f_class_name':'circle_class', 'centers': [0.5, 0.5], 'r_small': 0.2, 'r_big':0.4}
        kw_treatment = {'f_treat_name' :'uniform_treat','choices':[0,1], 'probabilities':[0.5,0.5]}
        std = 5
        stats = np.array([[1, std], [0, std], [1, std], [1, std]])
        kw_outcome = {'f_outcome_name': 'outcome1','treatment':'Treatment', 'cls':'C', 'stats':stats}

        data = GEN_EXP(kw_generate, kw_cluster, kw_treatment, kw_outcome).dat
        data = calculate_ite(data.copy(), treatment = 'Treatment',
                     counterfactual = 'Ycf', outcome = 'Y', ite_name = 'ITE')
        
        ### cluster data
        data2,_ = box_clustering(data, clusters=c,col_select = col_select, cluster_name = 'A')
        ### calc hom
        res = computeATT_per_cluster(data2.copy(), hom_name = 'HOM',weight_names = 'W', 
                                     cluster_name  = "A", att = False)
        hom = (res['HOM']*res['W']).sum(axis = 0)
        homc.append(hom)
        
    ###get stats
    ee = time.time() - ss
    print('time:',ee)
    homcnp = np.array(homc)
    gen.append(homcnp)
    mean = np.mean(homcnp)
    std = np.std(homcnp)
    exps.loc[i,'mean-hom'] = mean
    exps.loc[i,'std-hom'] = std
    
    

running: 4 32
time: 7772.299128770828
running: 16 512
time: 1816.9583611488342
running: 25 1250
time: 790.0982978343964
running: 100 20000
time: 308.4517066478729
running: 144 41472
time: 148.02998614311218
running: 400 320000
time: 78.26035165786743
running: 625 781250
time: 35.72227334976196
running: 1089 2371842
time: 20.80250859260559
CPU times: user 1h 20min 13s, sys: 17.7 s, total: 1h 20min 30s
Wall time: 3h 2min 50s


In [14]:
exps

Unnamed: 0_level_0,X-parts,Clusters,N,iters,mean-hom,std-hom
box,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2,4,32,50000,0.671417,0.061389
1,4,16,512,6250,0.669695,0.01504
2,5,25,1250,2000,0.776799,0.010427
3,10,100,20000,250,0.937516,0.001523
4,12,144,41472,86,0.918619,0.001281
5,20,400,320000,15,0.96272,0.000296
6,25,625,781250,4,0.95502,0.000335
7,33,1089,2371842,1,0.973252,0.0


In [15]:
from pathlib import Path



In [16]:
# Persist the summary table as <p>/<name>.csv, creating the folder on first use.
p = './homogstd5/'
out_dir = Path(p)
out_dir.mkdir(parents=True, exist_ok=True)
exps.to_csv(out_dir / (name + '.csv'))


### MISC: scratch checks (small-sample homogeneity run and analytic sanity calculations)

%%time
homlist = []
for i in range(100):
    N,D,c = 9**2,2,9
    kw_generate =  {'N':n, 'D':D,'f_gen_name': 'uniform_gen','low':0, 'high': 1}
    kw_cluster =  {'f_class_name':'circle_class', 'centers': [0.5, 0.5], 'r_small': 0.2, 'r_big':0.4}
    kw_treatment = {'f_treat_name' :'uniform_treat','choices':[0,1], 'probabilities':[0.5,0.5]}

    stats = np.array([[1, 0.75], [0, 0.75], [1, 0.75], [1, 0.75]])
    kw_outcome = {'f_outcome_name': 'outcome1','treatment':'Treatment', 'cls':'C', 'stats':stats}

    data3 = GEN_EXP(kw_generate, kw_cluster, kw_treatment, kw_outcome).dat
    data3 = calculate_ite(data3.copy(), treatment = 'Treatment',
                 counterfactual = 'Ycf', outcome = 'Y', ite_name = 'ITE')

    ### cluster data
    data3,_ = box_clustering(data3, clusters=c,col_select = col_select, cluster_name = 'A')
    ### calc hom
    res3 = computeATT_per_cluster(data3.copy(), hom_name = 'HOM',weight_names = 'W', 
                                 cluster_name  = "A", att = False)

    hom3 = (res3['HOM']*res3['W']).sum(axis = 0)
    homlist.append(hom3)

np.array(homlist).mean()

# Display the mean of the collected homogeneity values again
# (same expression as at the end of the scratch loop).
np.array(homlist).mean()

# Result table left over from the last scratch run.
res3

# Recompute the weighted homogeneity of that last run: sum of HOM * W over the rows of res3.
hom3 = (res3['HOM']*res3['W']).sum(axis = 0)


# Import scipy before its first use; the call used to come first and raised a
# NameError on a fresh kernel. scipy.special is imported explicitly so that
# `sc.special` does not depend on the submodule having been loaded elsewhere.
import scipy as sc
import scipy.special

# Quick check of the binomial coefficient helper: C(4, 2).
sc.special.comb(4,2)


# Analytic check: draw a group size from Binomial(N, 1/4), then a count from
# Binomial(size, mu), and accumulate the expected value of
# max(count, size - count), weighted by the probability of that group size.
# NOTE(review): presumably 1/4 is the chance of a point landing in a given
# cluster and mu the share of one class — confirm.
N = 16
mu = 0.63
ss = 0
for size in range(N + 1):
    size_prob = sc.special.comb(N, size) * ((1/4)**size) * (3/4)**(N - size)
    expected_majority = 0
    for ones in range(size + 1):
        zeros = size - ones
        expected_majority += max(ones, zeros) * sc.special.comb(size, ones) * (mu**ones) * (1 - mu)**zeros
    ss += size_prob * expected_majority


ss/4

# Same expectation for one fixed group size: E[max(count, k - count)] with
# count drawn from Binomial(k, mu), shown as a fraction of k.
k = 4
mu = 0.623
sss = 0
for ones in range(k + 1):
    zeros = k - ones
    sss += max(zeros, ones) * sc.special.comb(k, ones) * (mu**ones) * (1 - mu)**zeros

sss/k

# Show the N = 16 estimate once more (presumably to compare it with sss/k).
ss/4