In [1]:
import os
import ot
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import argparse
import time
from scipy.spatial import distance
from sklearn.cluster import KMeans



In [2]:
# Seed applied to the NumPy and PyTorch random number generators.
# NOTE: `seed` is reassigned to 2024 in the later configuration cell, so the
# value recorded in `results['seed']` is not the one the RNGs are seeded with here.
seed = 2000

# Seeds NumPy's global RNG (used by np.random.shuffle / normal / choice in this notebook).
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Request deterministic cuDNN algorithm selection.
torch.backends.cudnn.deterministic = True

In [3]:


class NumpyDataset:
    """Indexable pairing of samples with their per-sample color labels.

    ``data`` and ``colors`` are stored as given (no copy) and are indexed in
    lockstep: item ``i`` is ``(data[i], colors[i])``.
    """

    def __init__(self, data, colors):
        self.colors = colors
        self.data = data

    def __len__(self):
        """Return the number of samples, measured on ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(sample, color)`` pair stored at ``index``."""
        sample = self.data[index]
        color = self.colors[index]
        return sample, color


class NumpyDataLoader:
    """Minimal mini-batch iterator over an indexable ``(sample, label)`` dataset.

    Parameters
    ----------
    dataset : object supporting ``len()`` and integer indexing; each item must
        be a ``(sample, label)`` pair.
    batch_size : int, number of items per batch.
    shuffle : bool, shuffle the visiting order with NumPy's global RNG at the
        start of every pass.
    drop_last : bool, discard the trailing items that do not fill a batch.
    output_ids : bool, additionally return the dataset indices of each batch.

    Iterating yields ``(data, labels)`` arrays, or ``(ids, data, labels)``
    when ``output_ids`` is set. The object is its own iterator, so only one
    pass can be in flight at a time.
    """

    def __init__(self, dataset, batch_size, shuffle=True, drop_last=False, output_ids=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.output_ids = output_ids

    def __iter__(self):
        """Reset the pending index queue for a fresh pass and return ``self``."""
        order = np.arange(len(self.dataset))
        if self.shuffle:
            np.random.shuffle(order)
        if self.drop_last:
            # Trim the tail so the remaining length is a multiple of batch_size.
            leftover = len(order) % self.batch_size
            order = order[:len(order) - leftover]
        self.indices = order
        return self

    def __next__(self):
        """Pop the next ``batch_size`` indices and return the stacked batch."""
        if not len(self.indices):
            raise StopIteration

        # Take the head of the queue as the current batch; keep the rest pending.
        current, self.indices = self.indices[:self.batch_size], self.indices[self.batch_size:]

        if not self.output_ids:
            data, labels = zip(*(self.dataset[i] for i in current))
            return np.array(data), np.array(labels)

        rows = [(i, self.dataset[i][0], self.dataset[i][1]) for i in current]
        ids, data, labels = zip(*rows)
        return np.array(ids), np.array(data), np.array(labels)


def optimize_coupling(xs, centers, numItermax=5000, numThreads=3, pis=(0.99, 0.01)):
    """Solve the optimal-transport coupling between the two sensitive groups.

    Only a binary sensitive attribute is supported.

    Parameters
    ----------
    xs : sequence of two arrays, ``xs[0]`` of shape (n_0, d) and ``xs[1]`` of
        shape (n_1, d), holding the samples of each group.
    centers : array of shape (K, d), current cluster centers.
    numItermax : int, forwarded to ``ot.emd``.
    numThreads : int, forwarded to ``ot.emd``.
    pis : pair ``(pi_0, pi_1)`` of mixing weights used both for the weighted
        midpoints and for scaling the pairwise cost. The default (0.99, 0.01)
        is the pair that used to be hard-coded in the body; pass the empirical
        group proportions to weight the groups by size instead.

    Returns
    -------
    Taxs_0_1 : array of shape (n_0 * n_1, d); row ``i * n_1 + j`` is
        ``pi_0 * xs[0][i] + pi_1 * xs[1][j]``.
    gammas : flattened (n_0 * n_1,) coupling in the same row order.
    n_0, n_1 : the two group sizes.
    """
    # only for binary sensitive attribute
    n_colors = 2
    assert n_colors == len(xs)
    n_0, n_1 = len(xs[0]), len(xs[1])
    pi_0, pi_1 = pis

    # Uniform marginals over each group.
    w_0, w_1 = np.ones(n_0) / n_0, np.ones(n_1) / n_1

    # Pairwise cost between the groups (ot.dist with its default metric).
    D_01 = 2 * pi_0 * pi_1 * ot.dist(xs[0], xs[1])

    # Weighted midpoint of every (i, j) pair, laid out i-major.
    Taxs_0_1 = pi_0 * np.repeat(xs[0], n_1, 0) + pi_1 * np.tile(xs[1], (n_0, 1))

    # Squared distance from each midpoint to its nearest center.
    mid_distances = distance.cdist(Taxs_0_1, centers, metric='minkowski', p=2)
    C_01 = (mid_distances.min(axis=1) ** 2).reshape(n_0, n_1)

    Coupling_0_1 = ot.emd(w_0, w_1, D_01 + C_01, numItermax=numItermax, numThreads=numThreads)

    return Taxs_0_1, Coupling_0_1.flatten(), n_0, n_1



def optimize_center(Taxs, gammas, centers, centers_module, centers_optimizer, K, seed=2024, gradient_descent=False, use_cuda=False, n_steps=20):
    """Update the cluster centers for fixed midpoints and coupling weights.

    Parameters
    ----------
    Taxs : array of shape (m, d), points to cluster.
    gammas : array of shape (m,), non-negative weight of each point.
    centers : array of shape (K, d), current centers; used as the KMeans
        initialisation in the non-gradient branch.
    centers_module : module whose ``weight`` (K, d) holds the trainable
        centers; only used when ``gradient_descent`` is true.
    centers_optimizer : optimizer over ``centers_module``'s parameters; only
        used when ``gradient_descent`` is true.
    K : int, number of clusters (KMeans branch).
    seed : int, ``random_state`` for KMeans.
    gradient_descent : bool, pick the gradient branch instead of KMeans.
    use_cuda : bool, move the inputs to the GPU in the gradient branch.
    n_steps : int, number of optimizer steps in the gradient branch.

    Returns
    -------
    Array of shape (K, d) with the updated centers. In the gradient branch it
    is an independent copy, so later optimizer steps do not change it.
    """
    if gradient_descent:
        Taxs_t = torch.from_numpy(Taxs).float()
        gammas_t = torch.from_numpy(gammas).float()
        if use_cuda:
            Taxs_t, gammas_t = Taxs_t.cuda(), gammas_t.cuda()
        for _ in range(n_steps):
            distances = torch.cdist(Taxs_t, centers_module.weight, p=2)
            assignments = torch.argmin(distances, dim=1)
            # Weighted sum of squared distances to each point's nearest center.
            energy = (gammas_t * (distances[torch.arange(distances.shape[0]), assignments]**2)).sum()
            centers_optimizer.zero_grad()
            energy.backward()
            centers_optimizer.step()
        # On CPU, .numpy() shares memory with the parameter; copy so the
        # returned centers are not mutated by subsequent optimizer steps.
        new_centers = centers_module.weight.detach().cpu().numpy().copy()
    else:
        # n_init=1: with explicit initial centers only a single run is meaningful.
        kmeans = KMeans(n_clusters=K, init=centers, n_init=1, random_state=seed)
        kmeans.fit(X=Taxs, sample_weight=gammas)
        new_centers = kmeans.cluster_centers_
    return new_centers



def random_hard_assigning(arr):
    """Pick, for every row, the column holding the row maximum.

    When several columns tie for the maximum, one of them is drawn uniformly
    with NumPy's global RNG (one ``np.random.choice`` call per row).

    arr: (n, K) array of scores.
    Returns an array with the n chosen column indices.
    """
    row_peaks = np.max(arr, axis=1, keepdims=True)
    is_peak = np.equal(arr, row_peaks)
    picks = []
    for flags in is_peak:
        candidates = np.flatnonzero(flags)
        picks.append(np.random.choice(candidates))
    return np.array(picks)



def assigning(xs, Taxs, gammas, centers, K):
    """Turn per-batch couplings into hard cluster labels for every sample.

    xs: list over batches; each entry is [xs_0, xs_1] with shapes (n0, d) and (n1, d)
    Taxs: list over batches; each entry has shape (n0 * n1, d), rows laid out i-major
    gammas: list over batches; each entry has shape (n0 * n1, ), same row order as Taxs
    centers: (K, d)
    K: int

    For each batch, every (i, j) pair is labelled with the center nearest to
    its row in Taxs. A group-0 sample i scores cluster k with n0 times the
    coupling mass of its pairs labelled k (group-1 sample j likewise, with n1),
    and is assigned to its best-scoring cluster; ties are broken at random by
    ``random_hard_assigning``.

    Returns (color_xs, color_assignments): two lists of length 2 holding, per
    group, the samples concatenated over batches and their cluster labels.
    """
    gathered_xs = [[], []]
    gathered_labels = [[], []]

    for sub_xs, sub_Taxs, sub_gammas in zip(xs, Taxs, gammas):
        group_0, group_1 = sub_xs[0], sub_xs[1]
        n_0, n_1 = group_0.shape[0], group_1.shape[0]

        # Nearest center of every pair, then both pair-level arrays as (n0, n1) grids.
        pair_dists = distance.cdist(sub_Taxs, centers, metric='minkowski', p=2)
        pair_labels = np.argmin(pair_dists, axis=1).reshape(n_0, n_1)
        pair_mass = sub_gammas.reshape(n_0, n_1)

        # for s = 0: walk the rows of the grid
        prob_assignments_0 = np.zeros(shape=(n_0, K))
        for i in range(n_0):
            mass_row, label_row = pair_mass[i], pair_labels[i]
            for k in range(K):
                prob_assignments_0[i, k] = n_0 * np.sum(mass_row[label_row == k])

        # for s = 1: walk the columns of the grid
        prob_assignments_1 = np.zeros(shape=(n_1, K))
        mass_T, labels_T = pair_mass.T, pair_labels.T
        for j in range(n_1):
            mass_col, label_col = mass_T[j], labels_T[j]
            for k in range(K):
                prob_assignments_1[j, k] = n_1 * np.sum(mass_col[label_col == k])

        gathered_xs[0].append(group_0)
        gathered_xs[1].append(group_1)
        gathered_labels[0].append(random_hard_assigning(prob_assignments_0))
        gathered_labels[1].append(random_hard_assigning(prob_assignments_1))

    color_xs = [np.concatenate(gathered_xs[0]), np.concatenate(gathered_xs[1])]
    color_assignments = [np.concatenate(gathered_labels[0]), np.concatenate(gathered_labels[1])]

    return color_xs, color_assignments



def evaluation(color_xs, color_assignments, centers, n_color, K):
    """Score a clustering by its k-means cost and its group balance.

    color_xs: list with one (n_s, d) array per color
    color_assignments: list with one (n_s, ) integer label array per color
    centers: (K, d)
    n_color: number of colors to include in the balance
    K: number of clusters

    Returns (objective, balance):
      objective - sum of squared Euclidean distances from every sample to the
                  center it is assigned to;
      balance   - minimum over colors and clusters of
                  (share of the color's samples in the cluster) /
                  (share of all samples in the cluster).
    """
    total_cost = 0.0
    counts_per_color = []
    for points, labels in zip(color_xs, color_assignments):
        dists = distance.cdist(points, centers, metric='minkowski', p=2)
        own_center_dist = dists[np.arange(dists.shape[0]), labels]
        total_cost += (own_center_dist ** 2).sum()
        counts_per_color.append([(labels == k).sum() for k in range(K)])

    # Share of all samples falling in each cluster.
    pooled_counts = np.array(counts_per_color).sum(axis=0)
    cluster_share = pooled_counts / pooled_counts.sum()

    ratios = []
    for color in range(n_color):
        color_counts = np.array(counts_per_color[color])
        ratios.append((color_counts / np.sum(counts_per_color[color])) / cluster_share)
    balance = np.array(ratios).min(axis=1).min()

    return total_cost, balance



In [4]:
# --- Experiment configuration -------------------------------------------------
# NOTE(review): this rebinds `seed`, but the NumPy / PyTorch RNGs were already
# seeded with the earlier value (2000) and are not re-seeded here; this value is
# only passed to KMeans as random_state and recorded in the results table.
seed = 2024
# With full_batch the loader batch size is replaced by the dataset size, so
# batch_size only matters when full_batch is False.
full_batch = True
batch_size = 4096

# True: update centers with gradient steps; False: weighted KMeans.
gradient_descent = True
# Use the GPU when one is visible; fall back to CPU so the notebook still runs
# on machines without CUDA instead of failing at `.cuda()`.
use_cuda = torch.cuda.is_available()

iters = 10          # outer alternating iterations
numItermax = 1000   # iteration cap forwarded to the OT solver

# NOTE(review): recorded in the results table but not applied to the data
# anywhere in the visible cells.
l2_normalize = True

In [5]:
# Input table for the experiment; the relative path is resolved against the
# notebook's working directory. Later cells slice its columns by position.
data_path ="data_combined_new.csv"
data = pd.read_csv(data_path)

In [6]:
# Feature table: every column from position 7 onward, missing values replaced by 0.
# NOTE(review): the offset 7 depends on the CSV's column order — confirm against the file.
new_data = data.iloc[:,7:].fillna(0)
# (disabled) row-wise normalisation so that each row sums to 1:
# new_data = new_data.values / new_data.values.sum(axis=1).reshape(-1,1)

# (disabled) put every sample in a single group:
# new_data_gender = [0]*len(data)
# Sensitive attribute; later cells select groups with == 0 and == 1.
# Presumably encodes gender, judging by the variable name — TODO confirm.
new_data_gender = data.sensitive

In [7]:
    # Plain NumPy views of the features and the sensitive attribute.
    np_data = np.array(new_data)
    np_colors = np.array(new_data_gender)
    # Number of clusters.
    K = 5
    # Feature dimension.
    d = np_data.shape[1]
    # Number of sensitive groups; optimize_coupling asserts exactly 2.
    n_color = 2

In [8]:
    # One feature array per sensitive group, in color order.
    xs = [np_data[np_colors == color] for color in range(n_color)]
    for color in range(n_color):
        print(f'[Info] Data shape for {color}th color: {np_data[np_colors == color].shape}')
    # Matching label vectors: a constant array holding the group's index.
    colors = [i*np.ones(xs_i.shape[0]) for i, xs_i in enumerate(xs)]

[Info] Data shape for 0th color: (4331, 36)
[Info] Data shape for 1th color: (1709, 36)


In [9]:
    # Stack the groups (group 0 rows first) into a single dataset.
    stacked_xs = np.concatenate(xs)
    dset = NumpyDataset(stacked_xs, np.concatenate(colors))
    # In full-batch mode a single batch holds every sample.
    batch_size = len(stacked_xs) if full_batch else batch_size
    dloader = NumpyDataLoader(dset, batch_size=batch_size,
                              shuffle=True, drop_last=False)

In [10]:
    # unfair clustering!
    kmeans = KMeans(n_clusters=K, random_state=2024)
    kmeans.fit(X=np.concatenate(xs))
    unfair_assignments = kmeans.predict(X=np.concatenate(xs))
    unfair_assignments = [unfair_assignments[:xs[0].shape[0]], unfair_assignments[xs[0].shape[0]:]]
    unfair_centers = kmeans.cluster_centers_
    
    objective, balance = evaluation(xs, unfair_assignments, unfair_centers, n_color, K)
    print(f'[Unfair] Energy / Balance: {objective:.3f} / {balance:.3f}')

[Unfair] Energy / Balance: 31468348.052 / 0.342


In [11]:
    # initial centers
    centers = np.random.normal(0, 1, (K, d))
    centers_module = None
    centers_optimizer = None
    if gradient_descent:
        lr = 5e-3 
        centers_module = nn.Linear(d, K, bias=False)
        centers_optimizer = torch.optim.Adam(centers_module.parameters(), lr=lr)
        if use_cuda:
            centers_module = centers_module.cuda()

In [12]:
    # optimizing
    elapsed_times = []
    best_it, best_original_energy, best_energy, best_balance = 0, 1e+10, 1e+10, 0.0
    subbest_it, subbest_original_energy, subbest_energy, subbest_balance = 0, 1e+10, 1e+10, 0.0


In [13]:
    # Alternating optimisation: (1) solve the coupling between the two groups for
    # the current centers, (2) re-fit the centers on the coupled midpoints, then
    # derive hard assignments and score them. `it` is 1-based.
    for it in range(1, iters + 1):
        start_time = time.time()
        if gradient_descent:
            # The trainable module holds the current centers.
            centers = centers_module.weight.data.cpu().detach().numpy()

        all_xs, all_Taxs, all_gammas, all_colors = [], [], [], []
        all_n_0, all_n_1 = [], []
        for batch_xs, batch_colors in dloader:
            # Split the batch by sensitive attribute.
            sub_batch_xs = [batch_xs[batch_colors == color] for color in range(n_color)]
            sub_batch_colors = [np.ones(group.shape[0]) * idx for idx, group in enumerate(sub_batch_xs)]
            sub_Tax_i_j, sub_gamma_i_j, sub_n_0, sub_n_1 = optimize_coupling(
                sub_batch_xs, centers, numItermax=numItermax)

            all_xs.append(sub_batch_xs)
            all_Taxs.append(sub_Tax_i_j)
            all_gammas.append(sub_gamma_i_j)
            all_colors.append(np.concatenate(sub_batch_colors))
            all_n_0.append(sub_n_0)
            all_n_1.append(sub_n_1)

        all_colors = np.concatenate(all_colors)

        # finding center
        # Each batch's flattened coupling is divided by its own length before pooling.
        flat_all_Taxs = np.concatenate(all_Taxs)
        flat_all_gammas = np.concatenate([sub_gamma / sub_gamma.shape[0] for sub_gamma in all_gammas])
        centers = optimize_center(flat_all_Taxs, flat_all_gammas,
                                  centers, centers_module, centers_optimizer, K, seed=seed,
                                  gradient_descent=gradient_descent, use_cuda=use_cuda)

        # Timing covers the coupling and center update only, not the evaluation below.
        elapsed_times.append(time.time() - start_time)

        color_xs, color_assignments = assigning(all_xs, all_Taxs, all_gammas, centers, K)
        objective, balance = evaluation(color_xs, color_assignments, centers, n_color, K)

        print(f'[{it}/{iters}] Energy / Balance: {objective:.3f} / {balance:.3f}')

        # Remember the most balanced iteration ...
        if balance > best_balance:
            best_it, best_balance = it, balance
            best_energy = best_original_energy = objective
        # ... and, separately, the lowest-energy iteration.
        if objective < subbest_energy:
            subbest_it, subbest_balance = it, balance
            subbest_energy = subbest_original_energy = objective

    elapsed_time_per_iter = np.mean(elapsed_times)
    elapsed_time_total = np.sum(elapsed_times)

    # results
    # One-row summary table of the run configuration and the tracked optima.
    # NOTE(review): this entry used to be `[type]`, i.e. the Python builtin
    # `type` itself, because no variable of that name is defined in the notebook.
    # The label 'FCA' is taken from the commented-out result path below —
    # confirm it is the intended value.
    run_type = 'FCA'
    results = {'type': [run_type],
                'seed':[seed],
                'gradient_descent': [gradient_descent],
                'iters': [iters],
                'full_batch': [f'real_{full_batch}'],
                'batch_size': [batch_size],
                'l2_normalize': [l2_normalize],
                'elapsed_time_per_iter': [elapsed_time_per_iter],
                'elapsed_time_total': [elapsed_time_total],
                'best_it': [best_it],
                'best_original_energy': [best_original_energy],
                'best_energy': [best_energy],
                'best_balance': [best_balance],
                'subbest_it': [subbest_it],
                'subbest_original_energy': [subbest_original_energy],
                'subbest_energy': [subbest_energy],
                'subbest_balance': [subbest_balance]
                }

    columns = list(results.keys())
    df_results = pd.DataFrame(results, columns=columns)
    #result_name = f'results/FCA/{args.data_name}_FCA.csv'

    # "BEST balance" = iteration with the highest balance;
    # "BEST energy"  = iteration with the lowest energy.
    print(f'[BEST balance/{iters}] Energy / Balance: {best_energy:.3f} / {best_balance:.3f}')
    print(f'[BEST energy/{iters}] Energy / Balance: {subbest_energy:.3f} / {subbest_balance:.3f}')



  result_code_string = check_result(result_code)


[1/10] Energy / Balance: 194767340.683 / 0.812
[2/10] Energy / Balance: 194426670.001 / 0.765
[3/10] Energy / Balance: 194125800.546 / 0.815
[4/10] Energy / Balance: 193819699.361 / 0.797
[5/10] Energy / Balance: 193538081.526 / 0.730
[6/10] Energy / Balance: 193248024.768 / 0.765
[7/10] Energy / Balance: 192998963.297 / 0.751
[8/10] Energy / Balance: 192702360.102 / 0.829
[9/10] Energy / Balance: 192498435.736 / 0.793
[10/10] Energy / Balance: 192158444.982 / 0.787
[BEST balance/10] Energy / Balance: 192702360.102 / 0.829
[BEST energy/10] Energy / Balance: 192158444.982 / 0.787


In [None]:
# Grouping
# For every cluster, find the two columns (among columns 18 onward) that are
# most often a member's row-wise maximum, then count the members whose value in
# either of those columns exceeds 3.0. The printed number is that count over
# all clusters divided by the total number of samples.
# NOTE(review): columns 18+ are presumably per-genre ratings and group 0 / 1
# presumably men / women, judging by the variable names — confirm against the data.
score = 0.0
num_data = 0.0
for group_num in range(K):

    group_man_bool = color_assignments[0] == group_num
    group_woman_bool = color_assignments[1] == group_num
    group_man = color_xs[0][group_man_bool,:]
    group_woman = color_xs[1][group_woman_bool,:]

    group_all = np.concatenate([group_man,group_woman])
    genre_ratings = group_all[:,18:]

    # Columns ranked by how often they are a member's arg-max.
    recommend_genre = pd.DataFrame( genre_ratings.argmax(axis=1) ).value_counts()

    # Only the top two are used; looking up a third would raise IndexError for
    # clusters with fewer than three distinct arg-max columns.
    recommend_genre_0 = recommend_genre.index[0][0]
    recommend_genre_1 = recommend_genre.index[1][0]

    score += np.sum( (genre_ratings[:,recommend_genre_0] > 3.0) | (genre_ratings[:,recommend_genre_1] > 3.0) )
    num_data += group_all.shape[0]
print(score/num_data)

0.811092715231788


In [15]:
## By gender, per cluster
# For every cluster, the two most frequent arg-max columns (among columns 18
# onward) are determined on the whole cluster; the hit rate (value strictly
# above the threshold in either column) is then printed separately for the two
# groups.
# NOTE(review): columns 18+ are presumably per-genre ratings and group 0 / 1
# presumably men / women, judging by the variable names — confirm against the data.
rating_positive = 3.0

for group_num in range(K):
    group_man_bool = color_assignments[0] == group_num
    group_woman_bool = color_assignments[1] == group_num
    group_man = color_xs[0][group_man_bool,:]
    group_woman = color_xs[1][group_woman_bool,:]

    group_all = np.concatenate([group_man,group_woman])

    recommend_genre = pd.DataFrame( group_all[:,18:].argmax(axis=1) ).value_counts()

    # Only the top two are used; looking up a third would raise IndexError for
    # clusters with fewer than three distinct arg-max columns.
    recommend_genre_0 = recommend_genre.index[0][0]
    recommend_genre_1 = recommend_genre.index[1][0]

    man_ratings = group_man[:,18:]
    woman_ratings = group_woman[:,18:]

    man_score = 0.0
    man_score += np.sum( (man_ratings[:,recommend_genre_0] > rating_positive) | (man_ratings[:,recommend_genre_1] > rating_positive) )

    woman_score = 0.0
    woman_score += np.sum( (woman_ratings[:,recommend_genre_0] > rating_positive) | (woman_ratings[:,recommend_genre_1] > rating_positive) )

    print(f"Man :{man_score/group_man.shape[0]} , Woman : {woman_score/group_woman.shape[0]}")

Man :0.8450704225352113 , Woman : 0.860655737704918
Man :0.854320987654321 , Woman : 0.8108108108108109
Man :0.6163069544364509 , Woman : 0.581081081081081
Man :0.7412587412587412 , Woman : 0.7954545454545454
Man :0.8639788997739262 , Woman : 0.7842968075927523


In [16]:
## By gender, aggregated over all clusters
# Same per-cluster top-two columns as the per-cluster cell, but hits and sample
# counts are accumulated over the clusters and one rate per group is printed,
# together with the absolute gap between the two rates.
# NOTE(review): this cell counts values >= the threshold, whereas the two
# preceding scoring cells use a strict > — confirm which comparison is intended.
# NOTE(review): columns 18+ are presumably per-genre ratings and group 0 / 1
# presumably men / women, judging by the variable names — confirm against the data.
man_score = 0.0
woman_score = 0.0
man_num = 0.0
woman_num =0.0

rating_positive = 3.00

for group_num in range(K):
    group_man_bool = color_assignments[0] == group_num
    group_woman_bool = color_assignments[1] == group_num
    group_man = color_xs[0][group_man_bool,:]
    group_woman = color_xs[1][group_woman_bool,:]

    group_all = np.concatenate([group_man,group_woman])

    recommend_genre = pd.DataFrame( group_all[:,18:].argmax(axis=1) ).value_counts()

    # Only the top two are used; looking up a third would raise IndexError for
    # clusters with fewer than three distinct arg-max columns.
    recommend_genre_0 = recommend_genre.index[0][0]
    recommend_genre_1 = recommend_genre.index[1][0]

    man_ratings = group_man[:,18:]
    woman_ratings = group_woman[:,18:]

    man_score += np.sum( (man_ratings[:,recommend_genre_0] >= rating_positive) | (man_ratings[:,recommend_genre_1] >= rating_positive) )

    woman_score += np.sum( (woman_ratings[:,recommend_genre_0] >= rating_positive) | (woman_ratings[:,recommend_genre_1] >= rating_positive) )

    man_num += group_man.shape[0]
    woman_num += group_woman.shape[0]

print(f"Man :{man_score/man_num} , Woman : {woman_score/woman_num} , Diff : {np.abs( man_score/man_num- woman_score/woman_num ) } ")

Man :0.9194181482336643 , Woman : 0.9040374488004681 , Diff : 0.015380699433196199 
