In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import time

In [2]:
data = pd.read_csv('encoded.csv')

In [3]:
data = data.loc[0:49]
data

Unnamed: 0,Age,Gender,Days Abdominal Pain,Temperature,Tenderness,Leukocytes [10^9/l],CRP [mg/l],Number of Sonographies preop,Preoperative Diagnosis
0,15,1,1,36.9,1,11.1,6.0,1,0
1,14,0,2,37.0,1,21.28,30.8,1,0
2,16,0,0,36.9,0,13.07,1.9,1,0
3,14,0,1,35.7,0,7.95,0.0,2,0
4,16,1,7,37.9,1,14.82,14.4,3,0
5,10,1,1,37.1,1,17.62,7.0,1,0
6,10,0,1,36.3,0,17.74,18.9,1,0
7,15,0,1,36.6,0,15.24,13.8,1,0
8,10,0,0,37.2,1,13.87,1.6,1,0
9,9,0,0,36.4,0,31.6,9.95,1,0


In [4]:
data = data.to_numpy()
data

array([[ 15.  ,   1.  ,   1.  ,  36.9 ,   1.  ,  11.1 ,   6.  ,   1.  ,
          0.  ],
       [ 14.  ,   0.  ,   2.  ,  37.  ,   1.  ,  21.28,  30.8 ,   1.  ,
          0.  ],
       [ 16.  ,   0.  ,   0.  ,  36.9 ,   0.  ,  13.07,   1.9 ,   1.  ,
          0.  ],
       [ 14.  ,   0.  ,   1.  ,  35.7 ,   0.  ,   7.95,   0.  ,   2.  ,
          0.  ],
       [ 16.  ,   1.  ,   7.  ,  37.9 ,   1.  ,  14.82,  14.4 ,   3.  ,
          0.  ],
       [ 10.  ,   1.  ,   1.  ,  37.1 ,   1.  ,  17.62,   7.  ,   1.  ,
          0.  ],
       [ 10.  ,   0.  ,   1.  ,  36.3 ,   0.  ,  17.74,  18.9 ,   1.  ,
          0.  ],
       [ 15.  ,   0.  ,   1.  ,  36.6 ,   0.  ,  15.24,  13.8 ,   1.  ,
          0.  ],
       [ 10.  ,   0.  ,   0.  ,  37.2 ,   1.  ,  13.87,   1.6 ,   1.  ,
          0.  ],
       [  9.  ,   0.  ,   0.  ,  36.4 ,   0.  ,  31.6 ,   9.95,   1.  ,
          0.  ],
       [  6.  ,   0.  ,   1.  ,  37.  ,   0.  ,  16.58,  61.1 ,   1.  ,
          0.  ],
       [ 11.  ,   0. 

In [5]:
# Kernel function that measures the
# similarity of two datapoints
# Returns the value of RBF kernel

def k(xi, xj, gamma):    
    return np.exp(-gamma * np.linalg.norm(xi - xj))

In [28]:
# Computes mmd^2 using list of datapoints,
# prototypes and hyperparameter gamma

from itertools import combinations

def mmd(X, prototypes, gamma):
    # Sum of protoype proximities
    # Generate combinations of row indices
    cmb = list(combinations(np.arange(prototypes.shape[0]), 2))
    
    # Extract rows based on combinations
    # Calculate sum of proximities of combinations
    # Needs to be multiplied with 2 (lower part of proximity matrix)
    # Sum of identity matrix needs to be added (1 on diag)
    utm = np.sum(list(map(lambda z: k(z[0], z[1], gamma), prototypes[cmb])))
    spp = utm * 2 + prototypes.shape[0]
    
    
    # Sum of prototype-datapoint proximities
    spdp = np.sum([[k(z, x, gamma) for z in prototypes] for x in X])
    
    
    # Sum of datapoint proximities
    # Generate combinations of row indices
    cmb = list(combinations(np.arange(X.shape[0]), 2))
    
    # Extract rows based on combinations
    # Calculate sum of proximities of combinations
    # Needs to be multiplied with 2 (lower part of proximity matrix)
    # Sum of identity matrix needs to be added (1 on diag)
    utm = np.sum(list(map(lambda x: k(x[0], x[1], gamma), X[cmb])))
    sdp = utm * 2 + X.shape[0]
    
    
    # Averaging proximities to compute mmd^2
    mmd2 = 0.0
    if prototypes.shape[0] != 0:
        mmd2 += 1/(prototypes.shape[0]**2) * spp
    if prototypes.shape[0] != 0 and X.shape[0] != 0:
        mmd2 -= 2/(prototypes.shape[0]*X.shape[0]) * spdp
    if X.shape[0] != 0:
        mmd2 += 1/(X.shape[0]**2) * sdp
    
    return mmd2

In [24]:
def evaluate_as_prototype(args):
    X, p, idx, gamma = args
    Xi = X.copy()
    pi = p.copy()
    
    # Removing datapoint from list of datapoints
    Xi = np.delete(Xi, idx, axis=0)

    # Making the removed datapoint a prototype
    pi = np.append(pi, p)

    # Calculating mmd of new prototype list
    # Returning pair (mmd, idx) of the
    # evaluated datapoint as prototype
    return (mmd(Xi, pi, gamma), idx)

In [25]:
import concurrent.futures

def evaluate_parallel(datapoints):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit the tasks to the executor
        futures = [executor.submit(evaluate_as_prototype, datapoint) for datapoint in datapoints]
        
        # Retrieve the results as they become available
        results = [future.result() for future in concurrent.futures.as_completed(futures)]
    
    return results

In [26]:
# Adds a prototype to list prototypes
# using the datapoints from list X

def add_prototype(X, prototypes, gamma):
    min_mmd = float('+inf')
    
    # Best prototype index (lowers mmd most)
    bpi = 0
    
    datapoints = [(X, prototypes, idx, gamma) for idx, _ in enumerate(X)]
    
    mmds = evaluate_parallel(datapoints)
    
    # Consider each datapoint as prototype and evaluate like that
    for mmd, idx in mmds:
        # Checking if new mmd is lower
        if mmd < min_mmd:
            min_mmd = mmd
            bpi = idx
    
    # Making bpith datapoint a prototype in source lists
    prototypes = np.vstack((prototypes, X[bpi]))
    X = np.delete(X, bpi, 0)
    
    return X, prototypes

In [12]:
# Finds m prototypes from dataset X
# Hyperparameters m and gamma

def find_prototypes(X, m=10, gamma=0.1):
    # List of chosen prototypes
    cp = np.empty((0, data.shape[1]))
    
    # Find m prototypes
    for _ in range(m):
        X, cp = add_prototype(X, cp, gamma)
    
    return cp

In [32]:
prototypes = find_prototypes(data)

In [33]:
print(prototypes)

[[15.    1.    1.   36.9   1.   11.1   6.    1.    0.  ]
 [13.    0.    1.   37.3   1.   11.8   8.5   2.    0.  ]
 [16.    0.    0.   36.9   0.   13.07  1.9   1.    0.  ]
 [14.    1.    0.   36.8   1.   17.02  2.1   1.    0.  ]
 [17.    0.    1.   36.6   1.   12.65  5.    1.    0.  ]
 [10.    1.    1.   37.1   1.   17.62  7.    1.    0.  ]
 [16.    0.    0.   37.2   1.   12.44  1.9   1.    0.  ]
 [15.    0.    1.   36.6   0.   15.24 13.8   1.    0.  ]
 [12.    0.    1.   37.2   0.   17.52 24.4   1.    0.  ]
 [13.    0.    0.   36.2   1.    8.65  7.1   1.    0.  ]]


In [None]:
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))