In [87]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np

In [88]:
# Load the dataset from 'encoded.csv' into a DataFrame; all later cells work on `df`.
df = pd.read_csv('encoded.csv')

In [89]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Age,Gender,Days Abdominal Pain,Temperature,Tenderness,Leukocytes [10^9/l],CRP [mg/l],Number of Sonographies preop,Preoperative Diagnosis
0,15,1,1,36.9,1,11.1,6.0,1,0
1,14,0,2,37.0,1,21.28,30.8,1,0
2,16,0,0,36.9,0,13.07,1.9,1,0
3,14,0,1,35.7,0,7.95,0.0,2,0
4,16,1,7,37.9,1,14.82,14.4,3,0


In [90]:
# List the dtype of every column; the distance-based kernel below needs numeric features.
print(df.dtypes)

Age                               int64
Gender                            int64
Days Abdominal Pain               int64
Temperature                     float64
Tenderness                        int64
Leukocytes [10^9/l]             float64
CRP [mg/l]                      float64
Number of Sonographies preop      int64
Preoperative Diagnosis            int64
dtype: object


In [99]:
def k(xi, xj, gamma):
    """Evaluate the RBF (Gaussian) kernel between two datapoints.

    Computes ``exp(-gamma * ||xi - xj||**2)``, a similarity score that is
    1.0 for identical points and shrinks as the points move apart.

    Parameters
    ----------
    xi, xj : array-like of numbers
        The two datapoints (e.g. DataFrame rows as pandas Series, lists or
        NumPy arrays). They are compared position by position and must have
        the same length.
    gamma : float
        Kernel width; larger values make the similarity fall off faster.

    Returns
    -------
    float
        The kernel value for the pair.
    """
    # np.asarray accepts Series, lists and arrays alike
    diff = np.asarray(xi, dtype=float) - np.asarray(xj, dtype=float)

    # The RBF kernel uses the *squared* Euclidean distance:
    # np.dot(diff, diff) == ||xi - xj||**2
    return np.exp(-gamma * np.dot(diff, diff))

In [92]:
def _rbf_kernel_sum(A, B, gamma):
    """Sum exp(-gamma * ||a - b||**2) over every row a of A and every row b of B."""
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b yields all pairwise squared
    # distances from a single matrix product instead of a Python double loop.
    sq_dists = (
        (A * A).sum(axis=1)[:, None]
        + (B * B).sum(axis=1)[None, :]
        - 2.0 * (A @ B.T)
    )
    # Rounding can push (near-)zero distances slightly below zero; clamp them.
    np.maximum(sq_dists, 0.0, out=sq_dists)
    return float(np.exp(-gamma * sq_dists).sum())


def mmd(X, prototypes, gamma):
    """Compute the squared maximum mean discrepancy between prototypes and data.

    With p prototypes z, d datapoints x and the RBF kernel
    ``k(a, b) = exp(-gamma * ||a - b||**2)``::

        mmd2 = 1/p**2  * sum_ij k(z_i, z_j)
             - 2/(p*d) * sum_ij k(z_i, x_j)
             + 1/d**2  * sum_ij k(x_i, x_j)

    Parameters
    ----------
    X : DataFrame or 2-D array-like
        Datapoints, one per row; all columns must be numeric.
    prototypes : DataFrame or 2-D array-like
        Prototypes, one per row, with the same columns as ``X``.
    gamma : float
        RBF kernel width.

    Returns
    -------
    float
        The squared MMD; values close to 0 mean the prototypes follow the
        distribution of the datapoints well.

    Raises
    ------
    ValueError
        If ``X`` or ``prototypes`` has no rows (the averages are undefined).
    """
    data = np.asarray(X, dtype=float)
    protos = np.asarray(prototypes, dtype=float)

    n_data = len(data)
    n_protos = len(protos)
    if n_data == 0 or n_protos == 0:
        raise ValueError("mmd needs at least one datapoint and one prototype")

    # Sum of prototype proximities
    spp = _rbf_kernel_sum(protos, protos, gamma)

    # Sum of prototype-datapoint proximities
    spdp = _rbf_kernel_sum(protos, data, gamma)

    # Sum of datapoint proximities
    sdp = _rbf_kernel_sum(data, data, gamma)

    # Each sum is averaged over the number of pairs it actually contains
    mmd2 = (
        spp / n_protos**2
        - 2.0 * spdp / (n_protos * n_data)
        + sdp / n_data**2
    )

    return mmd2

In [93]:
def add_prototype(X, prototypes, gamma):
    """Move the best remaining datapoint from ``X`` into ``prototypes``.

    Every row of ``X`` is tried in turn as the next prototype: it is appended
    to a copy of ``prototypes`` and removed from a copy of ``X``, and ``mmd``
    is evaluated on the pair. The row giving the lowest value is then appended
    to ``prototypes`` and dropped from ``X``. Both frames are modified in
    place, so the caller sees the change.

    Parameters
    ----------
    X : DataFrame
        Remaining candidate datapoints; loses the chosen row.
    prototypes : DataFrame
        Prototypes chosen so far, with the same columns as ``X``; gains one
        row labelled ``len(prototypes.index)``.
    gamma : float
        Kernel width forwarded to ``mmd``.

    Raises
    ------
    ValueError
        If ``X`` has no rows left.
    """
    if len(X.index) == 0:
        raise ValueError("X has no datapoints left to promote to a prototype")

    # Label for the new row; relies on prototypes being labelled 0..n-1,
    # which holds when rows are only ever appended through this function.
    new_label = len(prototypes.index)

    # Best prototype label (lowers mmd most)
    best_label = None
    min_mmd = float('+inf')

    if len(X.index) == 1:
        # A single candidate leaves nothing to compare, and removing it would
        # leave no datapoints to measure the discrepancy against.
        best_label = X.index[0]
    else:
        for label in X.index:
            # Trial prototype set: current prototypes plus this candidate
            trial_prototypes = prototypes.copy()
            trial_prototypes.loc[new_label] = X.loc[label]

            # drop() returns a new frame, so X itself is untouched here
            remaining = X.drop(label)

            # Calculating mmd of the trial prototype list
            trial_mmd = mmd(remaining, trial_prototypes, gamma)

            # Checking if the trial mmd is lower
            if best_label is None or trial_mmd < min_mmd:
                min_mmd = trial_mmd
                best_label = label

    # Making the best datapoint a prototype in the caller's frames.
    prototypes.loc[new_label] = X.loc[best_label]
    # Must be an in-place drop: rebinding a local name (X = X.drop(...))
    # would leave the caller's frame unchanged.
    X.drop(index=best_label, inplace=True)

In [97]:
def find_prototypes(X, m=10, gamma=0.1):
    """Greedily pick ``m`` prototypes from the dataset ``X``.

    Starting from an empty prototype set, ``add_prototype`` is called until
    ``m`` rows have been collected; each addition is printed as it happens.

    Parameters
    ----------
    X : DataFrame
        Dataset to choose from, one datapoint per row, numeric columns.
        The search runs on an internal copy.
    m : int, default 10
        Number of prototypes to select (hyperparameter).
    gamma : float, default 0.1
        Kernel width passed on to ``add_prototype`` (hyperparameter).

    Returns
    -------
    DataFrame
        The chosen prototypes, labelled 0..m-1, with the columns of ``X``.

    Raises
    ------
    ValueError
        If ``m`` exceeds the number of rows in ``X``.
    """
    if m > len(X.index):
        raise ValueError(
            f"cannot select {m} prototypes from {len(X.index)} datapoints"
        )

    # Private working copy, so any in-place row removal done while selecting
    # prototypes cannot touch the caller's frame.
    candidates = X.copy()

    # Chosen prototypes: an empty float frame with the same columns as X
    # (built directly, without first allocating a full-size NaN frame).
    cp = pd.DataFrame(columns=X.columns, dtype=float)

    # While number of prototypes is under m, find another
    while cp.shape[0] < m:
        add_prototype(candidates, cp, gamma)
        print(f'Added a prototype: {cp.loc[len(cp.index) - 1]}')

    return cp

In [100]:
# Run the prototype search with the defaults declared by find_prototypes (m=10, gamma=0.1).
# A copy of df is handed over, so the search works on a separate object from df.
prototypes = find_prototypes(df.copy())

KeyboardInterrupt: 

In [96]:
# Show the selected prototypes.
# NOTE(review): the output saved with this cell appears to come from an earlier,
# out-of-order execution (its execution count is lower than that of the cell
# assigning `prototypes`) — re-run it after that cell has finished.
print(prototypes)

     Age  Gender  Days Abdominal Pain  Temperature  Tenderness   
0    NaN     NaN                  NaN          NaN         NaN  \
1    NaN     NaN                  NaN          NaN         NaN   
2    NaN     NaN                  NaN          NaN         NaN   
3    NaN     NaN                  NaN          NaN         NaN   
4    NaN     NaN                  NaN          NaN         NaN   
..   ...     ...                  ...          ...         ...   
419  NaN     NaN                  NaN          NaN         NaN   
420  NaN     NaN                  NaN          NaN         NaN   
421  NaN     NaN                  NaN          NaN         NaN   
422  NaN     NaN                  NaN          NaN         NaN   
423  NaN     NaN                  NaN          NaN         NaN   

     Leukocytes [10^9/l]  CRP [mg/l]  Number of Sonographies preop   
0                    NaN         NaN                           NaN  \
1                    NaN         NaN                           NaN 