In [199]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

In [200]:
readdata = pd.read_csv("../dataset/glass.csv")
labels = readdata['Type']
data = readdata.drop(['Type'],axis=1)
data = data.to_numpy()

In [201]:
def mahalanobis(inp):
    theta = np.mean(inp,axis=0)
    cov = np.cov(inp,rowvar=False)

    dist_out = np.zeros(inp.shape[0])

    for i in range(inp.shape[0]):
        x = inp[i]
        y = x-theta
        dist = np.sqrt(np.dot(np.dot(y.T, np.linalg.inv(cov)),y))
        dist_out[i] = dist

    f = inp.shape[1]
    threshold = np.sqrt(f*np.percentile(dist_out,95))
    outliers = np.where(dist_out > threshold)[0]

    return outliers

In [202]:
def lof(inp,k):
    n = inp.shape[0]
    dist = cdist(inp,inp)

    kinds = np.argsort(dist,axis=0)[1:k+1].T
    kdist = np.take_along_axis(dist,kinds,axis=1)

    reachdist = np.maximum(kdist,np.tile(np.expand_dims(dist.diagonal(),axis=1),(1,k)))
    lrd = k / np.sum(reachdist,axis=1)

    lofout = np.zeros(n)
    for i in range(n):
        ninds = kinds[i]
        nlrd = lrd[ninds]
        lofout[i] = np.mean(nlrd)/lrd[i]

    lofout = lofout.reshape(-1,1)

    km = KMeans(n_clusters=2,n_init=10,random_state=42).fit(lofout)
    outliers = np.argmax(np.abs(km.cluster_centers_ - 1))
    outliers = km.labels_ == outliers

    return outliers

In [203]:
def otsu_thresholding(inp):
    n, f = inp.shape
    mins = np.min(inp, axis=0, keepdims=True)
    maxs = np.max(inp, axis=0, keepdims=True)
    scaled = (inp - mins) / (maxs - mins)

    num_bins = min(n, 256)
    hist, bins = np.histogram(scaled, bins=num_bins, range=(0, 1))
    print(hist.shape)
    hist = hist.reshape(f, -1)

    hist = hist / np.sum(hist, axis=2, keepdims=True)

    cumsum = np.cumsum(hist, axis=2)
    bins = np.linspace(0, 1, num=num_bins)
    prod = hist * bins
    cummean = np.cumsum(prod, axis=2)
    glob = np.sum(prod, axis=2)

    sigma = ((glob.reshape(f, -1) * cumsum - cummean) ** 2) / (cumsum * (1 - cumsum))
    opt = np.argmax(sigma, axis=1)

    return opt


In [204]:
maha = mahalanobis(data)
print(data[maha])

[[ 1.53125 10.73     0.       2.1     69.81     0.58    13.3      3.15
   0.28   ]
 [ 1.51643 12.16     3.52     1.35    72.89     0.57     8.53     0.
   0.     ]
 [ 1.51316 13.02     0.       3.04    70.48     6.21     6.96     0.
   0.     ]
 [ 1.51321 13.       0.       3.02    70.7      6.21     6.93     0.
   0.     ]
 [ 1.51115 17.38     0.       0.34    75.41     0.       6.65     0.
   0.     ]
 [ 1.51831 14.39     0.       1.82    72.86     1.41     6.47     2.88
   0.     ]]


In [205]:
lofs = lof(data,5)
print(data[lofs])

[[ 1.52725 13.8      3.15     0.66    70.57     0.08    11.64     0.
   0.     ]
 [ 1.51299 14.4      1.74     1.54    74.55     0.       7.59     0.
   0.     ]
 [ 1.51115 17.38     0.       0.34    75.41     0.       6.65     0.
   0.     ]
 [ 1.51131 13.69     3.2      1.81    72.81     1.76     5.43     1.19
   0.     ]
 [ 1.52365 15.79     1.83     1.31    70.43     0.31     8.61     1.68
   0.     ]
 [ 1.51831 14.39     0.       1.82    72.86     1.41     6.47     2.88
   0.     ]]


In [206]:
otsu = []
for i in range(1,8):
    indices = np.where(labels==i)[0]
    opt = otsu_thresholding(data[indices])
    otsu.extend(indices[opt])
otsu = np.unique(otsu)

print(otsu)

(70,)


ValueError: cannot reshape array of size 70 into shape (9,newaxis)