In [1]:
import numpy as np
import pandas as pd
from random import randrange

# Agglomerative Clustering

## Main Class

In [2]:
#Class of AgglomerativeClustering providing the option to use single, complete and average linkage

class AgglomerativeClustering:
    
    def __init__(self,n_clusters=2,linkage="single"):
        
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit_predict(self,X):
        
        n=X.shape[0]     
        d=self.d_matrix(X) #distance matrix between data points 
        cluster=self.get_initial_cluster(n) #inividual clusters are assigned to each data point
        s=set(range(n))     
        for _ in range(n-self.n_clusters): 
            p,q=np.unravel_index(np.argmin(d, axis=None), d.shape)
            t_set=s-{p,q} 
            d=self.update_d(d,p,q,t_set,self.linkage) #distances are updated
            cluster=self.update_cluster(cluster,p,q) #clusters are combined
            s=s-{max(p,q)} 
        decor_l=[]
        for v in cluster.values():
            decor_l.append(v)
        
        self.labels_= self.clustertolabels(decor_l)
        return self.labels_

    def clustertolabels(self,clusters):
        
        ln = sum([len(c) for c in clusters])
        labels = np.zeros(ln,dtype = np.int)
        ind = -1
        for c in clusters:
            ind+=1
            for i in c:
                labels[i] = ind
        return labels


    def d_matrix(self,data):
        
        n=data.shape[0]  
        d=np.empty(shape=[n,n]) 
        d.fill(np.inf)  
        for i in range(n-1):
            for j in range(i+1,n):
                d[i,j]=distance(data[i],data[j]) 
        return d

    
    def get_initial_cluster(self,n):
        
        c={}
        for i in range(n):
            c[i]={i}   
        return c

   
    def update_d(self,d,p,q,t_set,linkage):
        
        for i in t_set: 
            
            u,v=min(i,p),max(i,p) 
            w,x=min(i,q),max(i,q)
            if(linkage=="complete"):
                t=max(d[u,v],d[w,x])
            elif(linkage=="average"):
                t=(d[u,v]+d[w,x])/2
            else:     
                t=min(d[u,v],d[w,x])
        
            d[u,v]=t
            d[w,x]=t
            
        m_pq=max(p,q)
        d[m_pq,:]=np.inf
        d[:,m_pq]=np.inf
        return d


    def update_cluster(self,c,p,q):
        
        i=c.pop(max(p,q)) 
        m=min(p,q)
        c[m]=c[m].union(i) 
        return c

### Helper Function

In [3]:
def distance(pt1,pt2):

        if(len(pt1)!=len(pt2)):
            print("Dimensions of the points are not equal")
            return  
        dim=len(pt1)  
        s=0
        for i in range(dim):
            s+=(pt1[i]-pt2[i])**2 
        dist=np.sqrt(s)  
        return dist

# KNN

## Main Function

In [4]:
def cartesian_distance(sample, inputs):
    
    diff = np.subtract(sample, inputs)
    sum_pow = np.sum(np.power(diff, 2), axis=1)
    
    return np.power(sum_pow, 0.5)

def classify(k, sorted_labels):
    
    k_neighbors = sorted_labels[:k]
    target = np.unique(k_neighbors)
    count = []
    for i in target:
        x = np.count_nonzero(k_neighbors == i)
        count.append(x)

    return target[np.argmax(count)]

def KNN_classification(sample, k, X, y):

    labels = list(y)
    inputs = list(X)

    cart_distance = cartesian_distance(sample, inputs) # get the cartesian distance from each data point

    labeled_cart = np.vstack((cart_distance, labels)) # create a 2D array with the 1st column being the above distances and the second corresponding label

    sorted_cart = labeled_cart.T[labeled_cart.T[:, 0].argsort()] # sort in an ascending manner based on the distances

    sorted_labels = sorted_cart.T[1]

    return classify(k, sorted_labels)

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual))

### Helper function that generates data set with features extracted from clustering  

In [5]:
def dataset(df, n, linkage):
    
    df = df.drop("target", axis=1) #Target column is dropped
    df = df.sample(frac=1).reset_index(drop = True) #Dataframe is randomly sampled
    
    train_set = df.iloc[:150, :].reset_index(drop = True) #split data set into train and test data sets
    test_set = df.iloc[150:, :].reset_index(drop = True)
    
    clustering = AgglomerativeClustering(n_clusters=n, linkage=linkage) #Instatiation of agglomerative clustering class
    pred_clusters = clustering.fit_predict(train_set.values) #training data divided into n clusters
    
    train_set['labels'] = pred_clusters #cluster ID's are assigned as lables to training data
    
    for i in range(n): #Loop iterates the number of times as the number of clusters
        
        indexes = np.where(train_set['labels'] == i) #indexes are fetched where training data has labels pertaining to cluster ID
        df_i = train_set.iloc[indexes[0], :-1].reset_index(drop = True) #creating a dataframe using the fetched indexes
        centroid = list(df_i.mean()) #Cluster representative is determined as centroid for the training set
        dataset = train_set.iloc[:,:-1] 
        column_name = "cluster" + str(i)

        train_set[column_name] = [np.power(np.sum(np.square(row-centroid)), 0.5) for row in dataset.values] #Features are added to the training dataset based on cluster representative i.e. centroid. each training data point will get extra feature values that is calculated based on the similarity between data point and cluster.
        train_set[column_name] = train_set[column_name] - train_set[column_name].min() #Scaled column using the equation for min max scaler
        train_set[column_name] = train_set[column_name]/train_set[column_name].max()

        test_set[column_name] = [np.power(np.sum(np.square(row-centroid)), 0.5) for row in test_set.values] #unlabeled test data points are transformed into same feature space as the training data set.
        test_set[column_name] = test_set[column_name] - test_set[column_name].min()
        test_set[column_name] = test_set[column_name]/test_set[column_name].max()
    
    train_set['labels'] = train_set.pop('labels')
    
    return train_set, test_set

# Kfold to evaluate KNN model

In [6]:
def cross_validation_split(dataset, folds):
    
    dataset_split = []
    df_copy = dataset
    fold_size = int(df_copy.shape[0] / folds)
        
    for i in range(folds): # for loop to save each fold
        fold = []
        
        while len(fold) < fold_size: # while loop to add elements to the folds
            
            r = randrange(df_copy.shape[0]) # select a random element
            index = df_copy.index[r] # determine the index of this element 
            fold.append(df_copy.loc[index].values.tolist())# save the randomly selected line 
            df_copy = df_copy.drop(index) # delete the randomly selected line from
  
        dataset_split.append(np.asarray(fold)) # save the fold   
            
    return dataset_split 

def kfoldCV(dataset, f, k):
    data=cross_validation_split(dataset,f)
    result=[]

    for i in range(f):
        r = list(range(f))
        r.pop(i)
        for j in r :
            if j == r[0]:
                cv = data[j]
            else:    
                cv=np.concatenate((cv,data[j]), axis=0)
  
        predictions = []
        for sample in data[i][:,:-1]:
            prediction_1 = KNN_classification(sample, k, cv[:,:-1], cv[:,-1])
            predictions.append(prediction_1)
            
        #print(len(predictions), len(cv[:,-1]))
        acc = accuracy_metric(data[i][:,-1], predictions)   
        result.append(acc) 
        
    return result

In [7]:
df = pd.read_csv("Seed_Data.csv")
df.head(3)

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,target
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,0
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,0


In [8]:
train_set, test_set = dataset(df, 5, "complete")

### Training data with additional features based on cluster representatives 

In [9]:
train_set.head(3)

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,cluster0,cluster1,cluster2,cluster3,cluster4,labels
0,16.16,15.33,0.8644,5.845,3.395,4.266,5.795,0.067598,0.328605,0.527411,0.498841,0.478774,0
1,19.13,16.31,0.9035,6.183,3.902,2.109,5.924,0.450274,0.02075,0.852825,0.756658,0.397873,1
2,17.99,15.86,0.8992,5.89,3.694,2.068,5.837,0.270181,0.090756,0.748573,0.629697,0.398456,1


### Test data points are transformed into the same feature space as the training data set, using cluster representatives obtained from training data

In [10]:
test_set.head(3)

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,cluster0,cluster1,cluster2,cluster3,cluster4
0,12.89,13.77,0.8541,5.495,3.026,6.185,5.316,0.676038,0.769207,0.062635,0.177446,0.640257
1,11.23,12.63,0.884,4.902,2.879,2.269,4.703,0.858417,0.906614,0.287402,0.108402,1.0
2,12.73,13.75,0.8458,5.412,2.882,3.533,5.067,0.53792,0.702306,0.192287,0.0,0.674996


In [11]:
n_clusters = [3,4,5,6,7]
knn = [3,4,5,6]
linkages = ['complete', 'single', 'average']
Accuracy_linkage = []

    
for linkage in linkages:
    
    print(f"\033[1m-------------------------For linkage = {linkage}--------------------------\033[0m\n")
    Accuracy = []
    
    for n in n_clusters:
        
        train_set, test_set = dataset(df, n, linkage)
        print(f"      \033[1mFor clusters n = {n}\033[0m")
        acc_clusters = []
        
        for k in knn:
            
            result = kfoldCV(train_set, 10, k)
            acc = sum(result)/len(result)
            print(f"       Accuracy using {k} knn: {acc}")
            acc_clusters.append(acc)
            
        Accuracy.append(sum(acc_clusters)/len(acc_clusters))
        
        print(f"\n       \033[1mAccuracy for {n} clusters : {sum(acc_clusters)/len(acc_clusters)} \033[0m")
        print("\n") 
    print(f"             \033[1mGood number of clusters would be: {n_clusters[np.argmax(Accuracy)]}\033[0m\n\n")
    
    Accuracy_linkage.append(max(Accuracy))
    
print(f"                           \033[1mAccuracy using {linkage} : {max(Accuracy_linkage)}\033[0m")

[1m-------------------------For linkage = complete--------------------------[0m

      [1mFor clusters n = 3[0m
       Accuracy using 3 knn: 0.9933333333333334
       Accuracy using 4 knn: 0.9800000000000001
       Accuracy using 5 knn: 0.9800000000000001
       Accuracy using 6 knn: 0.9733333333333334

       [1mAccuracy for 3 clusters : 0.9816666666666667 [0m


      [1mFor clusters n = 4[0m
       Accuracy using 3 knn: 0.9400000000000001
       Accuracy using 4 knn: 0.9733333333333334
       Accuracy using 5 knn: 0.9533333333333335
       Accuracy using 6 knn: 0.9400000000000001

       [1mAccuracy for 4 clusters : 0.9516666666666668 [0m


      [1mFor clusters n = 5[0m
       Accuracy using 3 knn: 0.9933333333333334
       Accuracy using 4 knn: 0.9533333333333334
       Accuracy using 5 knn: 0.9733333333333334
       Accuracy using 6 knn: 0.9600000000000002

       [1mAccuracy for 5 clusters : 0.97 [0m


      [1mFor clusters n = 6[0m
       Accuracy using 3 knn: 0.

# Predictions on test_set using best parameters 

In [12]:
train_set, test_set = dataset(df, 3, 'average')

In [13]:
predictions = []
X = train_set.iloc[:,:-1].values
y = train_set.iloc[:,-1].values
for sample in test_set.values:
    prediction_1 = KNN_classification(sample, 5, X, y)
    predictions.append(prediction_1)

In [14]:
print(predictions)

[0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 2.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0]


# Using KNN on original dataset

In [15]:
df = pd.read_csv("Seed_Data.csv")
df = df.sample(frac=1).reset_index(drop = True)
train_set = df.iloc[:150, :].reset_index(drop = True)
test_set = df.iloc[150:, :].reset_index(drop = True)

In [16]:
X = train_set.iloc[:,:-1].values
y = train_set.iloc[:,-1].values
X_test = test_set.iloc[:,:-1].values
y_test = test_set.iloc[:,-1].values

In [17]:
knn = [3,4,5,6]
Accuracy = []
for k in knn:
    result = kfoldCV(train_set.iloc[:150,:], 10, k)
    acc = sum(result)/len(result)
    print(f"Accuracy using {k} knn: {acc}")
    Accuracy.append(acc)
print(f"\n\n     Accuracy using KNN on original dataset: {sum(Accuracy)/len(Accuracy)}")

Accuracy using 3 knn: 0.9000000000000001
Accuracy using 4 knn: 0.8866666666666667
Accuracy using 5 knn: 0.9133333333333334
Accuracy using 6 knn: 0.9133333333333334


     Accuracy using KNN on original dataset: 0.9033333333333334


In [18]:
predictions = []
for sample in X_test:
    prediction_1 = KNN_classification(sample, 5, X, y)
    predictions.append(prediction_1)

In [19]:
accuracy_metric(y_test, predictions)

0.8833333333333333