In [19]:
import numpy as np
import pandas as pd

# Clustering


In [20]:
class AgglomerativeClustering:
    
    def __init__(self,n_clusters=2,linkage="single"):
        
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit_predict(self,X):
        
        n=X.shape[0]     
        d=self.d_matrix(X) 
        cluster=self.get_initial_cluster(n)
        s=set(range(n))     
        for _ in range(n-self.n_clusters): 
            p,q=np.unravel_index(np.argmin(d, axis=None), d.shape)
            t_set=s-{p,q} 
            d=self.update_d(d,p,q,t_set,self.linkage) 
            cluster=self.update_cluster(cluster,p,q) 
            s=s-{max(p,q)} 
        decor_l=[]
        for v in cluster.values():
            decor_l.append(v)
        
        self.labels_= self.clustertolabels(decor_l)
        return self.labels_

    def clustertolabels(self,clusters):
        
        ln = sum([len(c) for c in clusters])
        labels = np.zeros(ln,dtype = np.int)
        ind = -1
        for c in clusters:
            ind+=1
            for i in c:
                labels[i] = ind
        return labels


    def d_matrix(self,data):
        
        n=data.shape[0]  
        d=np.empty(shape=[n,n]) 
        d.fill(np.inf)  
        for i in range(n-1):
            for j in range(i+1,n):
                d[i,j]=distance(data[i],data[j]) 
        return d

    
    def get_initial_cluster(self,n):
        
        c={}
        for i in range(n):
            c[i]={i}   
        return c

   
    def update_d(self,d,p,q,t_set,linkage):
        
        for i in t_set: 
            
            u,v=min(i,p),max(i,p) 
            w,x=min(i,q),max(i,q)
            if(linkage=="complete"):
                t=max(d[u,v],d[w,x])
            elif(linkage=="average"):
                t=(d[u,v]+d[w,x])/2
            else:     
                t=min(d[u,v],d[w,x])
        
            d[u,v]=t
            d[w,x]=t
            
        m_pq=max(p,q)
        d[m_pq,:]=np.inf
        d[:,m_pq]=np.inf
        return d


    def update_cluster(self,c,p,q):
        
        i=c.pop(max(p,q)) 
        m=min(p,q)
        c[m]=c[m].union(i) 
        return c
def distance(pt1,pt2):

        if(len(pt1)!=len(pt2)):
            print("Dimensions of the points are not equal")
            return  
        dim=len(pt1)  
        s=0
        for i in range(dim):
            s+=(pt1[i]-pt2[i])**2 
        dist=np.sqrt(s)  
        return dist

# KNN

In [21]:
#calculate cartesian distance
def dist_cartesian(sample, inputs):
    
    diff = sample - inputs
    sum_pow = np.sum(diff**2, axis=1)
    
    return sum_pow**0.5
    
def lbl_classify(k, sorted_labels):
    
    k_neighbors = sorted_labels[:k]
    target = np.unique(k_neighbors)
    count = []
    for i in target:
        x = np.count_nonzero(k_neighbors == i)
        count.append(x)

    return target[np.argmax(count)]

def KNN_classification(sample, k, X, y):

    labels = list(y)
    inputs = list(X)

    cart_distance = dist_cartesian(sample, inputs)
    labeled_cart = np.vstack((cart_distance, labels))
    sorted_cart = labeled_cart.T[labeled_cart.T[:, 0].argsort()]
    sorted_labels = sorted_cart.T[1]

    return lbl_classify(k, sorted_labels)
# acuuracy function
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual))

# K fold

In [22]:
from random import randrange
def splits_CV(dataset_KNN, folds):
    
    dataset_KNN_split = []
    df_copy = dataset_KNN
    fold_size = int(df_copy.shape[0] / folds)
        
    for i in range(folds):
        fold = []
        while len(fold) < fold_size:
            r = randrange(df_copy.shape[0]) 
            index = df_copy.index[r]
            fold.append(df_copy.loc[index].values.tolist())
            df_copy = df_copy.drop(index)  
        dataset_KNN_split.append(np.asarray(fold))
            
    return dataset_KNN_split 


In [23]:
def k_Fold(dataset_KNN, f, k):
    data=splits_CV(dataset_KNN,f)
    result=[]

    for i in range(f):
        r = list(range(f))
        r.pop(i)
        for j in r :
            if j == r[0]:
                cv = data[j]
            else:    
                cv=np.concatenate((cv,data[j]), axis=0)
  
        predictions = []
        for sample in data[i][:,:-1]:
            prediction_1 = KNN_classification(sample, k, cv[:,:-1], cv[:,-1])
            predictions.append(prediction_1)
            
        acc = accuracy_metric(data[i][:,-1], predictions)   
        result.append(acc) 
        
    return result

In [24]:
def dataset_KNN(df, n, linkage):
    
    df = df.drop("target", axis=1)
    #shuffle dataframe
    df = df.sample(frac=1).reset_index(drop = True)

    train_set = df.iloc[:170, :].reset_index(drop = True)
    test_set = df.iloc[170:, :].reset_index(drop = True)

    clustering = AgglomerativeClustering(n_clusters=n, linkage=linkage)
    pred_clusters = clustering.fit_predict(train_set.values)
    
    train_set['labels'] = pred_clusters
    
    for i in range(n):
        
        indexes = np.where(train_set['labels'] == i)
        df_i = train_set.iloc[indexes[0], :-1].reset_index(drop = True)
        centroid = list(df_i.mean())
        dataset_KNN = train_set.iloc[:,:-1]
        column_name = "cluster_feature_" + str(i)
        #distance of datapoints
        train_set[column_name] = [np.sum(np.square(row-centroid)) for row in dataset_KNN.values]
        #min max scaling
        train_set[column_name] = train_set[column_name]/train_set[column_name].max()
        test_set[column_name] = [np.sum(np.square(row-centroid)) for row in test_set.values]
        test_set[column_name] = test_set[column_name]/test_set[column_name].max()
    
    train_set['labels'] = train_set.pop('labels')
    #return modified train and test set
    return train_set, test_set
  

In [25]:
df = pd.read_csv("Seed_Data.csv")

In [26]:
df.head()

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,target
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,0
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,0
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,0
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,0


In [27]:
'''
numbers of clusters
number of neighbours
linkages(single, complete, average)
'''
n_clusters = [3,4,5,6,7]
knn = [3,5,7,9,11] 
linkages = ['single', 'complete', 'average']
Accuracy_linkage = []

In [28]:
for linkage in linkages:
    print("\u0332".join(f"Scenario for {linkage} Linkage :\n"))
    Accuracy = []
    for n in n_clusters:
        
        train_set, test_set = dataset_KNN(df, n, linkage)
        #df - dataframe
        #n - clusters
        #linkage = single, complete, averagee
        train_set.to_csv("train_set.csv")
        test_set.to_csv("test_set.csv")
        print(f"No of clusters :{n}")
        acc_clusters = []
        for k in knn:
            result = k_Fold(train_set, 8, k) #fold accuracy
            #fold value = 8
            #k = neighbors
            acc = sum(result)/len(result) #KNN accuracy based on  folds
            print(f"Accuracy for {n} clusters using {k} nearest data points: {acc}")
            acc_clusters.append(acc)
        Accuracy.append(sum(acc_clusters)/len(acc_clusters)) #average cluster accuracy
        acc = (sum(acc_clusters)/len(acc_clusters))*100 #percentage
        print(f"Accuracy : {acc} %   ") 
     
    print(f"Best scenario for no of clusters : {n_clusters[np.argmax(Accuracy)]} \n\n") # for best case 
    Accuracy_linkage.append(max(Accuracy))
    

S̲c̲e̲n̲a̲r̲i̲o̲ ̲f̲o̲r̲ ̲s̲i̲n̲g̲l̲e̲ ̲L̲i̲n̲k̲a̲g̲e̲ ̲:̲

No of clusters :3


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Accuracy for 3 clusters using 3 nearest data points: 0.9880952380952381
Accuracy for 3 clusters using 5 nearest data points: 0.9880952380952381
Accuracy for 3 clusters using 7 nearest data points: 0.9761904761904763
Accuracy for 3 clusters using 9 nearest data points: 0.9583333333333334
Accuracy for 3 clusters using 11 nearest data points: 0.9583333333333334
Accuracy : 97.3809523809524 %   
No of clusters :4
Accuracy for 4 clusters using 3 nearest data points: 0.9880952380952381
Accuracy for 4 clusters using 5 nearest data points: 0.9880952380952381
Accuracy for 4 clusters using 7 nearest data points: 0.9642857142857143
Accuracy for 4 clusters using 9 nearest data points: 0.9642857142857143
Accuracy for 4 clusters using 11 nearest data points: 0.9642857142857142
Accuracy : 97.3809523809524 %   
No of clusters :5
Accuracy for 5 clusters using 3 nearest data points: 0.9761904761904763
Accuracy for 5 clusters using 5 nearest data points: 0.9761904761904763
Accuracy for 5 clusters using 7 

In [29]:
train_set.head()

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,cluster_feature_0,cluster_feature_1,cluster_feature_2,cluster_feature_3,cluster_feature_4,cluster_feature_5,cluster_feature_6,labels
0,14.11,14.18,0.882,5.541,3.221,2.754,5.038,0.005152,0.329711,0.312028,0.159162,0.515252,0.365653,0.313451,0
1,14.34,14.37,0.8726,5.63,3.19,1.313,5.15,0.023709,0.3555,0.291783,0.229295,0.613841,0.484573,0.319621,0
2,19.18,16.63,0.8717,6.369,3.681,3.357,6.229,0.514152,0.00759,0.018362,0.675569,0.46807,0.746136,0.697348,1
3,19.51,16.71,0.878,6.366,3.801,2.962,6.185,0.566277,0.024617,0.006652,0.706756,0.42707,0.74964,0.664003,2
4,12.73,13.75,0.8458,5.412,2.882,3.533,5.067,0.084872,0.507964,0.456413,0.013898,0.355259,0.100908,0.095821,3


In [30]:
test_set.head()

Unnamed: 0,A,P,C,LK,WK,A_Coef,LKG,cluster_feature_0,cluster_feature_1,cluster_feature_2,cluster_feature_3,cluster_feature_4,cluster_feature_5,cluster_feature_6
0,10.82,12.83,0.8256,5.18,2.63,4.853,5.089,0.33671,0.974303,0.926311,0.082955,0.959146,0.267475,0.309169
1,12.15,13.45,0.8443,5.417,2.837,3.638,5.338,0.122312,0.692189,0.63103,0.079049,0.671361,0.288233,0.27027
2,12.08,13.23,0.8664,5.099,2.936,1.415,4.961,0.131036,0.815678,0.661187,0.141963,0.841779,0.438289,0.247127
3,19.11,16.26,0.9081,6.154,3.93,2.936,6.079,0.401317,0.024588,0.031491,0.644162,0.466876,0.77872,0.656493
4,14.7,14.21,0.9153,5.205,3.466,1.767,4.649,0.013639,0.401971,0.310326,0.213739,0.487416,0.457952,0.308272


# Predictions on test_set

In [31]:
train_set, test_set = dataset_KNN(df, 3, 'complete')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [32]:
predictions = []
X = train_set.iloc[:,:-1].values
y = train_set.iloc[:,-1].values
for sample in test_set.values:
    prediction_1 = KNN_classification(sample, 5, X, y)
    predictions.append(prediction_1)
print(predictions)

[1.0, 1.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0]


# KNN on original dataset_KNN

In [33]:
df = pd.read_csv("Seed_Data.csv")
df = df.sample(frac=1).reset_index(drop = True)
train_set = df.iloc[:170, :].reset_index(drop = True)
test_set = df.iloc[170:, :].reset_index(drop = True)

X = train_set.iloc[:,:-1].values
y = train_set.iloc[:,-1].values
X_test = test_set.iloc[:,:-1].values
y_test = test_set.iloc[:,-1].values

In [34]:
knn = [3,5,7,9,11] 
Accuracy = []
for k in knn:
    result = k_Fold(train_set.iloc[:150,:], 10, k)
    acc = sum(result)/len(result)
    print(f"Accuracy using {k} knn: {acc}")
    Accuracy.append(acc)
    av_acc = sum(Accuracy)/len(Accuracy)
print(f"Accuracy using KNN: {av_acc} ")

Accuracy using 3 knn: 0.8800000000000001
Accuracy using 5 knn: 0.8733333333333333
Accuracy using 7 knn: 0.9
Accuracy using 9 knn: 0.8866666666666669
Accuracy using 11 knn: 0.8933333333333333
Accuracy using KNN: 0.8866666666666667 


In [35]:
predictions = []
for sample in X_test:
    prediction_1 = KNN_classification(sample, 5, X, y)
    predictions.append(prediction_1)

In [36]:
accuracy_metric(y_test, predictions)

0.9