# Given a dataset of documents with content from 5 different fields. To cluster the documents, I have used the KMeans algorithm with k = 5, run for 10 iterations.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import normalize
import re
import os
from nltk.corpus import stopwords
from collections import Counter
import math

# To measure euclidean distance

In [3]:
def euclidean_distance(x, c):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x, c : one-dimensional sequences or NumPy arrays of numbers
        The two points to compare. They must have the same length.

    Returns
    -------
    float
        ``sqrt(sum((x[i] - c[i]) ** 2))``; ``0.0`` for two empty vectors.

    Raises
    ------
    ValueError
        If ``x`` and ``c`` do not have the same shape.
    """
    x_arr = np.asarray(x, dtype=float)
    c_arr = np.asarray(c, dtype=float)
    # Fail loudly instead of letting NumPy broadcast mismatched inputs.
    if x_arr.shape != c_arr.shape:
        raise ValueError(
            "x and c must have the same shape, got %s and %s"
            % (x_arr.shape, c_arr.shape)
        )
    diff = x_arr - c_arr
    # A single vectorised dot product replaces the per-element Python loop.
    return math.sqrt(float(np.dot(diff, diff)))

# Loading data

After loading the data three fields have been used.
"text" - The content of the documents is stored in this field
"filename" - The name of the file is stored in this field
"label" - The label of the documents is stored in this field

In [4]:
# Read every file in the dataset directory into one DataFrame with the
# columns 'filename', 'text' and 'label'.
base_dir = "/media/indranil/New Volume/second sem/SMAI/Assignment 2/q6/data/dataset/"
rows = []
for filename in os.listdir(base_dir):
    path = os.path.join(base_dir, filename)
    with open(path, "r", encoding='latin1') as file:
        text = file.read()
    # The label is the part of the file name between the first "_" and the
    # first ".", e.g. "100_3.txt" -> "3".
    label = filename[filename.find("_")+1:filename.find(".")]
    rows.append({'filename': filename, 'text': text, 'label': label})
# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
df = pd.DataFrame(rows, columns=['filename', 'text', 'label'])

# Data pre-processing

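The data is preprocessed: all content is converted to lower case, punctuation is removed, runs of whitespace are collapsed to single spaces, and leading/trailing whitespace is stripped. Note that digits are kept by the code below, because `\w` also matches digits.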

In [5]:
# Normalise the raw text: lower-case it, keep only runs of word characters
# (letters, digits and underscore, as matched by \w) and join them with
# single spaces. This drops punctuation and leaves no leading or trailing
# whitespace, so no separate punctuation-removal or strip pass is needed.
# The pattern is a raw string so that "\w" is not an invalid string escape.
df["text"] = df["text"].str.lower().apply(
    lambda x: " ".join(re.findall(r'\w+', x))
)

# Vectorizer

For feature extraction, the TF-IDF vectorizer provided by sklearn is used. The resulting vectors are normalized after vectorization because the documents vary in the size of their content, so the word weights of each document are brought to a common scale.

In [6]:
# Fit a TF-IDF vocabulary on the document texts (English stop words are
# dropped) and turn every document into a sparse TF-IDF row.
tfvect = TfidfVectorizer(stop_words='english')
tfdf = tfvect.fit_transform(df['text'])
# Scale each row to unit L2 length (these are the defaults of `normalize`,
# spelled out), then densify so rows can be indexed as plain NumPy arrays.
tfdfnorm = normalize(tfdf, norm='l2', axis=1)
X_train = tfdfnorm.toarray()

# Initialization of k and iteration

In [10]:
# k: number of clusters; iteration: number of KMeans passes to run.
k, iteration = 5, 10

# 5 centroids are chosen at random from the train data

In [9]:
# Draw k distinct row positions at random and use those training rows as
# the starting centroids.
centroids = X_train[np.random.choice(len(X_train), size=k, replace=False)]

# Initialization of clusters

In [213]:
# One cluster id per training row; 0 means "not assigned yet".
clusters = np.zeros(len(X_train), dtype=float)

# Implementation of KMeans algorithm

In [214]:
# KMeans (Lloyd's algorithm): alternate between assigning every training row
# to its nearest centroid and moving each centroid to the mean of its rows.
# The number of clusters is taken from the centroid array instead of being
# hard-coded, and cluster ids are 1-based: row j of `centroids` is cluster
# j + 1.
n_clusters = len(centroids)
previous_assignment = None
for it in range(iteration):
    # Assignment step: distance of every row to every centroid, computed one
    # centroid at a time so only one (rows x features) temporary is alive.
    distances = np.empty((X_train.shape[0], n_clusters))
    for j in range(n_clusters):
        distances[:, j] = np.linalg.norm(X_train - centroids[j], axis=1)
    # argmin returns the first minimum, so ties go to the lowest cluster id.
    assignment = np.argmin(distances, axis=1) + 1
    clusters[:] = assignment

    # If no row changed cluster since the previous pass, the centroids
    # would be recomputed to the same values: stop early.
    if previous_assignment is not None and np.array_equal(assignment, previous_assignment):
        break
    previous_assignment = assignment

    # Update step: move each centroid to the mean of its members. A cluster
    # that received no rows keeps its old centroid, because the mean of an
    # empty array is NaN and would make every later distance NaN.
    for j in range(n_clusters):
        members = X_train[assignment == j + 1]
        if members.shape[0] > 0:
            centroids[j] = members.mean(axis=0)

# Creation of test data

In [11]:
# Keep only the first 50 documents; `df` itself is overwritten, so the
# remaining rows are no longer available through it.
df = df.head(50)

In [216]:
# Split the retained rows into raw texts and their label strings.
X_test = df['text']
Y_temp_test = df['label']

In [217]:
# Convert the label column to an array, then take the first character of
# every label string as its integer class.
Y_temp_test = np.array(Y_temp_test)
Y_test = [int(Y_temp_test[pos][0]) for pos in range(Y_temp_test.shape[0])]

# Vectorizing and normalizing the test data

In [219]:
# Vectorise the test texts with the vocabulary already fitted on the
# training data (transform only, no re-fit), then L2-normalise each row.
tfdf = tfvect.transform(X_test)
tfdfnorm = normalize(tfdf)
# Densify the *normalised* matrix; the original densified the raw `tfdf`
# and left `tfdfnorm` unused.
X_test = tfdfnorm.toarray()

# Prediction of test data

In [220]:
# Assign every row to its nearest centroid. `predict` is a list of 1-based
# cluster ids (row j of `centroids` is cluster j + 1); ties go to the lowest
# id because argmin returns the first minimum.
# NOTE(review): the heading says "test data", but the rows iterated here are
# those of X_train (X_test is never used). This is kept as-is because the
# mapping step below indexes `predict` by training-row position - confirm
# the intended input before switching to X_test.
distances = np.column_stack(
    [np.linalg.norm(X_train - centroid, axis=1) for centroid in centroids]
)
predict = (np.argmin(distances, axis=1) + 1).tolist()


# Mapping the cluster of KMeans with the original cluster

In [221]:
# Give every KMeans cluster the label that occurs most often among the rows
# assigned to it, then report that label for each document.
#   value  - the mapped label for every entry of `predict`, in order
#   result - filename -> mapped label for the documents still present in `df`
# NOTE(review): `Y_train` is not defined in the visible part of the notebook;
# it is assumed to hold one true label per row of X_train - confirm.

# Count the true labels per cluster once, instead of rescanning all rows
# for every single document.
votes = {}
for cluster_id, true_label in zip(predict, Y_train):
    votes.setdefault(cluster_id, Counter())[true_label] += 1
majority_label = {
    cluster_id: counts.most_common(1)[0][0]
    for cluster_id, counts in votes.items()
}

value = [majority_label[cluster_id] for cluster_id in predict]
# Pair filenames and labels by position; zip stops at the shorter of the
# two, so a `df` that holds fewer rows than `predict` no longer raises
# KeyError.
result = dict(zip(df['filename'], value))


# Accuracy measurement

In [158]:
# Fraction of rows whose mapped cluster label equals the true label.
# `value` is the per-row list of mapped labels; the original compared the
# `result` dict (a single object once wrapped in an array), which can never
# match any label.
# NOTE(review): `Y_train` is not defined in the visible part of the notebook;
# it is assumed to hold one true label per row of X_train - confirm.
accuracy = np.mean(np.array(value) == np.array(Y_train))
accuracy

0.8510144927536232

# Result

In [223]:
result

{'274_5.txt': '5',
 '100_1.txt': '1',
 '100_2.txt': '2',
 '100_3.txt': '3',
 '100_4.txt': '5',
 '100_5.txt': '5',
 '101_1.txt': '1',
 '101_2.txt': '1',
 '101_3.txt': '1',
 '101_4.txt': '1',
 '101_5.txt': '5',
 '102_1.txt': '1',
 '102_2.txt': '3',
 '102_3.txt': '3',
 '102_4.txt': '2',
 '102_5.txt': '3',
 '103_1.txt': '5',
 '103_2.txt': '2',
 '103_3.txt': '1',
 '103_4.txt': '3',
 '103_5.txt': '1',
 '104_1.txt': '1',
 '104_2.txt': '2',
 '104_3.txt': '5',
 '104_4.txt': '3',
 '104_5.txt': '2',
 '105_1.txt': '1',
 '105_2.txt': '2',
 '105_3.txt': '1',
 '105_4.txt': '3',
 '105_5.txt': '5',
 '106_1.txt': '3',
 '106_2.txt': '2',
 '106_3.txt': '1',
 '106_4.txt': '2',
 '106_5.txt': '5',
 '107_1.txt': '1',
 '107_2.txt': '2',
 '107_3.txt': '3',
 '200_2.txt': '2',
 '200_3.txt': '1',
 '200_4.txt': '1',
 '200_5.txt': '5',
 '201_1.txt': '1',
 '201_2.txt': '2',
 '201_3.txt': '3',
 '201_4.txt': '3',
 '201_5.txt': '1',
 '202_1.txt': '1',
 '202_2.txt': '2'}