## Imports and Declerations:

In [1]:
import pickle
import os
import string
import pandas as pd
import re
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output
import numpy as np
from copy import deepcopy
import random
from nltk.stem import PorterStemmer
from collections import Counter
import joblib
InteractiveShell.ast_node_interactivity = "all"


## Helper Functions:

In [2]:
# Remove Punctuation
def remove_punctuation(word):
    return word.translate(word.maketrans('', '', string.punctuation))
printable = set(string.printable)

# Clean Query Term
def clean_word(word):
    # Case Folding
    ps = PorterStemmer()
    word = word.lower()
    # Filter non-ASCII characters
    word = ''.join(filter(lambda x: x in printable, word))
    #     print(word)
    # Remove Punctuations
    if word != '(' and word != ')':
        word = remove_punctuation(word)
#     print(word)
    if re.match('\d+[A-Za-z]+', word):
        word = re.split('\d+', word)[1]
    if re.match('[A-Za-z]+\d+', word):
        word = re.split('\d+', word)[0]


#     print(word)
    word = ps.stem(word)
    #     print(word)
    return word

In [28]:
DOCUMENTS_PATH = ('data', )
STOPWORD_PATH = ('Stopword-List.txt', )

In [63]:
class JSONDocToVec(object):
    def __init__(self, DOCUMENTS_PATH, STOP_WORD_PATH):
        self.doc_index = {}
        self.documents_path = DOCUMENTS_PATH
        self.stop_word_path = STOP_WORD_PATH
        self.stop_words = self.load_stop_words()
        self.Xindex = []
        self.vocab_index = self.file_extraction_wrapper(extract_vocab=True)
        self.vectors = self.file_extraction_wrapper(extract_vectors=True)
        self.files = {}
        
        self.X = self.vectors[0]
        self.y = self.vectors[1]
        
        data = pd.DataFrame(self.X)
        # # Feature Selection
        # Drop Features with Df < 3
        data.drop([
            col for col, val in data.sum().iteritems() if int(val) <= 3
        ],axis=1,inplace=True)
        data.mul(data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1))),
         axis=1)
        self.data = data
        # Tf - Idf Calculation
        self.idf = data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1)))

    def file_extraction_wrapper(self,
                                extract_vocab=False,
                                extract_vectors=False):
        vocab = set()
        docs = {}
        printable = set(string.printable)
        raw_data = []
        if extract_vectors:
            X = []
            y = []
            Xindex = []
        doc_count = 0
        # Printable characters are
        # 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
        # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c
        ps = PorterStemmer()
        json_files = next(os.walk(os.path.join(self.documents_path)))[2]
        print('dir : ',  list(os.walk(os.path.join(self.documents_path))))
        print(json_files)
        for jfile in json_files:
#             docs_in_c = next(os.walk(os.path.join(self.documents_path, c)))[2]
            if jfile.startswith('test'):
                continue
            print(jfile)
            print(self.documents_path)
            print('filepath : ',(os.path.join(self.documents_path,jfile)))
            
            with open(os.path.join(self.documents_path, jfile),
                          'r') as file1:
                rows = json.load(file1)
                
                for doc in rows:
                    
                    if extract_vectors:
                        doc_vector = np.zeros((len(self.vocab_index)))
#                         doc_name = os.path.join(self.documents_path, c, doc)
                        self.doc_index[doc_count] =doc['id']
                        doc_count+=1
                 
#                         symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
#                         for i in symbols:
#                             line = line.replace(i, ' ')
                    for word in doc['ingredients']:

                        # Case Folding
                        word = word.lower()

                        # Filter non-ASCII characters
                        word = ''.join(
                            filter(lambda x: x in printable, word))

                        if word in self.stop_words:
                            continue

                        # Remove Punctuations
                        word = remove_punctuation(word)

                        if re.match('\d+[A-Za-z]+', word):
                            word = re.split('\d+', word)[1]
                        if re.match('[A-Za-z]+\d+', word):
                            word = re.split('\d+', word)[0]

                        if len(word) == 0 or len(
                                word) == 1 or word == '' or word == ' ':
                            continue

                        word = ps.stem(word)

                        if extract_vocab:
                            vocab.add(word)
                        if extract_vectors:
                            doc_vector[self.vocab_index[word]] += 1

                    if extract_vectors:
                        Xindex.append(doc['id'])
                        X.append(doc_vector)
                        if jfile.startswith('test'):
                            y.append(None)
                        else:
                            y.append(doc['cuisine'])
        if extract_vocab:
            print(f'Vocab Size : {len(vocab)}')
            vocab_list = sorted(list(vocab))
            vocab_hash = dict.fromkeys(vocab_list, 0)
            vocab_index = {
                word: index
                for index, word in enumerate(vocab_list)
            }
            return vocab_index

        if extract_vectors:
            self.Xindex = Xindex
            return (X, y)

    def get_query_vector(self, query_terms):
        ps = PorterStemmer()
        query_vector =pd.Series(self.data.T[0])
        query_terms = [ps.stem(word.lower()) for word in query_terms]
        for term in query_terms:
            if term in self.vocab_index.keys():
                if self.vocab_index[term] in self.idf.index:
                    query_vector.loc[self.vocab_index[term]] += 1
        for index in query_vector.index[query_vector > 0]:
            query_vector.loc[index] *= self.idf.loc[index] 
        return query_vector
        
    def load_stop_words(self):
        stop_words = set()
        with open(self.stop_word_path, 'r') as stop_word_file:
            lines = stop_word_file.readlines()
            for line in lines:
                stop_words.add(line.split('\n')[0])
        return stop_words

In [None]:

dv = JSONDocToVec(DOCUMENTS_PATH=os.path.join(*DOCUMENTS_PATH),
              STOP_WORD_PATH=os.path.join(*STOPWORD_PATH))


dir :  [('data', [], ['test.json', 'train.json'])]
['test.json', 'train.json']
test.json
data
filepath :  data\test.json
train.json
data
filepath :  data\train.json
Vocab Size : 7108
dir :  [('data', [], ['test.json', 'train.json'])]
['test.json', 'train.json']
test.json
data
filepath :  data\test.json
train.json
data
filepath :  data\train.json


In [191]:
# Save Vectors
vectors_file_name = 'VectorSpace'
pickle.dump(dv, open(vectors_file_name , 'wb'))

In [192]:
# # Get Tf Feature Rows
# data = pd.DataFrame(dv.X)
# data.shape
# # Feature Selection
# # Drop Features with Df < 3
# data.drop([
#     col for col, val in pd.DataFrame(dv.X).sum().iteritems() if int(val) <= 3
# ],axis=1,inplace=True)
# data.shape

In [193]:
# # Tf-Idf Calculations
# data.mul(data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1))),
#          axis=1)

## Train Test Split:

In [194]:
data = dv.data.copy()
data['label'] = dv.y
shuffled_data = data

shuffled_data.sample(frac=1)
train_size = 0.8
test_size = 0.2

train_data, test_data = shuffled_data.sample(frac=train_size), shuffled_data.sample(frac=test_size)
X_train, y_train = train_data.loc[:, train_data.columns != 'label'], train_data['label']
X_test, y_test = test_data.loc[:, test_data.columns != 'label'], test_data['label']

Unnamed: 0,1,10,16,22,23,24,30,33,34,35,...,9564,9568,9572,9573,9575,9577,9579,9583,9584,label
569,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rugby
551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,rugby
160,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cricket
503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,rugby
153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cricket
712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,football
210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cricket
390,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,football
672,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis


## Distance Formula:

In [195]:
def euclidian_distance(p1, p2):
    return np.linalg.norm(np.array(p2) - np.array(p1))


def cosine_similarity(p1, p2):
    return ((np.dot(p1, p2)) / (np.linalg.norm(p1) * np.linalg.norm(p2)))

In [196]:
def accuracy(y_test, pred):
    return len([1 for p, y in zip(pred, y_test) if p == y]) / len(pred) * 100

## k-nearest neighbors (KNN):

In [197]:
class KNNClassifier():
    def __init__(self, neighbors=3, distance_formula=euclidian_distance):
        self.distance_formula = distance_formula
        self.neighbors = neighbors
        self.X_train = []
        self.y_train = []

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test, k = 3):
        self.neighbors = k
        pred = []
        for index, test_row in X_test.iterrows():
            print(index)
            clear_output(wait=True)
            if self.distance_formula == euclidian_distance:
                pred.append(
                    self.X_train.apply(
                        (lambda row: self.distance_formula(row, test_row)),
                        axis=1).sort_values(ascending=True))
            else:
                pred.append(
                    self.X_train.apply(
                        (lambda row: self.distance_formula(row, test_row)),
                        axis=1).sort_values(ascending=False))

        new_pred = [x[:self.neighbors] for x in pred]
        label_pred = []
        for indexes in new_pred:
            labels = []
            #     print(indexes)
            for index, value in indexes.items():
                #         print(index)
                #         print(y[index])
                labels.append(self.y_train[index])
            label_pred.append(Counter(labels).most_common(1)[0][0])
        return pd.Series(label_pred)

In [198]:
knn = KNNClassifier(distance_formula=cosine_similarity)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

131


In [199]:
accuracy(y_test, pred)

98.63945578231292

In [201]:
y_actu = pd.Categorical(y_test,categories= ['athletics', 'rugby', 'football', 'tennis', 'cricket'] )
y_pred = pd.Categorical(pred, categories= ['athletics', 'rugby', 'football', 'tennis', 'cricket'])

df_confusion = pd.crosstab(y_actu, y_pred, margins=True, rownames=['actual'], colnames=['predicted'])
df_confusion

predicted,athletics,rugby,football,tennis,cricket,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
athletics,23,0,0,0,0,23
rugby,0,26,0,0,0,26
football,0,0,57,0,0,57
tennis,0,0,0,18,0,18
cricket,0,2,0,0,21,23
All,23,28,57,18,21,147


In [202]:
knn = KNNClassifier(distance_formula=euclidian_distance)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

131


In [203]:
accuracy(y_test, pred)

92.51700680272108

## K-Means Clustering:

In [204]:
# PreProcessing For Kmeans
clustering_data = data
clustering_data['file_names'] = dv.doc_index.values()
clustering_data = clustering_data.sample(frac = 1)
clustering_data_labels, clustering_data_file_names = clustering_data['label'], clustering_data['file_names']
clustering_data = clustering_data.drop(['label', 'file_names'], axis = 1)
clustering_data_input = clustering_data

In [205]:
clustering_data

Unnamed: 0,1,10,16,22,23,24,30,33,34,35,...,9563,9564,9568,9572,9573,9575,9577,9579,9583,9584
404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
from collections import Counter


class KMeansCluster():
    def __init__(self, n_clusters=2, distance_formula=euclidian_distance):
        self.num_clusters = n_clusters
        self.n_clusters = n_clusters
        self.max_iterations = 100
        self.distance_formula = distance_formula

    def get_labels(self, X_train, centroids):
        a1 = pd.DataFrame(centroids).apply(lambda center: X_train.apply(
            (lambda row: self.distance_formula(row, center)), axis=1),
                                           axis=1)
        a2 = []
        print(a1.shape)
        sort_ascending = self.distance_formula == euclidian_distance
        for col in a1:
            if sort_ascending:
#                 print(a1[col].sort_values(ascending=True).index[:5])
                a2.append(a1[col].sort_values(ascending=True).index[0])
            else:
                a2.append(a1[col].sort_values(ascending=False).index[0])
        return a2


#     def nearest_centeroid(self, x, centroids):
#             return pd.DataFrame(centroids).apply(lambda center: euclidian_distance(x, center), axis=1).sort_values(ascending=True).index[0]

    def fit(self, X_train):
        self.X_train = X_train

        self.centroids = []
        for c in range(0, self.num_clusters):
            self.centroids.append(
                pd.DataFrame(X_train.sample(self.n_clusters).values).apply(
                    sum, axis=0) / self.n_clusters)

        def new_centroids(X, labels):

            centroids = []
            for c in range(0, self.num_clusters):
                clustered_rows = X.iloc[[
                    x[0] for x in enumerate(labels) if x[1] == c
                ]]
                if len(clustered_rows.index) == 0:
                    print("NULL")
                    centroids.append(
                        pd.DataFrame(X.sample(self.n_clusters).values).apply(
                            sum, axis=0) / self.n_clusters)
                else:
                    centroids.append(
                        clustered_rows.apply(sum) / len(clustered_rows.index))
                    centroids[c].index = range(0, len(X.columns))
            return centroids

        iterations = 0
        old_centroids = self.centroids
        while iterations < self.max_iterations:
            clear_output(wait=True)
            print(iterations)
            old_centroids = self.centroids
            labels = self.get_labels(self.X_train, self.centroids)
            self.centroids = new_centroids(self.X_train, labels)

            converged = True
            for c in range(0, len(self.centroids)):
                if not self.centroids[c].equals(old_centroids[c]):
                    converged = False
                    break
            if converged:
                break

            iterations += 1
        self.labels = labels
        self.cluster_centers_ = self.centroids

    def predict(self, X_test):
        #         return  X_test.apply(lambda x : self.nearest_centeroid(x, self.centroids), axis= 1)
        return np.array(self.get_labels(X_test, self.centroids))

    def purity(self, labels):
        total = 0
        for c in range(0, self.n_clusters):
            la = labels.iloc[[
                x[0] for x in enumerate(self.labels) if x[1] == c
            ]]
            if len(la) != 0:
                print(f'Cluster {c} size : {len(la)}')
                print(f'Cluster {c} Most Common Label : {Counter(la).most_common(1)[0][0]}')
                print(f'Cluster {c} Most Common Label Count : {Counter(la).most_common(1)[0][1]}')
                print(Counter(la).most_common(1)[0][1] / len(la))
                total += Counter(la).most_common(1)[0][1]

        purity = total / self.X_train.shape[0]
        print()
        print(f'Purity : {purity}')
        return purity

In [210]:
%%time
kmeans = KMeansCluster(n_clusters=5, distance_formula=cosine_similarity)
kmeans.fit(clustering_data)

21
(5, 737)
Wall time: 46.1 s


In [211]:
kmeans.purity(clustering_data_labels)

Cluster 0 size : 147
Cluster 0 Most Common Label : rugby
Cluster 0 Most Common Label Count : 133
0.9047619047619048
Cluster 1 size : 121
Cluster 1 Most Common Label : tennis
Cluster 1 Most Common Label Count : 96
0.7933884297520661
Cluster 2 size : 117
Cluster 2 Most Common Label : cricket
Cluster 2 Most Common Label Count : 116
0.9914529914529915
Cluster 3 size : 81
Cluster 3 Most Common Label : athletics
Cluster 3 Most Common Label Count : 77
0.9506172839506173
Cluster 4 size : 271
Cluster 4 Most Common Label : football
Cluster 4 Most Common Label Count : 248
0.915129151291513

Purity : 0.9090909090909091


0.9090909090909091

In [209]:
# Documents in cluster
for c in range(0, kmeans.n_clusters):
    print(f'Documents in Cluster {c}')
    print(clustering_data_file_names.iloc[[x[0] for x in enumerate(kmeans.labels) if x[1] == c ]])

Documents in Cluster 0
404     football\180.txt
419     football\195.txt
571        rugby\082.txt
245     football\021.txt
99     athletics\100.txt
300     football\076.txt
440     football\216.txt
595        rugby\106.txt
421     football\197.txt
613        rugby\124.txt
258     football\034.txt
591        rugby\102.txt
535        rugby\046.txt
227     football\003.txt
350     football\126.txt
268     football\044.txt
603        rugby\114.txt
441     football\217.txt
392     football\168.txt
425     football\201.txt
462     football\238.txt
480     football\256.txt
405     football\181.txt
331     football\107.txt
452     football\228.txt
228     football\004.txt
680       tennis\044.txt
294     football\070.txt
325     football\101.txt
575        rugby\086.txt
             ...        
384     football\160.txt
615        rugby\126.txt
273     football\049.txt
342     football\118.txt
387     football\163.txt
395     football\171.txt
329     football\105.txt
451     football\227.txt
40

## Saving Trained Models:

In [168]:
# Note : KNN has no trainng step
knn_file_name = 'KNN.sav'
kmeans_file_name = 'KMeans.sav'
joblib.dump(knn, knn_file_name)
joblib.dump(kmeans, kmeans_file_name)

['KNN.sav']

['KMeans.sav']