## Imports and Declerations:

In [2]:
import pickle
import os
import string
import scipy
import pandas as pd
from pandas import HDFStore 
import re
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output
import numpy as np
from copy import deepcopy
import random
from nltk.stem import PorterStemmer
from collections import Counter
import joblib
InteractiveShell.ast_node_interactivity = "all"


## Helper Functions:

In [3]:

# Remove Punctuation
def remove_punctuation(word):
    return word.translate(word.maketrans('', '', string.punctuation))
printable = set(string.printable)

# Clean Query Term
def clean_word(word):
    # Case Folding
    ps = PorterStemmer()
    word = word.lower()
    # Filter non-ASCII characters
    word = ''.join(filter(lambda x: x in printable, word))
    #     print(word)
    # Remove Punctuations
    if word != '(' and word != ')':
        word = remove_punctuation(word)
#     print(word)
    if re.match('\d+[A-Za-z]+', word):
        word = re.split('\d+', word)[1]
    if re.match('[A-Za-z]+\d+', word):
        word = re.split('\d+', word)[0]


#     print(word)
    word = ps.stem(word)
    #     print(word)
    return word

In [4]:
DOCUMENTS_PATH = ('data', )
STOPWORD_PATH = ('Stopword-List.txt', )

In [5]:
class JSONDocToVec(object):
    def __init__(self, DOCUMENTS_PATH, STOP_WORD_PATH):
        self.ingredients = set()
        self.doc_index = {}
        self.documents_path = DOCUMENTS_PATH
        self.stop_word_path = STOP_WORD_PATH
        self.stop_words = self.load_stop_words()
        self.Xindex = []
        self.vocab_index = self.file_extraction_wrapper(extract_vocab=True)
        vectors = self.file_extraction_wrapper(extract_vectors=True)
        self.files = {}
        
#         self.X = self.vectors[0]
        self.y = vectors[1]
        
        data = pd.DataFrame(vectors[0])
        # # Feature Selection
        # Drop Features with Df < 3
        data.drop([
            col for col, val in data.sum().iteritems() if int(val) <= 3
        ],axis=1,inplace=True)
        data.mul(data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1))),
         axis=1)
        self.data = scipy.sparse.csr_matrix(data.values)
        # Tf - Idf Calculation
        self.idf = data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1)))

    def file_extraction_wrapper(self,
                                extract_vocab=False,
                                extract_vectors=False):
        vocab = set()
        docs = {}
        printable = set(string.printable)
        raw_data = []
        if extract_vectors:
            X = []
            y = []
            Xindex = []
        doc_count = 0
        # Printable characters are
        # 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
        # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c
        ps = PorterStemmer()
        json_files = next(os.walk(os.path.join(self.documents_path)))[2]
        print('dir : ',  list(os.walk(os.path.join(self.documents_path))))
        print(json_files)
        for jfile in json_files:
#             docs_in_c = next(os.walk(os.path.join(self.documents_path, c)))[2]
            if jfile.startswith('test'):
                continue
            print(jfile)
            print(self.documents_path)
            print('filepath : ',(os.path.join(self.documents_path,jfile)))
            
            with open(os.path.join(self.documents_path, jfile),
                          'r') as file1:
                rows = json.load(file1)
                
                for doc in rows:
                    
                    if extract_vectors:
                        doc_vector = np.zeros((len(self.vocab_index)))
#                         doc_name = os.path.join(self.documents_path, c, doc)
                        self.doc_index[doc_count] =doc['id']
                        doc_count+=1
                 
#                         symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
#                         for i in symbols:
#                             line = line.replace(i, ' ')
                    for word in doc['ingredients']:

                        # Case Folding
                        word = word.lower()

                        # Filter non-ASCII characters
                        word = ''.join(
                            filter(lambda x: x in printable, word))

                        if word in self.stop_words:
                            continue

                        # Remove Punctuations
                        word = remove_punctuation(word)

                        if re.match('\d+[A-Za-z]+', word):
                            word = re.split('\d+', word)[1]
                        if re.match('[A-Za-z]+\d+', word):
                            word = re.split('\d+', word)[0]

                        if len(word) == 0 or len(
                                word) == 1 or word == '' or word == ' ':
                            continue
                        if extract_vocab:
                            self.ingredients.add(word)
                        word = ps.stem(word)
#                         print(word)
                        if extract_vocab:
                            
                            vocab.add(word)
                        if extract_vectors:
                            doc_vector[self.vocab_index[word]] += 1

                    if extract_vectors:
                        Xindex.append(doc['id'])
                        X.append(doc_vector)
                        if jfile.startswith('test'):
                            y.append(None)
                        else:
                            y.append(doc['cuisine'])
        if extract_vocab:
            print(f'Vocab Size : {len(vocab)}')
            vocab_list = sorted(list(vocab))
            vocab_hash = dict.fromkeys(vocab_list, 0)
            vocab_index = {
                word: index
                for index, word in enumerate(vocab_list)
            }
            return vocab_index

        if extract_vectors:
            self.Xindex = Xindex
            return (X, y)

    def get_query_vector(self, query_terms):
        ps = PorterStemmer()
        pd_data = pd.DataFrame(self.data.toarray())
        query_vector =pd.Series(pd_data.T[0])
#         print(query_vector.index)
        query_terms = [ps.stem(word.lower()) for word in query_terms]
        for term in query_terms:
            if term in self.vocab_index.keys():
                if self.vocab_index[term] in self.idf.index:
#                     print(term)
                    query_vector.loc[ self.idf.index.get_loc(self.vocab_index[term])] += 1
#         print(query_vector.col[query_vector > 0])
        for index in query_vector.index[query_vector > 0]:
            query_vector.iloc[index] *= self.idf.iloc[index] 
        return query_vector
        
    def load_stop_words(self):
        stop_words = set()
        with open(self.stop_word_path, 'r') as stop_word_file:
            lines = stop_word_file.readlines()
            for line in lines:
                stop_words.add(line.split('\n')[0])
        return stop_words

## Data Pre-Processing:

In [46]:

dv = JSONDocToVec(DOCUMENTS_PATH=os.path.join(*DOCUMENTS_PATH),
              STOP_WORD_PATH=os.path.join(*STOPWORD_PATH))


dir :  [('data', [], ['test.json', 'train.json'])]
['test.json', 'train.json']
train.json
data
filepath :  data\train.json
Vocab Size : 6686
dir :  [('data', [], ['test.json', 'train.json'])]
['test.json', 'train.json']
train.json
data
filepath :  data\train.json


In [5]:
loaded_data = pd.DataFrame(scipy.sparse.load_npz('sparse_matrix.npz').toarray())

In [7]:
dv.data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3665,3666,3667,3668,3669,3670,3671,3672,3673,3674
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
dv.ingredients

{'sweet yellow corn',
 'seedless red grapes',
 'chinese red rice vinegar',
 'vanilla wafers',
 'baby kale',
 'parmigiano',
 'cooked brown rice',
 'fresh onion',
 'vietnamese coriander',
 'fresh lemon',
 'liverwurst',
 'rapeseed oil',
 'idaho potatoes',
 'pancake',
 'chocolate chip cookie dough ice cream',
 'chocolatehazelnut spread',
 'softshelled crabs',
 'chocolate leaves',
 'cooked bone in ham',
 'chocolate extract',
 'rose water',
 'cut up chicken',
 'ground caraway',
 'biscuit mix',
 'spot prawns',
 'tuaca liqueur',
 'morcilla',
 'dried apple',
 'bulb',
 'prepared pie crusts',
 'barilla ovenready lasagne',
 'cold meatloaf',
 'pure maple syrup',
 'fresh mozzarella balls',
 'instant banana cream pudding',
 'crab sticks',
 'diced green chilies',
 'jack',
 'crystal farms reduced fat shredded marble jack cheese',
 'veggies',
 'reduced sodium vegetable stock',
 'skinless chicken breasts',
 'cooki vanilla wafer',
 'vegetarian oyster sauce',
 'halibut fillets',
 'mixed greens',
 'fully co

## Saving VectorSpace:

In [51]:
# Save Vectors
vectors_file_name = 'DV'
pickle.dump(dv, open(vectors_file_name , 'wb'))

dv = pickle.load(dv, open(vectors_file_name , 'rb'))


In [14]:
vectors_file_name = 'DV'
dv = pickle.load(open(vectors_file_name , 'rb'))

In [15]:
dv.y

['greek',
 'southern_us',
 'filipino',
 'indian',
 'indian',
 'jamaican',
 'spanish',
 'italian',
 'mexican',
 'italian',
 'italian',
 'chinese',
 'italian',
 'mexican',
 'italian',
 'indian',
 'british',
 'italian',
 'thai',
 'vietnamese',
 'thai',
 'mexican',
 'southern_us',
 'chinese',
 'italian',
 'chinese',
 'cajun_creole',
 'italian',
 'chinese',
 'mexican',
 'italian',
 'cajun_creole',
 'mexican',
 'thai',
 'italian',
 'cajun_creole',
 'italian',
 'filipino',
 'southern_us',
 'southern_us',
 'italian',
 'brazilian',
 'mexican',
 'indian',
 'mexican',
 'chinese',
 'french',
 'southern_us',
 'southern_us',
 'southern_us',
 'japanese',
 'southern_us',
 'italian',
 'southern_us',
 'italian',
 'jamaican',
 'japanese',
 'indian',
 'italian',
 'irish',
 'thai',
 'thai',
 'indian',
 'jamaican',
 'italian',
 'thai',
 'korean',
 'french',
 'french',
 'southern_us',
 'spanish',
 'indian',
 'moroccan',
 'italian',
 'italian',
 'moroccan',
 'moroccan',
 'vietnamese',
 'japanese',
 'mexican',

## Train Test Split:

In [16]:

# data = dv.data.copy()
# data = pd.DataFrame(X_self.toarray())
data = pd.DataFrame(dv.data.toarray())
data['label'] = dv.y
shuffled_data = data

shuffled_data.sample(frac=1)
train_size = 0.8
test_size = 0.2

train_data, test_data = shuffled_data.sample(frac=train_size), shuffled_data.sample(frac=test_size)
X_train, y_train = train_data.loc[:, train_data.columns != 'label'], train_data['label']
X_test, y_test = test_data.loc[:, test_data.columns != 'label'], test_data['label']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3666,3667,3668,3669,3670,3671,3672,3673,3674,label
31091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mexican
12692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chinese
29424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,italian
33413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,italian
23420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,thai
26901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
38957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,thai
14468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,french
10062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,brazilian
27262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,japanese


## Model Training:

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [18]:

clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [19]:
print("Random Forest Accuracy")
print(accuracy_score(y_test,pred))

y_actu = pd.Categorical(y_test,categories= list(set(dv.y)))
y_pred = pd.Categorical(pred, categories= list(set(dv.y)))

df_confusion = pd.crosstab(y_actu, y_pred, margins=True, rownames=['actual'], colnames=['predicted'])
df_confusion

Random Forest Accuracy
0.9301068510370836


predicted,irish,italian,chinese,thai,russian,british,southern_us,jamaican,korean,vietnamese,...,french,japanese,greek,spanish,cajun_creole,brazilian,mexican,indian,moroccan,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
irish,116,3,0,0,0,3,2,0,0,0,...,4,0,0,0,0,1,2,1,0,133
italian,1,1540,1,0,0,0,5,0,0,0,...,26,0,4,2,3,0,5,3,0,1590
chinese,0,0,499,2,0,0,1,0,1,1,...,1,7,1,0,0,0,5,1,0,523
thai,0,0,9,284,0,1,0,0,2,6,...,0,0,0,0,0,0,1,3,0,306
russian,2,4,1,0,79,5,2,0,0,0,...,5,0,1,0,1,0,4,0,0,104
british,1,4,0,0,0,145,6,0,0,0,...,5,0,0,0,0,0,0,0,0,162
southern_us,3,19,1,0,0,6,768,0,0,0,...,14,0,0,1,7,2,5,4,1,833
jamaican,1,1,1,0,0,1,1,87,0,0,...,0,0,0,0,1,0,2,5,0,101
korean,1,2,4,0,0,0,1,0,151,0,...,0,6,1,0,0,0,1,0,0,168
vietnamese,0,1,4,8,0,0,0,0,1,135,...,1,1,0,0,0,0,4,0,1,159


In [13]:

qury = dv.get_query_vector([ "green chile",
        "jalapeno chilies",
        "onions",
        "ground black pepper",
        "salt",
        "chopped cilantro fresh",
        "green bell pepper",
        "garlic",
        "white sugar",
        "roma tomatoes",
        "celery",
        "dried oregano"])

clf.predict([qury])[0]

'mexican'

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

LR = LogisticRegression()

LR.fit(X_train,y_train)
pred = LR.predict(X_test)

print("\nLogistic Regression")
print(accuracy_score(y_test,pred))



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


Logistic Regression
0.8451288497800126


In [15]:
y_actu = pd.Categorical(y_test,categories= list(set(dv.y)))
y_pred = pd.Categorical(pred, categories= list(set(dv.y)))

df_confusion = pd.crosstab(y_actu, y_pred, margins=True, rownames=['actual'], colnames=['predicted'])
df_confusion

predicted,italian,vietnamese,russian,french,chinese,cajun_creole,filipino,mexican,british,greek,...,southern_us,indian,jamaican,korean,spanish,thai,moroccan,japanese,irish,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
italian,1487,0,3,42,0,3,0,7,2,9,...,28,2,1,0,4,0,1,0,2,1592
vietnamese,2,127,0,0,17,0,3,3,0,0,...,1,2,0,4,0,24,0,0,0,184
russian,12,0,71,11,0,0,0,2,1,1,...,7,0,1,0,0,0,0,0,1,107
french,94,0,2,395,2,1,1,5,4,4,...,22,2,0,0,2,1,0,1,1,537
chinese,4,5,1,3,488,1,4,4,0,0,...,7,1,0,4,0,12,0,1,0,535
cajun_creole,24,0,1,7,0,263,0,5,0,0,...,36,0,0,0,1,0,0,0,0,337
filipino,4,1,0,3,13,1,104,5,0,0,...,6,3,0,1,1,3,0,0,0,147
mexican,27,0,0,6,2,3,3,1206,0,1,...,19,3,1,0,3,0,0,0,0,1275
british,16,0,0,19,0,1,0,6,95,1,...,26,3,0,0,1,0,0,0,3,171
greek,34,0,2,8,0,0,0,2,0,172,...,4,2,0,0,2,0,0,0,0,226


In [None]:

svm = SVC()
svm.fit(X_train,y_train)
pred = svm.predict(X_test)
print("Support Vector Machine")
print(accuracy_score(y_test,pred))


In [22]:
pred = []
with open('data/test.json',
                          'r') as file1:
                rows = json.load(file1)
                
                for doc in rows:
                    print(doc['ingredients'])
                    qury = dv.get_query_vector(doc['ingredients'])
                    cuisine = clf.predict([qury])[0]
                    print(cuisine)
                    pred.append({'id':doc['id'], 'predictions':cuisine})
test_pred = pd.DataFrame(pred, columns={'id', 'predictions'})

KeyboardInterrupt: 