In [None]:
from __future__ import print_function

import json
import pickle
import numpy as np 
import pandas as pd 
import seaborn as sns
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib.pyplot as plt
# from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


%matplotlib inline

# Cap how many rows pandas renders when a frame/series is displayed, so
# accidental full-frame outputs stay short.
pd.options.display.max_rows = 15

### Load Data 

In [None]:
# Read the raw training data from disk and parse it as JSON.
# NOTE(review): the path is relative to the notebook's working directory, so
# the notebook must be launched from its own folder for this to resolve.
with open('../../../../raw_data/train.json', 'rb') as f:
    dict_ = json.load(f)

# Build the working DataFrame from the parsed JSON object.
# NOTE(review): despite its name, `dict_` holds whatever the JSON root is
# (possibly a list of records) -- confirm against the data file.
df = pd.DataFrame.from_dict(dict_)

### Data Preprocessing 

In [None]:
# Collapse each recipe's iterable of ingredient names into one comma-separated
# string. Every name is trimmed and lower-cased, and its inner spaces become
# underscores so a multi-word ingredient stays a single token.
# The trim must happen BEFORE the space->underscore replacement: doing it
# afterwards (and only on the joined string) turns padding spaces into
# leading/trailing underscores instead of removing them.
# NOTE: not idempotent -- re-running this cell on the already-joined strings
# would iterate over characters. Re-run the load cell first.
df['ingredients'] = [
    ','.join(name.strip().lower().replace(' ', '_') for name in recipe)
    for recipe in df['ingredients']
]

In [None]:
# Canonical ingredient keys used by standardize_ingredient().
# ORDER MATTERS: each raw ingredient is mapped to the FIRST key it contains.
# NOTE(review): a key that itself contains an earlier key can never be
# selected (e.g. 'coconut_milk' contains 'milk', 'sour_cream' contains
# 'cream'); 'parmesan' is also listed twice. Contents and order are left
# untouched here so existing results are not silently re-ordered.
# NOTE(review): 'thym' is presumably a deliberate stem for thyme -- confirm.
key_ing = [
    'flour', 'egg', 'salt', 'oil', 'water', 'cilantro', 'garlic', 'milk',
    'pepper', 'tomato', 'corn', 'onion', 'butter', 'sugar', 'almond',
    'broth', 'chicken', 'beef', 'pork', 'sausage', 'rice', 'peanut',
    'cream', 'yeast', 'olives', 'lettuce', 'carrot', 'shrimp', 'walnut',
    'lemon', 'orange', 'ginger', 'allspice', 'turkey', 'cinnamon',
    'mint', 'parsley', 'thym', 'spinach', 'chive', 'dill', 'basil',
    'tarragon', 'coriander', 'parmesan', 'rosemary', 'lime', 'pecan',
    'peas', 'apple', 'vinegar', 'celery', 'cumin', 'turmeric', 'lamb',
    'cardamom', 'oregano', 'chili', 'cabbage', 'soy_sauce', 'mustard',
    'coconut_milk', 'raisins', 'nutmeg', 'bread', 'apricot', 'syrup',
    'cheddar', 'mozzarella', 'parmesan', 'romano', 'ricotta', 'jack',
    'squash', 'paprika', 'chocolate', 'potato', 'cocoa', 'sour_cream',
    'catfish', 'salmon', 'yogurt', 'sesame_seeds', 'vanilla',
    'feta_cheese',
]


def standardize_ingredient(row):
    """Map every ingredient in a recipe to its canonical key, if it has one.

    Parameters
    ----------
    row : str
        Comma-separated ingredient names for one recipe
        (e.g. ``'olive_oil,sea_salt'``).

    Returns
    -------
    list of str
        A one-element list holding the standardized ingredients joined by
        single spaces (e.g. ``['oil salt']``). An ingredient containing none
        of the keys in ``key_ing`` is kept as-is.

    Notes
    -----
    Each ingredient is standardized independently of the others. Replacing
    text across the whole recipe string instead would let one ingredient's
    rewrite corrupt a longer ingredient that contains it (``'red_pepper'``
    inside ``'crushed_red_pepper'``), leaving the longer one unstandardized.
    """
    standardized = []
    for ingredient in row.split(','):
        for key in key_ing:
            if key in ingredient:
                # First matching key wins; later keys are not considered.
                ingredient = key
                break
        standardized.append(ingredient)
    return [' '.join(standardized)]

# Standardize every recipe. After this cell each entry of the column is a
# one-element list holding a space-separated string, so this cell cannot be
# re-run on its own output (lists have no .split).
df['ingredients'] = df['ingredients'].apply(standardize_ingredient)

In [None]:
# Ingredients treated as too common across cuisines to be informative.
top_10_ingredients = ['flour', 'egg', 'salt', 'oil', 'water',
                      'onion', 'pepper', 'sugar', 'tomato', 'garlic']


def remove_common_ingredient(row):
    """Drop the ingredients listed in ``top_10_ingredients`` from one recipe.

    Parameters
    ----------
    row : list of str, or str
        Either the one-element list produced by ``standardize_ingredient``
        (space-separated ingredients in ``row[0]``) or that space-separated
        string itself. Accepting the plain string keeps the function safe to
        apply again to its own output.

    Returns
    -------
    str
        The remaining ingredients joined by single spaces (``''`` when
        nothing is left).

    Notes
    -----
    Only whole tokens are removed. Deleting by substring would also eat the
    common word out of longer tokens (``'crushed_pepper'`` -> ``'crushed_'``).
    The input is not modified.
    """
    text = row[0] if isinstance(row, (list, tuple)) else row
    kept = [token for token in text.split()
            if token not in top_10_ingredients]
    return ' '.join(kept)

# Strip the most common ingredients from every recipe. After this cell the
# column holds plain space-separated strings rather than one-element lists.
df['ingredients'] = df['ingredients'].apply(remove_common_ingredient)

### Word Vectorization 

In [2]:
# Features are the cleaned ingredient strings; the target is the cuisine label.
X, y = df['ingredients'], df['cuisine']

# Hold out 30% of the recipes for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=7777, test_size=.30)

In [None]:
# Bag-of-ingredients encoding with presence/absence (0/1) features.
# binary=True makes the vectorizer itself emit 0/1, so the SAME encoding is
# applied to train and test. Clipping only the training counts afterwards
# would leave X_test with raw counts and the two sets on different scales.
# min_df=50 drops tokens that occur in fewer than 50 training recipes.
# NOTE: this cell overwrites X_train / X_test; re-run the split cell before
# re-running it.
vec = CountVectorizer(min_df=50, analyzer='word', binary=True)
count = vec.fit_transform(X_train).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when available and fall back on older versions. Kept as a plain list.
if hasattr(vec, 'get_feature_names_out'):
    igt_names = list(vec.get_feature_names_out())
else:
    igt_names = list(vec.get_feature_names())

# Test recipes are encoded with the vocabulary learned on the training set.
X_test = vec.transform(X_test)

# `count` is already binary and already shaped (n_recipes, n_features);
# the aliases below keep the names this cell has always defined.
counts = count
counts_reshaped = counts

X_train = pd.DataFrame(counts_reshaped)
X_train.columns = igt_names