##### This version involves further cleaning of data, which it to be used for the 2nd round of model training and testing (1st round is complete and can be found in the 'Code' folder named "Project_3_McNulty_Modeling_V1").

### List of Imports

In [620]:
from __future__ import print_function

In [621]:
import json
import pickle
import numpy as np 
import pandas as pd 
import seaborn as sns
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib.pyplot as plt
# from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


%matplotlib inline

In [622]:
pd.options.display.max_rows = 20

In [623]:
with open('../../../../raw_data/train.json', 'rb') as f:
    dict_ = json.load(f)

In [624]:
df = pd.DataFrame.from_dict(dict_)

In [625]:
df.head(2)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."


In [627]:
# get rid of white spaces;
# replace space in between a phrase by "_";
# change all to lower-case letters 
df.ingredients = [','.join(i).replace(' ','_').lower()
                  .strip() for i in df.ingredients]  

In [628]:
df.head(5)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"romaine_lettuce,black_olives,grape_tomatoes,ga..."
1,southern_us,25693,"plain_flour,ground_pepper,salt,tomatoes,ground..."
2,filipino,20130,"eggs,pepper,salt,mayonaise,cooking_oil,green_c..."
3,indian,22213,"water,vegetable_oil,wheat,salt"
4,indian,13162,"black_pepper,shallots,cornflour,cayenne_pepper..."


In [629]:
# an example on a greek cuisine 
df.ingredients[0]

'romaine_lettuce,black_olives,grape_tomatoes,garlic,pepper,purple_onion,seasoning,garbanzo_beans,feta_cheese_crumbles'

In [630]:
key_ing = ['flour','egg','salt','oil','water','cilantro','garlic','milk'
           ,'pepper','tomato','corn','onion','butter','sugar','almond'
           ,'broth','chicken','beef','pork','sausage','rice','peanut'
           ,'cream','yeast','olives','lettuce','carrot','shrimp','walnut'
           ,'lemon','orange','ginger','allspice','turkey','cinnamon'
           ,'mint','parsley','thym','spinach','chive','dill','basil'
           ,'tarragon','coriander','parmesan','rosemary','lime','pecan'
           ,'peas','apple','vinegar','celery','cumin','turmeric','lamb'
           ,'cardamom','oregano','chili','cabbage','soy_sauce','mustard'
           ,'coconut_milk','raisins','nutmeg','bread','apricot','syrup'
           ,'cheddar','mozzarella','parmesan','romano','ricotta','jack'
           ,'squash','paprika','chocolate','potato','cocoa','sour_cream'
           ,'catfish','salmon','yogurt','sesame_seeds','vanilla'
           ,'feta_cheese']

def standardize_ingredient(row):
    """ This function returns a list of ingredients 
    with standardized names. """
    for i in row.split(','):
        for j in key_ing:
            if j in i:
                row = row.replace(i,j)
            else: 
                pass
    row = [row.replace(',',' ')]
    return row

In [631]:
df['ingredients'] = df['ingredients'].apply(standardize_ingredient)

In [632]:
d = df['ingredients'][0][0]

In [633]:
top_10_ingredients = ['flour','egg','salt','oil','water'
                      ,'onion','pepper','sugar','tomato','garlic']
for i in d.split(' '):
    if i in top_10_ingredients:
        print(i)
        d = d.replace(i,'')
        d = ' '.join(d.split())
        print(d)

tomato
lettuce olives garlic pepper onion seasoning garbanzo_beans feta_cheese
garlic
lettuce olives pepper onion seasoning garbanzo_beans feta_cheese
pepper
lettuce olives onion seasoning garbanzo_beans feta_cheese
onion
lettuce olives seasoning garbanzo_beans feta_cheese


In [634]:
top_10_ingredients = ['flour','egg','salt','oil','water'
                      ,'onion','pepper','sugar','tomato','garlic']
def remove_common_ingredient(row):
    """ This function returns a list of ingredients 
    after removing the top 10 common ingredients across cuisines. """
    for i in row[0].split(' '):
        if i in top_10_ingredients:
            row[0] = row[0].replace(i,'')
            row[0] = ' '.join(row[0].split())
        else: 
            pass
    return row[0]

In [635]:
df['ingredients'] = df['ingredients'].apply(remove_common_ingredient)

In [636]:
df['ingredients'][0] 

'lettuce olives seasoning garbanzo_beans feta_cheese'

In [637]:
X = df['ingredients']
y = df['cuisine']

In [638]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.30, random_state=7777)

In [639]:
print('The dimensinality of training features: ', X_train.shape)
print('The dimensinality of training target: ', y_train.shape)
print('The dimensinality of testing features: ', X_test.shape)
print('The dimensinality of testing target: ', y_test.shape)

The dimensinality of training features:  (27841,)
The dimensinality of training target:  (27841,)
The dimensinality of testing features:  (11933,)
The dimensinality of testing target:  (11933,)


### Build volcabulary for ingredients 

#### There are nearly 90k ingredients (with duplicates) out of 39774 cuisines in the dataset, so when building the vocabulary for ingredients, I wanted to set a reasonable threshold to get rid of terms that have a low document frequency. Given the size of the data and ingredients count, I decided to set the initial threshold at 50, meaning only ingredient terms appear more than 50 times across the dataset will be kept. 

In [640]:
vec = CountVectorizer(min_df=50, analyzer = 'word')
count = vec.fit_transform(X_train).toarray()
igt_names = vec.get_feature_names()

##### Pickle countvectorizer for later use 

In [641]:
# with open('../../Data/countvectorizer.pickle', 'wb') as handle: 
#     pickle.dump(vec, handle, pickle.HIGHEST_PROTOCOL)

In [642]:
X_test = vec.transform(X_test)

In [643]:
# igt_frequency = pd.Series(count.sum(axis=0),index = igt_names
#                          ).sort_values(ascending=False)
# ax = igt_frequency[:10
#               ].plot(kind='bar'
#                      ,color='maroon'
#                      , figsize=(10,6)
#                      , width=.8
#                      , fontsize=14
#                      , rot=45
#                      ,title='10 Most Common Ingredients Across Cuisine')
# ax.title.set_size(18)
# plt.xlabel('Ingredients',fontsize=15)
# plt.ylabel('Counts', fontsize=15)

# plt.savefig('../../Visuals/mostcommonigt.png', dpi=200, bbox_inches = 'tight')

In [644]:
counts = np.clip(count, 0, 1, out=count)
counts_reshaped = counts.reshape(len(X_train),len(igt_names))
X_train = pd.DataFrame(counts_reshaped)
X_train.columns = igt_names

In [645]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27841 entries, 0 to 27840
Columns: 345 entries, _drain to zucchini
dtypes: int64(345)
memory usage: 73.3 MB


In [646]:
X_train.head(2)

Unnamed: 0,_drain,_half,_soften,adobo_sauce,agave_nectar,allspice,almond,amchur,ancho_powder,anchovy_fillets,...,white_beans,white_wine,whole_cloves,whole_wheat_tortillas,wonton_wrappers,worcestershire_sauce,yeast,yoghurt,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [647]:
X_train.columns

Index(['_drain', '_half', '_soften', 'adobo_sauce', 'agave_nectar', 'allspice',
       'almond', 'amchur', 'ancho_powder', 'anchovy_fillets',
       ...
       'white_beans', 'white_wine', 'whole_cloves', 'whole_wheat_tortillas',
       'wonton_wrappers', 'worcestershire_sauce', 'yeast', 'yoghurt', 'yogurt',
       'zucchini'],
      dtype='object', length=345)

##### Pickle the cleaned dataframe & a small subset as a sample 

In [648]:
df2_cleaned_sample = X_train[0:50]

In [649]:
df2_cleaned_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Columns: 345 entries, _drain to zucchini
dtypes: int64(345)
memory usage: 134.8 KB


In [650]:
# with open('../../Data/train_standardized.pickle', 'wb') as handle: 
#     pickle.dump(X_train, handle, pickle.HIGHEST_PROTOCOL)

In [651]:
# with open('../../Data/test_transformed.pickle', 'wb') as handle: 
#     pickle.dump(X_test, handle, pickle.HIGHEST_PROTOCOL)

In [652]:
# with open('../../Data/train_target.pickle', 'wb') as handle: 
#     pickle.dump(y_train, handle, pickle.HIGHEST_PROTOCOL)

In [653]:
# with open('../../Data/test_target.pickle', 'wb') as handle: 
#     pickle.dump(y_test, handle, pickle.HIGHEST_PROTOCOL)

In [654]:
# with open('../../Data/standardized_sample.pickle', 'wb') as handle: 
#     pickle.dump(df2_cleaned_sample, handle, pickle.HIGHEST_PROTOCOL)