# What's Cooking:

In [2]:
import os
import json # for preprocessing
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for Training Model
from sklearn.ensemble import ExtraTreesClassifier # For training model
from sklearn.metrics import accuracy_score
import joblib # for saving trained model


## Loading Dataset:

In [29]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
BASE_DIR = os.path.abspath('')

# Input files are read from <BASE_DIR>/data/.
train_file = f'{BASE_DIR}/data/train.json'
test_file = f'{BASE_DIR}/data/test.json'

# load data files into Python
train = pd.read_json(train_file)
test = pd.read_json(test_file)

# Anything that is not an ASCII letter is replaced by a space before lemmatizing.
_NON_LETTERS = re.compile('[^A-Za-z]')


def _ingredients_to_text(ingredient_lists, lemmatizer):
    """Turn each recipe's ingredient list into one space-separated string.

    Each ingredient has its non-letter characters replaced by spaces and is
    passed through ``lemmatizer.lemmatize``; the ingredients of one recipe are
    then joined with single spaces and the result is stripped.

    Parameters
    ----------
    ingredient_lists : iterable of iterables of str
        One inner iterable of ingredient names per recipe.
    lemmatizer : object with a ``lemmatize(str)`` method
        Shared instance, so it is not rebuilt for every ingredient.

    Returns
    -------
    list of str
        One document string per recipe, in input order.
    """
    # NOTE(review): lemmatize() receives the whole ingredient phrase rather
    # than individual words -- confirm this is the intended granularity.
    return [
        ' '.join(lemmatizer.lemmatize(_NON_LETTERS.sub(' ', ingredient))
                 for ingredient in ingredients).strip()
        for ingredients in ingredient_lists
    ]


# clean data: comma-separated rendering of each ingredient list
train['ingredients_clean_string'] = [' , '.join(z).strip() for z in train['ingredients']]
test['ingredients_clean_string'] = [' , '.join(z).strip() for z in test['ingredients']]

# further clean data and extract information through word lemmatization
lemmatizer = WordNetLemmatizer()
train['ingredients_string'] = _ingredients_to_text(train['ingredients'], lemmatizer)
test['ingredients_string'] = _ingredients_to_text(test['ingredients'], lemmatizer)

# create corpus based on newly processed data
train_corpus = train['ingredients_string']
test_corpus = test['ingredients_string']

# convert a collection of raw documents to a matrix of TF-IDF features
train_vectorizer = TfidfVectorizer(stop_words='english',
                                   ngram_range=(1, 1), analyzer="word",
                                   max_df=.57, binary=False,
                                   token_pattern=r'\w+', sublinear_tf=False)

# Fit the vocabulary on the training corpus only and reuse it for the test
# corpus. Both matrices stay sparse: densifying the training matrix made the
# two inputs inconsistent and produced a large np.matrix.
train_tfidf = train_vectorizer.fit_transform(train_corpus)
test_tfidf = train_vectorizer.transform(test_corpus)

# prepare data for prediction
train_predictor = train_tfidf
test_predictor = test_tfidf

train_target = train['cuisine']

# Logistic regression, with C chosen by an exhaustive grid search
model = LogisticRegression()
parameters = {'C': [1, 10]}
classifier = GridSearchCV(model, parameters)

# fit classification model to data
classifier = classifier.fit(train_predictor, train_target)

# make prediction and assign predicted values to cuisine in TEST set
prediction = classifier.predict(test_predictor)
test['cuisine'] = prediction

# Accuracy on the training rows themselves (accuracy_score is imported in the
# notebook's first cell). The model was fitted on these rows, so this is not
# a held-out estimate.
train_prediction = classifier.predict(train_predictor)
print(accuracy_score(train['cuisine'], train_prediction))

# write csv file (no index for submission)
test[['id', 'cuisine']].to_csv("LogisticRegression.csv", index=False)



0.8565394478805249


In [23]:
BASE_DIR = os.path.abspath('')

train_file = f'{BASE_DIR}/data/train.json'
# The handle gets its own name so that `train_file` keeps holding the path
# (it used to be rebound to the closed file object). The encoding is explicit
# so the read does not depend on the platform default.
# NOTE(review): assumes the JSON file is UTF-8 encoded -- confirm.
with open(train_file, encoding='utf-8') as train_fh:
    dict_train = json.load(train_fh)

# Distinct ingredient names across all loaded records.
ingredients = {ing for row in dict_train for ing in row['ingredients']}
print(len(ingredients))

# Convert the loaded JSON records to a DataFrame and preview the first rows.
# NOTE: this rebinds `train`, replacing any frame of that name built earlier.
train = pd.DataFrame.from_dict(dict_train)
train.head(10)

6714


Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."
5,jamaican,6602,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,spanish,42779,"[olive oil, salt, medium shrimp, pepper, garli..."
7,italian,3735,"[sugar, pistachio nuts, white almond bark, flo..."
8,mexican,16903,"[olive oil, purple onion, fresh pineapple, por..."
9,italian,12734,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [9]:
# Every column except the 'income' label is used as a model input.
# NOTE(review): `df` is not assigned in any cell visible in this notebook --
# confirm where it is loaded before relying on Restart & Run All.
is_feature = df.columns != 'income'
x_cols = list(df.columns[is_feature])

# Input matrix (features) and target column (label)
X = df[x_cols]
y = df['income']

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=420,
)

## Data Preprocessing:

In [11]:
# RandomForest in the sklearn package can't handle missing values, so fill them first.
column_modes = X_train.mode().iloc[0]  # first row of mode(): one value per column
train_mode = dict(column_modes)

# Replace NaNs in each column with that column's mode
X_train = X_train.fillna(value=train_mode)
train_mode

{'age': 36,
 'workclass': 'Private',
 'fnlwgt': 123011,
 'education': 'HS-grad',
 'education-num': 9,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Craft-repair',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'United-States'}

In [12]:
# Replace the string-valued columns (e.g. workclass: 'Private') with integer codes,
# keeping one fitted encoder per column in `encoders`.
# NOTE: the columns of X_train are overwritten in place, so re-running this
# cell would fit the encoders on already-encoded values.
categorical_columns = (
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
)

encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()  # encodes labels as values between 0 and n_classes-1
    encoded_values = label_encoder.fit_transform(X_train[col])
    X_train[col] = encoded_values
    encoders[col] = label_encoder

## Algorithm Training:

In [13]:
%%time
# train the random forest model
rf = RandomForestClassifier(n_estimators=1000) # n_estimators is number of trees
rf = rf.fit(X_train, y_train)

Wall time: 27.5 s


In [14]:
%%time
# train the extra tree model
et = ExtraTreesClassifier(n_estimators=1000)
et = et.fit(X_train, y_train)

Wall time: 24.5 s


In [15]:
%%time
# Save Preprocessing and trained model artifacts
joblib.dump(train_mode, './train_mode.joblib', compress=True)
joblib.dump(encoders, './encoders.joblib', compress=True)
joblib.dump(rf, './random_forest.joblib', compress=True)
joblib.dump(et, './extra_trees.joblib', compress=True)


Wall time: 26 s


['./extra_trees.joblib']

## Prediction:

In [18]:
print(X_test.shape)
print(y_test.shape)

# Fill missing values with the modes computed from the training split
# (`train_mode`, the same mapping that is saved next to the models), not with
# statistics taken from the evaluation rows. This way the held-out data goes
# through exactly the preprocessing that was fitted on the training data.
X_test = X_test.fillna(train_mode)

# Apply the encoders fitted on the training columns.
# NOTE: X_test is overwritten in place, so this cell is not safe to re-run
# without re-creating the split. encoder.transform will also fail on a
# category that did not occur in the training column.
for column, encoder in encoders.items():
    X_test[column] = encoder.transform(X_test[column])

pred = rf.predict(X_test)

(6513, 14)
(6513,)


In [21]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.8550591125441425