In [1]:
import re
import json
import nltk
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [3]:
# Load the training records (each one is later read for its 'id',
# 'ingredients' and 'cuisine' keys).
# The encoding is pinned to UTF-8 so that non-ASCII text in the file decodes
# the same way on every platform instead of depending on the locale's
# default codec.
with open('data/train.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Last expression of the cell: shows how many records were loaded.
len(json_data)

39774

In [4]:
# Strip the 'id' field from every training record (mutates json_data in place).
# A default is passed to pop() so that re-running this cell, when the ids are
# already gone, is a no-op instead of raising KeyError.
for recipe in json_data:
    recipe.pop('id', None)

# Per-recipe ingredient lists, in the same order as json_data.
json_ingredients = [recipe['ingredients'] for recipe in json_data]

In [5]:
# Ingredient normalisation, one stage per variable:
#   1. lowercase every ingredient string
#   2. delete punctuation and underscores (whitespace and word characters stay)
#   3. run the English Snowball stemmer once per ingredient string
#   4. swap the spaces inside an ingredient for dashes, so that ingredients
#      stay distinguishable once a recipe is flattened into one string
#   5. join each recipe's ingredients into one space-separated document
lowercase = [
    [ingredient.lower() for ingredient in recipe]
    for recipe in json_ingredients
]
punctuation_lowercase = [
    [re.sub(r'([^\s\w]|_)+', '', ingredient) for ingredient in recipe]
    for recipe in lowercase
]
sno = nltk.stem.SnowballStemmer('english')
punctuation_lowercase_stemmed = [
    list(map(sno.stem, recipe)) for recipe in punctuation_lowercase
]
punctuation_lowercase_stemmed_dash = [
    ['-'.join(ingredient.split(' ')) for ingredient in recipe]
    for recipe in punctuation_lowercase_stemmed
]
json_ingredients_cleaned = [
    ' '.join(recipe) for recipe in punctuation_lowercase_stemmed_dash
]

In [6]:
# Target labels: the 'cuisine' value of every training record, in the same
# order as json_data.
json_cuisine = [recipe['cuisine'] for recipe in json_data]

In [7]:
# Encode the cuisine labels as integer class ids.
# The fitted encoder `lb` is kept so predictions can be decoded later.
lb = LabelEncoder()
lb.fit(json_cuisine)
y = lb.transform(json_cuisine)

In [8]:
# Build TF-IDF features from the cleaned recipe strings using 1- to 3-grams.
# The vectoriser is fitted here and kept in `tfidf` for later reuse.
# NOTE(review): TfidfVectorizer's default token_pattern only matches runs of
# word characters, so the dashes inserted during cleaning presumably act as
# token separators here — confirm that this is the intended tokenisation.
tfidf = TfidfVectorizer(ngram_range=(1, 3), use_idf=True)
X = tfidf.fit_transform(json_ingredients_cleaned)

In [10]:
# RBF-kernel support vector classifier with fixed hyper-parameters.
model = SVC(
    C=100,
    kernel='rbf',
    gamma=0.01,
    tol=0.001,
    max_iter=-1,
    verbose=False,
)
# Train on X / y. Kept as the cell's last expression so the fitted
# estimator is displayed as the cell output.
model.fit(X, y)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
# Load the test records (each one is later read for its 'id' and
# 'ingredients' keys).
# The encoding is pinned to UTF-8 so that non-ASCII text in the file decodes
# the same way on every platform instead of depending on the locale's
# default codec.
with open('data/test.json', encoding='utf-8') as json_file:
    json_data_test = json.load(json_file)

# Per-recipe ingredient lists, in the same order as json_data_test.
json_ingredients_test = [recipe['ingredients'] for recipe in json_data_test]

In [12]:
# Normalise the test recipes with the same stages used for the training text:
# lowercase -> drop punctuation/underscores -> Snowball stem (reusing `sno`)
# -> dashes for the spaces inside an ingredient -> one space-separated
# string per recipe.
lowercase_test = [
    [ingredient.lower() for ingredient in recipe]
    for recipe in json_ingredients_test
]
punctuation_lowercase_test = [
    [re.sub(r'([^\s\w]|_)+', '', ingredient) for ingredient in recipe]
    for recipe in lowercase_test
]
punctuation_lowercase_stemmed_test = [
    list(map(sno.stem, recipe)) for recipe in punctuation_lowercase_test
]
punctuation_lowercase_stemmed_dash_test = [
    ['-'.join(ingredient.split(' ')) for ingredient in recipe]
    for recipe in punctuation_lowercase_stemmed_test
]
json_ingredients_cleaned_test = [
    ' '.join(recipe) for recipe in punctuation_lowercase_stemmed_dash_test
]

# Vectorise with the already-fitted `tfidf` (transform only, no refit).
X_test = tfidf.transform(json_ingredients_cleaned_test)

In [13]:
# Predict encoded class ids for the test matrix, then map them back to the
# original cuisine labels through the fitted LabelEncoder.
y_test = model.predict(X_test)
y_pred = lb.inverse_transform(y_test)

# Assemble an id/cuisine table (columns in that order) and write it to CSV
# without the index column.
test_id = [recipe['id'] for recipe in json_data_test]
df = pd.DataFrame(
    {'id': test_id, 'cuisine': y_pred},
    columns=['id', 'cuisine'],
)
df.to_csv('cuisine_output.csv', index=False)

In [None]:
def predict_from_list(input_list):
    lowercase_text = [x.lower() for x in input_list]
    