In [22]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../')

import os

# Cap the thread pools used by the native math back-ends (MKL, GotoBLAS, OpenMP).
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['GOTO_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'

# THEANO_FLAGS is parsed when theano is first imported, so it has to be in the
# environment *before* the import below. Assigning it afterwards (as this cell
# used to do) is silently ignored. A kernel that already imported theano needs
# a restart for these flags to take effect.
os.environ['THEANO_FLAGS'] = 'device=cpu,blas.ldflags=-lblas -lgfortran'

import theano
theano.config.openmp = True

import cPickle as pickle

import json
from copy import copy

from itertools import groupby,chain,tee,izip,islice
from collections import Iterable,Counter
from operator import itemgetter 

import numpy as np
from random import shuffle

from ipywidgets import interact

from sklearn.cross_validation import train_test_split

import re

import nltk
from nltk.stem import RSLPStemmer,SnowballStemmer
from nltk import PunktSentenceTokenizer,FreqDist
from nltk.corpus import stopwords
from nltk import UnigramTagger,BigramTagger

from gensim import corpora

from keras.models import Sequential  
from keras.layers.core import TimeDistributedDense,RepeatVector, Activation, Dropout,Dense,Flatten
from keras.layers.recurrent import GRU, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint    
# from keras.utils.visualize_util import plot

import seq2seq
from seq2seq.models import SimpleSeq2seq

from utils_pack.utils import pickle_out,pickle_in,ensure_dir
from utils_pack.word_embeddings_utils import IngredientDataTransformer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
# Load the pickled ingredient dataset. The cells below iterate it as
# recipes -> ingredient entries -> word tokens.
ingredients_path = os.path.join("../datasets", "cozinhabrasileira", "dataset_ingredients.pkl")
ingredients = pickle_in(ingredients_path)

In [66]:
# Flatten exactly one level: the result is one flat list holding every element
# of every item in `ingredients` (the ingredient entries of all recipes).
flat_ingredients = list(chain.from_iterable(ingredients))

In [67]:
# Build the vocabulary from all ingredient entries, then prune it with
# gensim's default filter_extremes() thresholds.
dictionary = corpora.Dictionary(flat_ingredients)
dictionary.filter_extremes()

# Keep only words that survived pruning. Membership is tested against the
# token2id dict (constant time) instead of dictionary.values(), which rebuilt
# and linearly scanned the whole token list for every single word.
filtered_ingredients = [[[word for word in entry if word in dictionary.token2id]
                         for entry in ingr]
                        for ingr in ingredients]

# Encode words as integer ids shifted by +1, leaving id 0 unused by real words
# (predict_text_ingredient skips 0 and subtracts 1 when decoding).
corpus = [[[dictionary.token2id[word] + 1 for word in entry]
           for entry in ingr]
          for ingr in filtered_ingredients]

pickle_out(os.path.join("../datasets","deep_learning_datasets","dictionary.pkl"),dictionary)

In [68]:
# Fixed seed so the 70 / 15 / 15 train / validation / test partition is the same
# on every run; without it each kernel restart produced a different split and
# the reported results could not be reproduced.
SPLIT_SEED = 42

corpus_train, corpus_rest = train_test_split(corpus, train_size=0.7,
                                             random_state=SPLIT_SEED)
# The remaining 30% is halved into validation and test.
corpus_valid, corpus_test = train_test_split(corpus_rest, train_size=0.5,
                                             random_state=SPLIT_SEED)

In [69]:
# Folder receiving the .pkl path passed with each split
# (presumably where the transformer stores the tensors -- confirm in utils_pack).
tensor_dir = os.path.join("../datasets", "deep_learning_datasets")

IDG = IngredientDataTransformer(
    dictionary_word_count=len(dictionary),
    maxlen_ingredient=6,
    max_ingredients=6,
    runs=8,
)

# Convert each split into an (X, Y) tensor pair.
X_train, Y_train = IDG.corpus_to_x_y_tensors(corpus_train, os.path.join(tensor_dir, "train.pkl"))
X_valid, Y_valid = IDG.corpus_to_x_y_tensors(corpus_valid, os.path.join(tensor_dir, "valid.pkl"))
X_test, Y_test = IDG.corpus_to_x_y_tensors(corpus_test, os.path.join(tensor_dir, "test.pkl"))

(4599, 6, 6)
(4599, 6, 681)
(1008, 6, 6)
(1008, 6, 681)
(982, 6, 6)
(982, 6, 681)


In [70]:
# Location of this experiment's artifacts; the folder is created if needed.
model_folder = "../models"
model_name = "model_seq2seq_1"
ensure_dir(os.path.join(model_folder, model_name))

# Tensor dimensions. Both sixes presumably mirror max_ingredients /
# maxlen_ingredient given to IngredientDataTransformer -- TODO confirm.
words_in_ingredient = 6
timesteps = 6
# Corpus word ids are shifted by +1, so the id space has one more slot than the dictionary.
nb_word = len(dictionary) + 1

In [83]:
# Encoder input: each timestep carries one ingredient as `words_in_ingredient`
# word ids. Decoder output: `words_in_ingredient` steps, each a distribution
# over `nb_word` ids. The named constants from the configuration cell replace
# the bare 6s that previously duplicated them here.
model = SimpleSeq2seq(input_dim=words_in_ingredient,
                      hidden_dim=16,
                      output_length=words_in_ingredient,
                      output_dim=nb_word)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# NOTE(review): despite the ".json" extension this file is a pickle written by
# pickle_out, not JSON. The filename is left unchanged so existing loaders keep
# working; consider renaming it to ".pkl" together with whatever reads it.
pickle_out(os.path.join(model_folder,model_name,'model_architecture.json'),model)

In [None]:
# Checkpoint the weights during training; with save_best_only=True the file is
# only rewritten when the monitored quantity improves.
weights_path = os.path.join(model_folder, model_name, "model_weights.h5")
checkpointer = ModelCheckpoint(filepath=weights_path, verbose=2, save_best_only=True)

# Train against the training tensors, validating on the held-out validation split.
model.fit(
    X_train, Y_train,
    validation_data=(X_valid, Y_valid),
    batch_size=16,
    nb_epoch=300,
    show_accuracy=True,
    verbose=2,
    callbacks=[checkpointer],
)

In [None]:
def predict_text_ingredient(model, x, y, id2word=None):
    """Decode one sample's input, target and model prediction into text.

    Word ids are stored shifted by +1, and id 0 marks an empty slot, so every
    non-zero id ``j`` is looked up as ``id2word[j - 1]``.

    Parameters
    ----------
    model : object exposing ``predict_classes(batch)``.
    x : 2-D numpy array, one row of word ids per input ingredient.
    y : 2-D numpy array, one score/one-hot row per target word; the argmax of
        each row is taken as the word id.
    id2word : optional mapping from zero-based token id to word. Defaults to
        the notebook-level ``dictionary``.

    Returns
    -------
    tuple of str
        ``(ingredients_text, ground_truth_text, prediction_text)``. Input
        ingredients are separated by newlines, words by single spaces.
    """
    if id2word is None:
        id2word = dictionary

    def decode(ids):
        # int() keeps the lookup key a plain integer even for float/numpy ids.
        return " ".join(id2word[int(j) - 1] for j in ids if j != 0)

    # Skip rows that are entirely zero. The previous check compared each row
    # with [0] * x.shape[0] (the number of rows, not the row length), which was
    # only correct for square inputs.
    ingredients_text = "\n".join(decode(row) for row in x if np.any(row))
    ground_truth_text = decode(np.argmax(y, axis=1))

    # The model expects a batch axis, so wrap the single sample.
    prediction = model.predict_classes(x.reshape(1, x.shape[0], x.shape[1]))
    prediction_text = decode(prediction[0])
    return (ingredients_text, ground_truth_text, prediction_text)

In [None]:
def present_results(nr):
    """Print input ingredients, ground truth and prediction for one test sample.

    Parameters
    ----------
    nr : value convertible to int (the widget passes a string) selecting a
        sample in ``X_test`` / ``Y_test``.

    Invalid input (empty, non-numeric, or out of range) is reported with a
    short message instead of raising, because the value arrives as free text
    from the interactive widget. Returns None in every case.
    """
    try:
        index = int(nr)
    except (TypeError, ValueError):
        print("Enter an integer test-sample index.")
        return

    try:
        x = X_test[index]
        y = Y_test[index]
    except IndexError:
        print("Index %d is out of range for %d test samples." % (index, len(X_test)))
        return

    ingredients_text, ground_truth_text, prediction_text = predict_text_ingredient(model, x, y)
    # Single-argument print calls produce the same output as the former
    # comma-separated print statements and behave identically on Python 2 and 3.
    print("INGREDIENTS LIST:\n" + ingredients_text)
    print("\nGROUND TRUTH:\n" + ground_truth_text)
    print("\nPREDICTION:\n" + prediction_text)
    
# Hook present_results up to an ipywidgets control; `nr` starts as the string "10"
# and is converted to an integer inside present_results.
interact(present_results,nr="10")