<div class="alert alert-info">
    <h1>Imports</h1>
    </div>

In [39]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import re
import nltk
import gzip
import shutil
import tarfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from gensim.models.keyedvectors import KeyedVectors
import keras
import tensorflow as tf
from keras import backend
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Dropout, LSTM, Bidirectional, TimeDistributed, Dense, Flatten
from keras import layers
from keras.layers.merge import concatenate
from keras import preprocessing

<div class="alert alert-info">
    <h1>Data preparation</h1>
    </div>

In [2]:
if('generation-projet-trn.tar.gz' not in os.listdir()):
    ! wget https://sophierosset.github.io/docs/1718/generation-projet-trn.tar.gz
    ! wget https://sophierosset.github.io/docs/1718/generation-projet-dev.tar.gz
    ! wget https://sophierosset.github.io/docs/1718/generation-projet-test.tar.gz

    for file in os.listdir():
        if(file.endswith("tar.gz")):
            tar = tarfile.open(file, "r:gz")
            tar.extractall()
            tar.close()

In [3]:
def get_data(filename):
    l = []
    for file in os.listdir(filename):
        if(file.endswith("xml")):
            with open(os.path.join(filename,file)) as f:
                l += f.read().split("\n")
    
    return [el for el in l if len(el) != 0]

In [4]:
train = get_data("generation-projet-trn")
val = get_data("generation-projet-dev")
test = get_data("generation-projet-test")
print("nbre of sentences in training set: {}\nnbre of sentences in \
validation set: {}\nnbre of sentences in testing set: {}"\
      .format(len(train),len(val),len(test)))

nbre of sentences in training set: 2772244
nbre of sentences in validation set: 1839
nbre of sentences in testing set: 2450


In [5]:
tags = set()
l = re.findall('<[^/].*?>',str(train))
for el in l:
    tags.add(el)
    
closed_tags = set()
l = re.findall('</.*?>',str(train))
for el in l:
    closed_tags.add(el)
print("The set of tags in the xml files are: {}".format(tags))

The set of tags in the xml files are: {'<ingredient>', '<neg_cat-ingredient>', '<neg_ingredient>', '<recipe>', '<cat-ingredient>'}


In [6]:
tag2index  = {}
index2tag = {}

for i,tag in enumerate(tags):
    tag2index[tag] = i+1
    index2tag[i+1] = tag

In [7]:
def prepare_data(l):
    
    X = []
    y = []
    
    for el in l:
        tmp = []
        tmp1 = []
        tag = 0
        
        for w in el.split(" "):
            if(w in tags or w in closed_tags):
                if(w in tags):
                    tag = tag2index[w]
                else:
                    tag = 0
            else:
                tmp.append(w)
                tmp1.append(tag)
        
        X.append(tmp)
        y.append(tmp1)
        
    return X,y

In [8]:
X_train,y_train = prepare_data(train)
X_val,y_val = prepare_data(val)
X_test,y_test = prepare_data(test)

In [9]:
lengths = [len(x) for x in X_train]
print("nombre maximal de mots dans un phrase:{0}\n\
nombre minimal de mots dans un phrase:{1}\n\
nombre moyenne de mots dans un phrase:{2:.1f}"\
      .format(np.max(lengths),np.min(lengths),np.mean(lengths)))

nombre maximal de mots dans un phrase:92
nombre minimal de mots dans un phrase:2
nombre moyenne de mots dans un phrase:8.5


In [10]:
maxlen = np.max(lengths)
max_words = 50000
embedding_dim = 200

In [11]:
tokenizer = Tokenizer(num_words = max_words)

tokenizer.fit_on_texts(X_train)

sequences = np.array(tokenizer.texts_to_sequences(X_train))

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 10725 unique tokens.


In [12]:
X_train = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
y_train = preprocessing.sequence.pad_sequences(y_train, maxlen=maxlen)

In [13]:
sequences = np.array(tokenizer.texts_to_sequences(X_val))
X_val = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
y_val = preprocessing.sequence.pad_sequences(y_val, maxlen=maxlen)

sequences = np.array(tokenizer.texts_to_sequences(X_test))
X_test = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
y_test = preprocessing.sequence.pad_sequences(y_test, maxlen=maxlen)

In [14]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape

((2772244, 92), (2772244, 92), (2450, 92), (2450, 92), (1839, 92), (1839, 92))

<div class="alert alert-info">
    <h1>Pretrained word embeddings</h1>
    </div>

In [15]:
if('output.txt' not in os.listdir()):
    ! wget http://embeddings.net/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin
    ! git clone https://github.com/marekrei/convertvec
    os.chdir(os.path.join(os.getcwd(),"convertvec"))
    ! make
    ! ./convertvec bin2txt ../frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin ../output.txt
    os.chdir("/".join(os.getcwd().split("/")[:-1]))

In [16]:
embeddings_index = {}

with open('output.txt','r',encoding='ISO-8859-1') as f:
    for i,line in enumerate(f):
        if(i != 0):
            values = line.split()
            word = "".join([values[i] for i in range(len(values) - 200)])

            coefs = np.asarray(values[len(values) - 200:], dtype='float32')

            embeddings_index[word] = coefs
            

embeddings_index["unk"] = np.zeros(200)
print('Found %s word vectors.' % len(embeddings_index))

Found 2516802 word vectors.


In [17]:
max_words = np.min([50000,len(word_index)])

embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i < max_words:
        
        embedding_vector = embeddings_index.get(word)
        
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embeddings_index.get("unk")

embedding_matrix.shape

(10725, 200)

In [74]:
model = Sequential()
model.add(Embedding(max_words , embedding_dim, weights = [embedding_matrix], input_length=maxlen))
model.add(Bidirectional(LSTM(300, return_sequences=True),merge_mode='ave'))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(300, return_sequences=True),merge_mode='ave'))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(6, activation = 'softmax'), input_shape=(92, 300)))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 92, 200)           2145000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 92, 300)           1202400   
_________________________________________________________________
dropout_7 (Dropout)          (None, 92, 300)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 92, 300)           1442400   
_________________________________________________________________
dropout_8 (Dropout)          (None, 92, 300)           0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 92, 6)             1806      
Total params: 4,791,606
Trainable params: 4,791,606
Non-trainable params: 0
_________________________________________________________________


In [75]:
y_train = (np.arange(y_train.max()+1) == y_train[...,None]).astype(int)
y_val = (np.arange(y_val.max()+1) == y_val[...,None]).astype(int)

In [76]:
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [78]:
model.fit(X_train, y_train,
          batch_size=256,
          epochs=1,
          validation_data=[X_val, y_val])

Train on 2772244 samples, validate on 1839 samples
Epoch 1/1


<keras.callbacks.History at 0x7f11511a8e10>

In [84]:
ypred = model.predict(X_test)

In [89]:
from sklearn.metrics import accuracy_score

In [101]:
accuracy_score(np.ravel(y_test), np.ravel(np.argmax(ypred,axis=2)))

0.9820984915705413

In [102]:
model.save_weights("weights.h5")