## LSTM RNN

#### based on: https://github.com/iphysresearch/AI_Programs/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%20Machine%20Learning/ml_lecture_3/Chinese-sentiment-analysis/chinese-sentiment-analysis_w2v_lstm.ipynb

In [21]:
from sklearn.cross_validation import train_test_split
import numpy as np

from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

In [22]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml

In [23]:
import pandas as pd
import yaml

In [24]:
# Dimensionality of each word vector; passed to Word2Vec as `size` and to the
# Keras Embedding layer as `output_dim`.
vocab_dim = 300
# Length every index sequence is padded/truncated to by pad_sequences.
maxlen = 300
# NOTE(review): n_iterations, n_exposures and window_size are not referenced by
# any of the visible cells (word2vec_train hard-codes its own settings) --
# confirm whether they are still needed.
n_iterations = 5 
n_exposures = 10
window_size = 10
# Mini-batch size used by both model.fit and model.evaluate.
batch_size = 32
# Number of epochs passed to model.fit.
n_epoch = 8
# Sequence length declared to the Embedding layer; presumably must equal
# maxlen, since the padded sequences are what the model is fed.
input_length = 300

### methods

In [25]:
def classFit(x):
    """Return the binary label of a corpus row: 1 for "diario", 0 otherwise.

    `x` is anything indexable by column name (a DataFrame row or a dict).
    """
    label = x['qual_a_melhor_classificao_para_esse_texto']
    return 1 if label == "diario" else 0
    
def loadfile():
    """Load the annotated corpus and return its texts and binary labels.

    Reads ``corpus.csv.gz`` from the working directory and keeps only rows
    whose annotation confidence equals 1.

    Returns:
        combined: pandas Series of the ``content`` column, indexed 0..n-1.
        y: numpy array of labels aligned with ``combined``
           (1 when the annotation is "diario", 0 otherwise).
    """
    label_col = 'qual_a_melhor_classificao_para_esse_texto'

    corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
    corpus = corpus[corpus[label_col + ':confidence'] == 1]
    # reset_index returns a new frame; the result must be kept, otherwise the
    # Series returned below keeps the pre-filter labels and `combined[i]`
    # no longer refers to the same row as the positional `y[i]`.
    corpus = corpus.reset_index(drop=True)

    # Vectorised equivalent of applying classFit row by row.
    corpus['class'] = (corpus[label_col] == "diario").astype(int)
    y = corpus['class'].values

    combined = corpus.content

    return combined, y

In [26]:
import nltk
import unicodedata

def remove_accents(input_str):
    """Strip diacritics: NFKD-decompose the text and drop combining marks."""
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return u"".join(base_chars)

def tokenizer(text):
    ''' Tokenise every document of an iterable of strings.

        Each document is split into runs of word characters (regex ``\\w+``),
        and every token is lower-cased and stripped of accents.

        Returns a list with one list of tokens per input document, in the
        same order as the input.
    '''
    # The tokenizer does not depend on the document, so build it once instead
    # of once per iteration.
    reg_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    return [
        [remove_accents(w.lower()) for w in reg_tokenizer.tokenize(sentence)]
        for sentence in text
    ]

In [27]:
def create_dictionaries(model=None,
                        combined=None):
    ''' Build the vocabulary mappings and convert documents to index arrays.

        1- Creates a word -> index mapping (indices start at 1; 0 is kept
           for words that are not in the Word2Vec vocabulary)
        2- Creates a word -> vector mapping
        3- Replaces every token of `combined` by its index and pads /
           truncates each document to `maxlen`

        Parameters:
            model: trained gensim Word2Vec model.
            combined: iterable of tokenised documents (iterables of words).

        Returns:
            (w2indx, w2vec, combined) where `combined` is the padded array
            produced by pad_sequences. When either argument is None a
            message is printed and None is returned, so callers that unpack
            the result must pass both arguments.
    '''
    if combined is None or model is None:
        print ('No data provided...')
        return None

    # Only the vocabulary is needed to build the dictionary.
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.index2word,
                        allow_update=True)

    # Shift ids by one so that 0 stays free for unknown words / padding.
    w2indx = {v: k+1 for k, v in gensim_dict.items()}
    # Go through model.wv, like the vocabulary access above, instead of the
    # deprecated model[word] shortcut.
    w2vec = {word: model.wv[word] for word in w2indx}

    def parse_dataset(documents):
        ''' Words become integers; out-of-vocabulary words become 0
        '''
        # dict.get replaces a bare `except:` that would also have hidden
        # unrelated errors (including KeyboardInterrupt).
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in documents]

    combined = parse_dataset(combined)
    combined = sequence.pad_sequences(combined, maxlen=maxlen)

    return w2indx, w2vec, combined

In [28]:
def word2vec_train(combined):
    """Train Word2Vec on the tokenised corpus, save it, and index the corpus.

    The model is written to 'Word2vec_model.pkl'. Returns the tuple produced
    by create_dictionaries: (word -> index dict, word -> vector dict, padded
    index array).
    """
    w2v_model = Word2Vec(
        combined,
        size=vocab_dim,
        workers=16,
        iter=10,
        negative=20,
    )

    # trim memory
    w2v_model.init_sims(replace=True)
    w2v_model.save('Word2vec_model.pkl')

    word_to_index, word_to_vector, padded = create_dictionaries(
        model=w2v_model, combined=combined)
    return word_to_index, word_to_vector, padded

In [29]:
def get_data(index_dict,word_vectors,combined,y, test_size=0.2, random_state=None):
    """Build the embedding matrix and split the data into train/test sets.

    Parameters:
        index_dict: word -> row index mapping (indices start at 1).
        word_vectors: word -> vector mapping, vectors of length vocab_dim.
        combined: padded index array, one row per document.
        y: labels aligned with `combined`.
        test_size: fraction of the data held out for testing (default 0.2,
            the value that used to be hard-coded).
        random_state: seed forwarded to train_test_split. The default None
            keeps the previous unseeded behaviour; pass an int to make the
            split, and therefore the reported scores, reproducible.

    Returns:
        (n_symbols, embedding_weights, x_train, y_train, x_test, y_test)
    """
    # One extra row: index 0 is not assigned to any word and stays all-zero.
    n_symbols = len(index_dict) + 1
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]

    x_train, x_test, y_train, y_test = train_test_split(
        combined, y, test_size=test_size, random_state=random_state)

    print ("embedding_weights", embedding_weights.shape)
    print (x_train.shape,y_train.shape)
    return n_symbols,embedding_weights,x_train,y_train,x_test,y_test

In [30]:
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    """Build, train, evaluate and persist the Embedding + LSTM classifier.

    The architecture is written to 'W2V_lstm.yml' and the weights to
    'W2V_lstm.h5'. The test split is used both as validation data during
    fitting and for the final evaluation. Nothing is returned.
    """
    print ('Defining a Simple Keras Model...')
    # Layers are created in stacking order: pretrained embedding, LSTM,
    # dropout, then a single sigmoid output unit.
    stack = [
        Embedding(output_dim=vocab_dim,
                  input_dim=n_symbols,
                  mask_zero=True,
                  weights=[embedding_weights],
                  input_length=input_length),
        LSTM(units=50, recurrent_activation="hard_sigmoid", activation="sigmoid"),
        Dropout(0.5),
        Dense(1),
        Activation('sigmoid'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)

    print ('Compiling the Model...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print ("Train...")
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=n_epoch,
              verbose=1,
              validation_data=(x_test, y_test))

    print ("Evaluate...")
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # The YAML string from to_yaml() is itself dumped as a YAML scalar; the
    # loading code reads it back with yaml before calling model_from_yaml.
    architecture = model.to_yaml()
    with open('W2V_lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(architecture, default_flow_style=True))
    model.save_weights('W2V_lstm.h5')
    print ('Test score:', score)

In [31]:
def train():
    """Run the whole pipeline: load, tokenise, train Word2Vec, train the LSTM."""
    print ('Loading Data...')
    texts, labels = loadfile()
    print (len(texts),len(labels))

    print ('Tokenising...')
    tokenised = tokenizer(texts)

    print ('Training a Word2vec model...')
    word_to_index, word_to_vector, padded = word2vec_train(tokenised)

    print ('Setting up Arrays for Keras Embedding Layer...')
    prepared = get_data(word_to_index, word_to_vector, padded, labels)
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = prepared
    print (x_train.shape,y_train.shape)

    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

In [32]:
def input_transform(string, model=None):
    """Convert one raw text into the padded index array the LSTM expects.

    Parameters:
        string: the raw document.
        model: optional, already loaded Word2Vec model. When omitted the
            model is loaded from 'Word2vec_model.pkl', as before; passing it
            in avoids re-reading the file when transforming many texts.

    Returns:
        The padded index array built by create_dictionaries for this single
        document (one row).
    """
    # Reuse the training-time tokenizer so prediction inputs are normalised
    # exactly like the training corpus; it yields a one-document list.
    sentences = tokenizer([string])

    if model is None:
        model = Word2Vec.load('Word2vec_model.pkl')

    _, _, combined = create_dictionaries(model, sentences)
    return combined

In [33]:
def lstm_predict(string):
    """Load the saved LSTM and print the predicted class for one text.

    Reads the architecture from 'W2V_lstm.yml' and the weights from
    'W2V_lstm.h5'. Prints the text followed by ' positive' when the
    predicted class is 1 and ' negative' otherwise.
    NOTE(review): class 1 is the "diario" label in this notebook, so the
    positive/negative wording looks inherited from a sentiment example --
    confirm the intended wording.
    """
    print ('loading model......')
    with open('W2V_lstm.yml', 'r') as f:
        # The file holds a single YAML-encoded string, so the safe loader is
        # enough; yaml.load without a Loader is unsafe and is rejected by
        # recent PyYAML versions.
        yaml_string = yaml.safe_load(f)
    model = model_from_yaml(yaml_string)

    print ('loading weights......')
    model.load_weights('W2V_lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',metrics=['accuracy'])

    # reshape returns a new array; the original call discarded its result.
    data = input_transform(string).reshape(1, -1)

    result=model.predict_classes(data, verbose=0)
    if result[0][0]==1:
        print (string,' positive')
    else:
        print (string,' negative')

In [34]:
# Run the full pipeline defined above; this writes Word2vec_model.pkl,
# W2V_lstm.yml and W2V_lstm.h5, which the prediction cells below read back.
train()

Loading Data...
534 534
Tokenising...
Training a Word2vec model...
Setting up Arrays for Keras Embedding Layer...
embedding_weights (4525, 300)
(427, 300) (427,)
(427, 300) (427,)
Defining a Simple Keras Model...
Compiling the Model...
Train...
Train on 427 samples, validate on 107 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Evaluate...


In [35]:
# Spot-check a single document: print the prediction, then the true label.
combined,y=loadfile()

idx = 466
# `y` is a positional numpy array, so the text must be fetched by position
# too; plain `combined[idx]` is a label lookup on the Series and can return a
# different row than y[idx] once rows have been filtered out.
string = combined.iloc[idx]
lstm_predict(string)

print(y[idx])

loading model......
loading weights......
Esse ano decidi eu mesma fazer os presentes que vou dar de páscoa para a minha família. Depois de muita pesquisa na net descobri o site da Cozinha da Janita que tem várias receitas diferentes de brigadeiro. vale procurar em outros sites também. Eu achei mais de 10 receitas entre brigadeiro de limão com mel, pistache, capuccino, macadâmia e até de panetone (esse vou fazer em dezembro com certeza!).   Fiquei com medo de fica muito duro e não conseguir enrolar e acabou que ficou meio mole...rs... Mas acho que ficaram gostosos!!!     Ai vão as fotos para que vocês possam ver minha aventura gastronômica que durou toda a manhã dessa sexta-feira da paixão!      Lá no fundo as caixinhas...No meio as forminhas coloridas...No cantinho esquerdo o granulado...Por último, brigadeiro de maracujá, capuccino, tradicional e de limão!     Detalhe do desenho nas caixinhas - ovinhos de páscoa     Depois de enrolados com muito capricho...Tradicional com granulado e

In [36]:
from sklearn.metrics import accuracy_score, f1_score

# Reload the persisted model and score it against every document.
# NOTE(review): loadfile() returns the whole corpus, including the rows the
# model was trained on, so these figures are not a held-out estimate.
with open('W2V_lstm.yml', 'r') as f:
    # The file holds one YAML-encoded string; the safe loader is sufficient
    # and, unlike bare yaml.load, works on recent PyYAML versions.
    yaml_string = yaml.safe_load(f)
model = model_from_yaml(yaml_string)

model.load_weights('W2V_lstm.h5')
model.compile(loss='binary_crossentropy',
              optimizer='adam',metrics=['accuracy'])

combined,y=loadfile()

# One prediction per document, in corpus order so it lines up with y.
y_pred = []
for text in combined:
    data = input_transform(text)
    result = model.predict_classes(data, verbose=0)
    y_pred.append(result[0][0])

print("acc", accuracy_score(y, y_pred))
print("f1", f1_score(y, y_pred))

acc 0.852059925094
f1 0.893959731544
