## LSTM RNN

#### imports

In [77]:
from sklearn.cross_validation import train_test_split
import numpy as np

from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

In [78]:
# Keras is easy to pick up; it is used here to stay consistent with the DL library that teacher Aaron uses...
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml

In [79]:
import pandas as pd
import yaml

In [80]:
# Hyper-parameters shared by the cells below.

# Width of the word vectors: passed to Word2Vec (size=) and to the
# Embedding layer (output_dim=).
vocab_dim = 100
# Length every index sequence is padded/truncated to by pad_sequences.
maxlen = 100
# NOTE(review): n_iterations, n_exposures and window_size are not referenced
# by any cell visible in this notebook -- confirm whether they were meant to
# be passed to Word2Vec.
n_iterations = 5 
n_exposures = 10
window_size = 7
# Batch size used for both model.fit and model.evaluate.
batch_size = 32
# Number of epochs passed to model.fit.
n_epoch = 4
# input_length of the Embedding layer; inputs are padded to `maxlen`, so the
# two values are expected to stay equal.
input_length = 100

### methods

In [81]:
def classFit(x):
    """Map a corpus row to a binary class label.

    Parameters
    ----------
    x : mapping (e.g. a pandas row)
        Must expose the key 'qual_a_melhor_classificao_para_esse_texto'.

    Returns
    -------
    int
        1 when the row is classified as "diario", 0 otherwise.
    """
    # These labels are used as targets for a sigmoid output trained with
    # binary_crossentropy, which expects values in {0, 1}; a -1 label falls
    # outside that range.
    return 1 if x['qual_a_melhor_classificao_para_esse_texto'] == "diario" else 0
    
def loadfile():
    """Load the labelled corpus and return its texts and class labels.

    Reads 'corpus.csv.gz' (gzip-compressed CSV), keeps only the rows whose
    'qual_a_melhor_classificao_para_esse_texto:confidence' equals 1 and
    labels each remaining row with classFit.

    Returns
    -------
    combined : pandas.Series
        The 'content' column of the retained rows, indexed 0..n-1.
    y : numpy.ndarray
        Class label of each retained row, in the same order as `combined`.
    """
    corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
    confident = corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1
    # reset_index returns a new frame, so its result has to be kept; when it
    # is discarded the filtered rows retain their original (gappy) labels and
    # `combined[i]` no longer lines up with `y[i]`.
    corpus = corpus[confident].reset_index(drop=True)
    corpus['class'] = corpus.apply(classFit, axis=1)
    y = corpus['class'].values

    combined = corpus.content

    return combined, y

In [82]:
import nltk
import unicodedata

def remove_accents(input_str):
    """Return `input_str` with its combining marks removed.

    The text is decomposed with NFKD normalisation and every character that
    unicodedata reports as combining is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for char in decomposed:
        if unicodedata.combining(char):
            continue
        kept.append(char)
    return u"".join(kept)

def tokenizer(text):
    ''' Tokenise every document in `text`.

        Each document is split into runs of word characters with a regex
        tokenizer (punctuation and whitespace are discarded); every token is
        then lower-cased and stripped of accents via remove_accents.

        text:    iterable of strings, one per document
        returns: list with one list of normalised tokens per document
    '''
    # One tokenizer serves every document; it does not depend on the
    # sentence, so it is built once instead of once per loop iteration.
    reg_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return [
        [remove_accents(w.lower()) for w in reg_tokenizer.tokenize(sentence)]
        for sentence in text
    ]

In [83]:
def create_dictionaries(model=None,
                        combined=None):
    ''' Build the vocabulary lookups and index-encode the corpus.

        1- Creates a word to index mapping (indices start at 1; 0 is used
           for padding and for words missing from the vocabulary)
        2- Creates a word to vector mapping
        3- Turns every sentence of `combined` into a list of word indices
           and pads the result to `maxlen`

        model:    trained Word2Vec model
        combined: iterable of tokenised sentences (iterables of words)
        returns:  (w2indx, w2vec, combined) where `combined` is the padded
                  index array; when either argument is None a message is
                  printed and None is returned instead
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()

        w2v = dict(zip(model.wv.index2word, model.wv.syn0))

        gensim_dict.doc2bow(w2v.keys(),
                            allow_update=True)
        # Shift by one so that index 0 never denotes a real word.
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        # Go through model.wv, as the rest of this function does; indexing
        # the model object directly (model[word]) is deprecated in gensim.
        w2vec = {word: model.wv[word] for word in w2indx}

        def parse_dataset(sentences):
            ''' Words become integers; unknown words become 0
            '''
            # dict.get replaces a bare `except:` that would also have hidden
            # unrelated errors raised during the lookup.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in sentences]
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print ('No data provided...')

In [84]:
def word2vec_train(combined):
    """Fit a Word2Vec model on the tokenised corpus and index-encode it.

    The trained model is written to 'Word2vec_model.pkl'. The return value
    is the (index_dict, word_vectors, combined) triple produced by
    create_dictionaries for that model and corpus.
    """
    w2v_model = Word2Vec(
        combined,
        size=vocab_dim,
        workers=16,
        iter=10,
        negative=20,
    )
    # replace=True is used here to trim memory.
    w2v_model.init_sims(replace=True)

    w2v_model.save('Word2vec_model.pkl')

    lookups = create_dictionaries(model=w2v_model, combined=combined)
    index_dict, word_vectors, combined = lookups
    return index_dict, word_vectors, combined

In [85]:
def get_data(index_dict, word_vectors, combined, y, test_size=0.2, random_state=None):
    """Build the embedding matrix and split the data into train and test sets.

    Parameters
    ----------
    index_dict : dict
        Word -> row index in the embedding matrix.
    word_vectors : dict
        Word -> vector of length `vocab_dim`.
    combined : array-like
        Index-encoded, padded sentences.
    y : array-like
        Class label per sentence, aligned with `combined`.
    test_size : float, optional
        Fraction of the samples held out for testing (default 0.2, the
        value that used to be hard-coded).
    random_state : int or None, optional
        Seed forwarded to train_test_split. The default None keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        (n_symbols, embedding_weights, x_train, y_train, x_test, y_test)
    """
    # One extra row: the indices produced by create_dictionaries start at 1,
    # so row 0 stays all-zero.
    n_symbols = len(index_dict) + 1
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(
        combined, y, test_size=test_size, random_state=random_state)
    print (x_train.shape,y_train.shape)
    return n_symbols,embedding_weights,x_train,y_train,x_test,y_test

In [86]:
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    """Build, train, evaluate and persist the LSTM classifier.

    The network is Embedding -> LSTM(50) -> Dropout(0.5) -> Dense(1) ->
    sigmoid, with the embedding layer initialised from `embedding_weights`.
    The test split is used both as validation data during fitting and for
    the final evaluation. The architecture is written to 'lstm.yml', the
    weights to 'lstm.h5', and the evaluation score is printed. Nothing is
    returned.
    """
    print ('Defining a Simple Keras Model...')
    model = Sequential()  # or Graph or whatever

    embedding_layer = Embedding(input_dim=n_symbols,
                                output_dim=vocab_dim,
                                weights=[embedding_weights],
                                input_length=input_length,  # Adding Input Length
                                mask_zero=True)
    model.add(embedding_layer)

    recurrent_layer = LSTM(units=50,
                           activation="sigmoid",
                           recurrent_activation="hard_sigmoid")
    model.add(recurrent_layer)
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    print ('Compiling the Model...')
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    print ("Train...")
    held_out = (x_test, y_test)
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=n_epoch,
              verbose=0,
              validation_data=held_out)

    print ("Evaluate...")
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # The YAML description produced by Keras is itself dumped as a YAML
    # string scalar before being written to disk.
    yaml_string = model.to_yaml()
    with open('lstm.yml', 'w') as outfile:
        serialized = yaml.dump(yaml_string, default_flow_style=True)
        outfile.write(serialized)
    model.save_weights('lstm.h5')
    print ('Test score:', score)

In [87]:
def train():
    """Run the whole training pipeline end to end.

    Loads the corpus, tokenises it, trains Word2Vec, builds the embedding
    matrix and train/test split, then trains and saves the LSTM. Progress
    messages are printed along the way; nothing is returned.
    """
    print ('Loading Data...')
    texts, labels = loadfile()
    print (len(texts),len(labels))

    print ('Tokenising...')
    tokenised = tokenizer(texts)

    print ('Training a Word2vec model...')
    index_dict, word_vectors, encoded = word2vec_train(tokenised)

    print ('Setting up Arrays for Keras Embedding Layer...')
    prepared = get_data(index_dict, word_vectors, encoded, labels)
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = prepared
    print (x_train.shape,y_train.shape)

    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

In [88]:
def input_transform(string):
    """Encode one raw text as a padded index sequence for the LSTM.

    The text is tokenised, the saved Word2Vec model is loaded from
    'Word2vec_model.pkl', and create_dictionaries turns the tokens into
    word indices padded to `maxlen`.

    string:  the raw text to encode
    returns: the padded index array built by create_dictionaries for this
             single document
    """
    # Reuse tokenizer() so prediction-time preprocessing (regex split,
    # lower-casing, accent removal) cannot drift from what training used.
    words = np.array(tokenizer([string])).reshape(1, -1)
    model = Word2Vec.load('Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined

In [89]:
def lstm_predict(string):
    """Classify one text with the saved LSTM and print the verdict.

    Loads the architecture from 'lstm.yml' and the weights from 'lstm.h5',
    encodes `string` with input_transform and prints the text followed by
    ' positive' when the predicted class is 1, ' negative' otherwise.
    Nothing is returned.
    """
    print ('loading model......')
    with open('lstm.yml', 'r') as f:
        # The file holds one YAML string scalar (the Keras architecture
        # description), so the safe loader is sufficient. Calling
        # yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects from the file.
        yaml_string = yaml.safe_load(f)
    model = model_from_yaml(yaml_string)

    print ('loading weights......')
    model.load_weights('lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',metrics=['accuracy'])
    # reshape returns a new array, so the result must be kept.
    data = input_transform(string).reshape(1, -1)
    result = model.predict_classes(data)
    if result[0][0]==1:
        print (string,' positive')
    else:
        print (string,' negative')

In [90]:
# Run the full pipeline: load the corpus, train Word2Vec, then train and
# save the LSTM (writes Word2vec_model.pkl, lstm.yml and lstm.h5).
train()

Loading Data...
551 551
Tokenising...
Training a Word2vec model...
Setting up Arrays for Keras Embedding Layer...
(440, 100) (440,)
(440, 100) (440,)
Defining a Simple Keras Model...
Compiling the Model...
Train...
Evaluate...


In [91]:
combined,y=loadfile()

idx = 4
# `combined` is a pandas Series, so `combined[idx]` is a lookup by index
# label, whereas `y[idx]` is positional on a NumPy array. Select by position
# so the text and the label printed below belong to the same row.
string = combined.iloc[idx]
lstm_predict(string)

print(y[idx])

loading model......
loading weights......
Nunca imaginei ver isso um dia na mídia, mas para a nossa felicidade aconteceu: tricotar está na moda! Agora é comum ver um grupo de jovens ou senhoras batendo papo e tricotando. Ótima opção para relaxar. Sei tricotar o básico... o ponto tricô e o ponto laçada e meia. Fiz há um tempo estes singelos sapatinhos para o frio. Lembro de tricotar desde criança para as minhas barbies e tentando terminar meu primeiro cachecol. Eu adorava tricotar, mas me faltava paciência na época. Ainda assim, como qualquer arte, recomendo. É ótimo para quem deseja relaxar, principalmente as pessoas que não trabalham nesta área. Afinal, quem não gosta de usar algo lindo e dizer para todos que foi você mesma quem fez? Abaixo deixo um vídeo que fala exatamente desta nova moda. Vale a Dica! Fazer tricô é a nova mania em São Paulo E você, o que mais gosta de tricotar? Como aprendeu? Compartilhe a sua experiência!  negative
1
