# Develop Word-Level Language Model

## Import libraries

In [0]:
from pickle import dump,load
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential,load_model
from keras.utils.np_utils import to_categorical
import numpy as np
import io

## Reading the dataset

In [0]:
# Download the corpus CSV into the working directory; the next cell opens it by its file name.
!wget https://raw.githubusercontent.com/ichaparroc/Word-Level-Neural-Languaje-Model-for-Generate-Text/master/spanish_emojis5.csv

In [180]:
# Read the whole corpus file as UTF-8 text and lower-case it in one go.
link = "spanish_emojis5.csv"
with io.open(link, encoding='utf-8') as corpus_file:
  raw_text = corpus_file.read()
data = raw_text.lower()
print('Corpus lenght in characters:', len(data))

Corpus lenght in characters: 44428


## Tokenization

In [141]:
# Every line of the file is treated as one independent text for the tokenizer.
corpus = data.lower().split("\n")

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size used for the embedding and the softmax layer: one more than the
# number of indexed words (presumably to leave room for index 0 — confirm with Keras docs).
total_words = len(tokenizer.word_index) + 1
print('Total words: ', total_words)

Total words:  2518


## Generate Sequences

In [142]:
# Encode every corpus line, then emit each prefix of length >= 2 as one training
# sequence: the last token of a prefix is the target, the ones before it the context.
encoded_lines = tokenizer.texts_to_sequences(corpus)
input_sequences = [
  encoded[:end]
  for encoded in encoded_lines
  for end in range(2, len(encoded) + 1)
]
print('Total Sequences:', len(input_sequences))

Total Sequences: 7651


## Pre-Padding Sequences

In [177]:
# The longest n-gram sequence fixes the common length; shorter ones are padded on the left.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
  pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
  )
)
print('Max Sequence Length:', max_sequence_len)

Max Sequence Length: 30


## Split into X (inputs) and y (outputs - *one-hot encoding*)

In [0]:
# All columns but the last form the input context; the last column is the word to predict.
sequences = np.array(input_sequences)
X = sequences[:, :-1]
# One-hot encode the target word index over the whole vocabulary.
y = to_categorical(sequences[:, -1], num_classes=total_words, dtype='int8')

## Define Model RNN/LSTM

In [145]:
# Next-word model: embedding -> single LSTM -> softmax over the vocabulary.
# The input length is max_sequence_len-1 because the last token of every padded
# sequence is split off as the target.
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

# Alternative, deeper architecture kept for reference only (not used).
# Note that vocab_size and seq_length are not defined in this notebook.
#model = Sequential()
#model.add(Embedding(vocab_size, 50, input_length=seq_length))
#model.add(LSTM(100, return_sequences=True))
#model.add(LSTM(100))
#model.add(Dense(100, activation='relu'))
#model.add(Dense(vocab_size, activation='softmax'))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 29, 10)            25180     
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_7 (Dense)              (None, 2518)              128418    
Total params: 165,798
Trainable params: 165,798
Non-trainable params: 0
_________________________________________________________________
None


## Compile Model with Categorical Cross-Entropy Loss

In [0]:
# Targets are one-hot vectors, hence the categorical cross-entropy loss.
model.compile(
  loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

## Fit Model

> "You will get different results, but perhaps an accuracy of just over **50%** of predicting the next word in the sequence, which is not bad. We are not aiming for 100% accuracy (e.g. a model that memorized the text), but rather a model that captures the essence of the text."

https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [0]:
# Train on the full set of padded n-gram sequences with per-epoch progress output.
model.fit(
  X,
  y,
  epochs=350,
  verbose=1,
)

## Save Model

In [0]:
# Persist the trained network and the fitted tokenizer side by side; both are
# needed later to generate text.
model.save('model_5.h5')
# Use a context manager so the pickle file is flushed and closed deterministically.
with open('tokenizer_5.pkl', 'wb') as tokenizer_file:
  dump(tokenizer, tokenizer_file)

# Generate Text using Neural Language Model

## Load Model

In [0]:
# Fetch the published model and tokenizer files that the next cell loads by name.
# NOTE(review): the "Save Model" cell above writes files with these same names; confirm
# which copy (freshly trained or downloaded) is meant to be loaded when both cells run.
!wget https://raw.githubusercontent.com/ichaparroc/Word-Level-Neural-Languaje-Model-for-Generate-Text/master/model_5.h5
!wget https://raw.githubusercontent.com/ichaparroc/Word-Level-Neural-Languaje-Model-for-Generate-Text/master/tokenizer_5.pkl

In [0]:
# Restore the trained network and the tokenizer it was trained with.
model = load_model('model_5.h5')
# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load a tokenizer pickle that comes from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('tokenizer_5.pkl', 'rb') as tokenizer_file:
  tokenizer = load(tokenizer_file)

## Define the "Recursive" (Autoregressive) Function to Generate Text

In [0]:
def generate_text(seed_text,next_words,max_sequence_len, model):
  """Extend ``seed_text`` with ``next_words`` predicted words, one at a time.

  On every step the text generated so far is encoded with the module-level
  ``tokenizer``, left-padded (or truncated) to ``max_sequence_len-1`` tokens,
  and fed to ``model``; the most probable word is appended and the loop repeats.

  Args:
    seed_text: Initial text that the generation starts from.
    next_words: Number of words to append.
    max_sequence_len: Sequence length used at training time (context + target);
      the model input is one token shorter.
    model: Trained model whose ``predict`` returns one score per vocabulary index.

  Returns:
    ``seed_text`` followed by the generated words, separated by single spaces.
    If the predicted index has no word in the vocabulary, an empty word is appended.
  """
  # Invert the vocabulary once instead of scanning word_index for every generated word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes is no longer available in recent Keras releases;
    # taking the argmax of predict() yields the same class index.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    output_word = index_to_word.get(predicted, "")
    seed_text += " " + output_word
  return seed_text

## Enjoy The Generation Of Text :D

In [185]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="en la calle",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

en la calle pera 1000 anos en septiembre la historias de mi casa y me equivoque creo que se beba la vida del estudiante que algun 13 octobre


In [150]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="todos vamos a",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

todos vamos a la 9 am a la cama y tampoco esta que me duele el huevo me creen que la verdaderos blink siempre solo a 100 con


In [152]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="estoy viendo",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

estoy viendo el dia re raro no fui del colectivo a la casa me bianca gente lo mas bueno que te ventilador pensados bien o me duele


In [153]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="cada vez que",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

cada vez que madrugo mucho al dia mas huevos que la cube para confirmar la maleta participaron en el palo en camiseta de 091 en leroy que buenos


In [159]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="como no",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

como no me maltraten el pelo morado y evoluciona a turquesa en do suceda de parecer un completo alcoholico amaia y alfred estoy despierta a la 430


In [164]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="sigo en",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

sigo en la mejor godo de la casa y me voy a la casa y me encanta la vuelta siempre me encanta la vuelta siempre me da


In [176]:
# Generate 25 words continuing the given seed phrase.
text = generate_text(
  seed_text="estoy con",
  next_words=25,
  max_sequence_len=max_sequence_len,
  model=model,
)
print(text)

estoy con la mina mas linda que me duele el huevo me entretiene cereales internacionales era postureo y creo que cambio cada 2x3 sistersorpresa miercoles ven a
