In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional
import numpy as np
import sys
import io
import os
import codecs
import operator

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
text_in_words=[]
MIN_WORD_FREQUENCY = 12
vocabulary = "vocabulary.txt"

In [4]:
def get_corpus(corpus_filename,isJoke,Tag=""):
    with open(corpus_filename) as myfile:
        head = [next(myfile) for x in range(2000)]
    size= len(head)
    totalline =[""]*size
    for i in range(size):
        if isJoke == True:
            x = head[i].split(',',1)
        else:
            x = head[i].split('\t')
            
        myline = x[1].replace(".", " .")
        myline = x[1].replace("\n", "  \n")
        totalline[i]= Tag + myline
    return totalline

def get_text_in_words(tline):
    get_this_text_in_words=[]
    for line in tline:
        word = line.split(' ')
        for eachword in word:
            if eachword !='':
                get_this_text_in_words.append(eachword)
    return get_this_text_in_words
                
def print_vocabulary(words_file_path, words_set):
    words_file = codecs.open(words_file_path, 'w', encoding='utf8')
    for w in words_set:
        if w != "\n":
            words_file.write(w+"\n")
        else:
            words_file.write(w)
    words_file.close()


In [5]:
#get all dataset 
quotedataset= get_corpus("author-quote.txt",False)
jokedataset= get_corpus("shortjokes.csv",True)
 
#text_in_words = quotedataset + jokedataset


In [6]:
#fill up the text_in_words with all dataset
quote_text_in_words= get_text_in_words(quotedataset)
joke_text_in_words= get_text_in_words(jokedataset)
text_in_words = quote_text_in_words+ joke_text_in_words +quote_text_in_words+ joke_text_in_words

In [7]:
 # Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

In [8]:
# to check the freq count 
sorted_x = sorted(word_freq.items(), key=operator.itemgetter(1))
sorted_x.reverse()

# to print out the freq only. 
#sorted_x



In [9]:
sorted_x

[('\n', 8000),
 ('the', 6168),
 ('a', 5150),
 ('to', 4470),
 ('I', 3824),
 ('and', 3396),
 ('of', 3358),
 ('is', 2590),
 ('in', 2434),
 ('you', 1964),
 ('that', 1600),
 ('my', 1274),
 ('have', 1216),
 ('for', 1128),
 ('it', 1124),
 ('with', 1016),
 ('do', 1016),
 ('be', 1016),
 ('are', 970),
 ('was', 930),
 ('on', 846),
 ('not', 840),
 ('The', 708),
 ('but', 690),
 ('like', 642),
 ('they', 634),
 ('as', 614),
 ('your', 602),
 ('we', 586),
 ('"What', 564),
 ('can', 558),
 ('an', 556),
 ("I'm", 554),
 ('who', 552),
 ('at', 544),
 ('when', 532),
 ('me', 532),
 ("don't", 528),
 ('just', 528),
 ('people', 518),
 ('all', 506),
 ('or', 500),
 ('about', 488),
 ('so', 484),
 ('one', 480),
 ('what', 444),
 ('get', 432),
 ('"I', 424),
 ('from', 404),
 ('think', 394),
 ('this', 390),
 ('more', 388),
 ('by', 384),
 ('has', 366),
 ('if', 364),
 ('than', 358),
 ('up', 356),
 ('out', 354),
 ('will', 354),
 ('his', 346),
 ('know', 342),
 ('A', 340),
 ("it's", 330),
 ('-', 328),
 ('because', 326),
 ('he

In [10]:
ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

In [11]:
words = set(text_in_words)
words.add("#@Joke")
words.add("#@Quote")
words.add("#@NoTag")

print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
#words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))
print_vocabulary(vocabulary, words)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

Unique words before ignoring: 17285
Ignoring words with frequency < 12
Unique words after ignoring: 17285


In [12]:
SEQUENCE_LEN =3
MY_SEQUENCE_LEN =2
STEP =1
# cut the text in semi-redundant sequences of SEQUENCE_LEN words


def processXandY(myText_in_words,tag=""):
    sentences = []
    next_words = []
    ignored = 0
    for i in range(0, len(myText_in_words) - MY_SEQUENCE_LEN, STEP):
        # Only add the sequences where no word is in ignored_words
        if len(set(text_in_words[i: i+MY_SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
            mysentences = []
            mysentences.append(tag)
            for myword in myText_in_words[i: i + MY_SEQUENCE_LEN]:
                mysentences.append(myword)
            #sentences.append(myText_in_words[i: i + SEQUENCE_LEN])
            sentences.append(mysentences)
            next_words.append(myText_in_words[i + MY_SEQUENCE_LEN])
        else:
            ignored = ignored + 1
    print('Ignored sequences:', ignored)
    print('Remaining sequences:', len(sentences))
    return sentences,next_words

            
 

In [13]:
Joke_Tag_Sentences,Joke_Tag_Next_Words =processXandY(joke_text_in_words,"#@Joke")
Joke_NoTag_Sentences,Joke_NoTag_Next_Words =processXandY(joke_text_in_words,"#@NoTag")
quote_Tag_Sentences,quote_Tag_Next_Words =processXandY(quote_text_in_words,"#@Quote")
quote_NoTag_Sentences,quote_NoTag_Next_Words =processXandY(quote_text_in_words,"#@NoTag")

sentences = Joke_Tag_Sentences + Joke_NoTag_Sentences + quote_Tag_Sentences + quote_NoTag_Sentences
next_words = Joke_Tag_Next_Words + Joke_NoTag_Next_Words + quote_Tag_Next_Words + quote_NoTag_Next_Words

Ignored sequences: 20485
Remaining sequences: 16399
Ignored sequences: 20485
Remaining sequences: 16399
Ignored sequences: 27236
Remaining sequences: 22329
Ignored sequences: 27236
Remaining sequences: 22329


In [14]:
#sentences

In [15]:
#quote_NoTag_Sentences 

In [16]:
# =================end of preprocesing ===============================

In [39]:
examples = "examples2.txt"
BATCH_SIZE =64
if not os.path.isdir('./checkpoints2/'):
    os.makedirs('./checkpoints2/')

In [40]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=0.1):
    # shuffle at unison
    print('Shuffling sentences')
    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])
    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]
    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)


In [41]:
# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, 3, len(words)), dtype=np.bool)
        y = np.zeros((batch_size, len(words)), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index % len(sentence_list)]]] = 1
            index = index + 1
        yield x, y

In [42]:
def get_model(dropout=0.2):
    print('Build model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, len(words))))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model

In [43]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [44]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)
    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]
    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))
        for i in range(50):
            x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            sentence = sentence[1:]
            sentence.append(next_word)
            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [45]:
  # x, y, x_test, y_test
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(
    sentences, next_words
)

Shuffling sentences
Size of training set = 77222
Size of test set = 78


In [46]:
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

Build model...


In [47]:
file_path = "./checkpoints2/LSTM_try-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
                "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % \
                (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

In [48]:
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]

In [49]:
examples_file = open(examples, "w")
model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


KeyboardInterrupt: 

In [50]:
from keras.models import load_model

model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
#model = load_model('my_model.h5')

In [80]:
def predict_nex(sentence,diversity):
    x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
    for t, word in enumerate(sentence):
        x_pred[0, t, word_indices[word]] = 1.
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, diversity)
    next_word = indices_word[next_index]
    return next_word
 
#sentence = ["#@Joke", "I","am"]
#predicted =predict_nex(sentence)

def printSentence(sentence):
    fullSentence =""
    for word in sentence:
        fullSentence =fullSentence +" " + word
    print(fullSentence)

In [81]:
sentence = ["I","am"]
cat ="#@Joke"
def predict_Sentence(sentence,cat):
    for i in range(50):
        threeWords= []
        threeWords.append(cat)
        threeWords.append(sentence[len(sentence)-2])
        threeWords.append(sentence[len(sentence)-1])
        sentence.append(predict_nex(threeWords,0.3))
    printSentence(sentence)

    


In [84]:
predict_Sentence(["I","am"],"#@Joke")

 I am a rock. I can't tell you a jar of jelly beans and when you've got to bring sunscreen the border and frisking, it it is to trust this person is mute? When you see a glass of water on lap*" 
 "Fun a beer" 
 "What do you call a


In [85]:
predict_Sentence(["I","am"],"#@Joke")

 I am a rock. I can't tell you a jar of jelly beans and when you've got to bring sunscreen the year, I'd like to be a real problem in the FDA bank" 
 "What do you call a small, noisy keeps my fruit-picking business trapped in a Mexican restaurant an actor


In [86]:
predict_Sentence(["I","am"],"#@Quote")


 I am more than you can be a real problem in English football and, in the world to revolve around them." 
 "What do you call a body builder? Jim." 
 "I just met someone as the strength of money as dollars into its own life is still there, but the idea


In [87]:
predict_Sentence(["I","am"],"#@NoTag")

 I am a rock. I can't wait for the Conservatives." 
 "How do you call a body builder? Jim." 
 "I don't trust anything in the world to revolve around them." 
 "What do you call a small, noisy the cause of success and is the greatest plenty in the world we
