In [65]:
import numpy as np
import pylab as plt

import json 
import pickle 

# NLP imports
import re
import nltk, gensim
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

data_dir = '../nlp_datasets/'

In [2]:
def load_text_data():
    file = open(data_dir + 'Metamorphosis_Franz_Kafka.txt', 'r', encoding = "utf8")
    doc = ''
    for line in file.readlines():
        doc = doc + line
    return doc

def preprocess_text(raw_doc=None, return_raw_data=False):
    
    if raw_doc is None:
        print ("Loading the Full text.")
        raw_doc = load_text_data()
    
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\n]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))

    doc = raw_doc.lower()
    doc = REPLACE_BY_SPACE_RE.sub(' ',doc)
    doc = BAD_SYMBOLS_RE.sub('', doc)
    doc = ' '.join([word for word in doc.split() if word not in STOPWORDS])
    
    # remove the different form of the same word
    doc = doc.split(' ')
    stemmer = PorterStemmer()
    doc = [stemmer.stem(d) for d in doc]
    
    if return_raw_data:
        return raw_doc, doc
    return doc

data = preprocess_text()

print (data[:10])

Loading the Full text.
['project', 'gutenberg', 'ebook', 'metamorphosi', 'franz', 'kafka', 'translat', 'david', 'wylli', 'ebook']


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df =5, max_df=0.9, ngram_range=(1,2), token_pattern='(\S+)')
X=vectorizer.fit_transform(data)

In [15]:
for k,v in list(vectorizer.vocabulary_.items())[:10]:
    print (f"word: {k} \t Frequency:{v}")

word: project 	 Frequency:407
word: gutenberg 	 Frequency:234
word: ebook 	 Frequency:149
word: metamorphosi 	 Frequency:333
word: david 	 Frequency:115
word: wylli 	 Frequency:588
word: use 	 Frequency:550
word: anyon 	 Frequency:23
word: cost 	 Frequency:102
word: almost 	 Frequency:13


In [17]:
tokenizer = Tokenizer(num_words=10000, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n', lower=True)
tokenizer.fit_on_texts(data)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 2252 unique tokens.


In [19]:
for k,v in list(word_index.items())[:10]:
    print (f"Token: {k} \t index:{v}")

Token: gregor 	 index:1
Token: would 	 index:2
Token: room 	 index:3
Token: could 	 index:4
Token: work 	 index:5
Token: even 	 index:6
Token: father 	 index:7
Token: sister 	 index:8
Token: door 	 index:9
Token: mother 	 index:10


In [83]:
tokenizer.index_word[11]# 51] 118]

'project'

In [161]:
def prepare_sequence(data, n_steps=16, num_words=10):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    vocab_size = len(tokenizer.word_index) + 1
    seq = tokenizer.texts_to_sequences(data)
    
    # the following is same as flatten() but for lists with different lengths
    import operator
    from functools import reduce
    seq = reduce(operator.concat, seq)
    #seq = seq.flatten()
    #seq = np.array(seq).flatten()

    X, y = list(), list()
    for i in range(len(seq)-n_steps-1):
        X.append(seq[i:i+n_steps])
        #X.append(seq[i])
        y.append(seq[i+n_steps])
    return (np.array(X), np.array(y), tokenizer, vocab_size)

n_steps = 16;
num_words=100
(X, y, tokenizer, vocab_size) = prepare_sequence(data, n_steps, num_words)

print (f"X.shape: {X.shape}, y.shape: {y.shape}")

X.shape: (4384, 16), y.shape: (4384,)


In [162]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

print (f"X_train.shape: {X_train.shape} y_train.shape:{y_train.shape}" )
print (f"X_test.shape: {X_test.shape} y_test.shape:{y_test.shape}" )


X_train.shape: (3507, 16) y_train.shape:(3507,)
X_test.shape: (877, 16) y_test.shape:(877,)


In [163]:
print (max(y_train))

99


Printing some samples from X_train and y_train

In [164]:
for i in range(5):
    print (f"X: {X_train[i]} \t y: {y_train[i]}")

X: [85 50 13 15 50  4 92  2 27 31  4 20  4 82 37  6] 	 y: 1
X: [57 11 51 11 17 62 66 11 17  5  5 11 51 11 51 66] 	 y: 87
X: [ 2 23  7  2 93  9  2 91 37  7 13 76 13  1 10 59] 	 y: 25
X: [23 28  1 28  1  2 92  2 10 54 36 16 36  4 59 34] 	 y: 8
X: [14 46 44 21 39 41 30 62 93 83 29 39 41 35 28  7] 	 y: 9


The actual words can be found from the `tokenizer.index_word`

In [165]:
for i in range(5):
    print (f"X: {X_train[i]} \t y: {y_train[i]}")
    print ("X:", [tokenizer.index_word[X_train[i][j]] for j in range(len(X_train[i]))], "\ty:", tokenizer.index_word[y_train[i]]) 

X: [85 50 13 15 50  4 92  2 27 31  4 20  4 82 37  6] 	 y: 1
X: ['help', 'famili', 'time', 'look', 'famili', 'could', 'think', 'would', 'want', 'made', 'could', 'get', 'could', 'take', 'thing', 'even'] 	y: gregor
X: [57 11 51 11 17 62 66 11 17  5  5 11 51 11 51 66] 	 y: 87
X: ['state', 'project', 'gutenberg', 'project', 'gutenbergtm', 'must', 'copi', 'project', 'gutenbergtm', 'work', 'work', 'project', 'gutenberg', 'project', 'gutenberg', 'copi'] 	y: distribut
X: [ 2 23  7  2 93  9  2 91 37  7 13 76 13  1 10 59] 	 y: 25
X: ['would', 'go', 'father', 'would', 'say', 'door', 'would', 'slowli', 'thing', 'father', 'time', 'long', 'time', 'gregor', 'mother', 'everyth'] 	y: first
X: [23 28  1 28  1  2 92  2 10 54 36 16 36  4 59 34] 	 y: 8
X: ['go', 'see', 'gregor', 'see', 'gregor', 'would', 'think', 'would', 'mother', 'came', 'day', 'one', 'day', 'could', 'everyth', 'much'] 	y: sister
X: [14 46 44 21 39 41 30 62 93 83 29 39 41 35 28  7] 	 y: 9
X: ['way', 'mr', 'samsa', 'said', 'chief', 'clerk'

In [166]:
# change y to categorical variable
y_train = to_categorical(y_train, num_classes=num_words)
y_test  = to_categorical(y_test,  num_classes=num_words)

print (f"X_train.shape: {X_train.shape} y_train.shape:{y_train.shape}" )
print (f"X_test.shape: {X_test.shape} y_test.shape:{y_test.shape}" )

X_train.shape: (3507, 16) y_train.shape:(3507, 100)
X_test.shape: (877, 16) y_test.shape:(877, 100)


## Model Building

In [167]:
def make_model(X, y, embedding_length = 16):
    model = Sequential()
    model.add(Embedding(100, embedding_length, input_length=X.shape[1]))
    model.add(LSTM(32))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = make_model(X_train, y_train, embedding_length = 16)
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 16, 16)            1600      
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                6272      
_________________________________________________________________
dense_9 (Dense)              (None, 100)               3300      
Total params: 11,172
Trainable params: 11,172
Non-trainable params: 0
_________________________________________________________________


In [168]:
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test), 
                    epochs=5,
                    batch_size=64,
                    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
