# Next word prediction using LSTM

- https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
- https://towardsdatascience.com/next-word-prediction-with-nlp-and-deep-learning-48b9fe0a17bf

In [1]:
import numpy as np
import pylab as plt

import json 
import pickle 

# NLP imports
import re
import nltk, gensim
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

data_dir = '../nlp_datasets/'

In [3]:
def load_text_data():
    file = open(data_dir + 'Metamorphosis_Franz_Kafka.txt', 'r', encoding = "utf8")
    doc = ''
    for line in file.readlines():
        doc = doc + line
    return doc

def preprocess_text(raw_doc=None, return_raw_data=False):
    
    if raw_doc is None:
        print ("Loading the Full text.")
        raw_doc = load_text_data()
    
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\n]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))

    doc = raw_doc.lower()
    doc = REPLACE_BY_SPACE_RE.sub(' ',doc)
    doc = BAD_SYMBOLS_RE.sub('', doc)
    doc = ' '.join([word for word in doc.split() if word not in STOPWORDS])
    
    # remove the different form of the same word
    doc = doc.split(' ')
    stemmer = PorterStemmer()
    doc = [stemmer.stem(d) for d in doc]
    
    if return_raw_data:
        return raw_doc, doc
    return doc

data = preprocess_text()

print (data[:10])
    

Loading the Full text.
['project', 'gutenberg', 'ebook', 'metamorphosi', 'franz', 'kafka', 'translat', 'david', 'wylli', 'ebook']


In [4]:
raw_data = preprocess_text(return_raw_data=True)[0]
raw_data[:100]

Loading the Full text.


'\ufeffThe Project Gutenberg EBook of Metamorphosis, by Franz Kafka\nTranslated by David Wyllie.\n\nThis eBoo'

In [16]:
def prepare_word_sequence(doc, n_steps=5):
    X, y = list(), list()
    for i in range(len(doc)-n_steps-1):
        X.append(doc[i:i+n_steps])
        y.append(doc[i+n_steps])
    return (X, y)

(X,y) =prepare_word_sequence(data)
for i in range(5):
    print (X[i], y[i])

['project', 'gutenberg', 'ebook', 'metamorphosi', 'franz'] kafka
['gutenberg', 'ebook', 'metamorphosi', 'franz', 'kafka'] translat
['ebook', 'metamorphosi', 'franz', 'kafka', 'translat'] david
['metamorphosi', 'franz', 'kafka', 'translat', 'david'] wylli
['franz', 'kafka', 'translat', 'david', 'wylli'] ebook


## Tokenize the data

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df =5, max_df=0.9, ngram_range=(1,2), token_pattern='(\S+)')
X=vectorizer.fit_transform(X)

AttributeError: 'list' object has no attribute 'lower'

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
model = LogisticRegression(penalty='l2', C=10, random_state=0, solver='liblinear')
model.fit(X, y_train)

In [6]:
def prepare_sequence(doc, n_steps=16):
    tokenizer = Tokenizer(num_words=10000)from sklearn.feature_extraction.text import TfidfVectorizer
    tokenizer.fit_on_texts([doc])
    vocab_size = len(tokenizer.word_index) + 1
    sequences = tokenizer.texts_to_sequences([doc_words])[0]
    
    X, y = list(), list()
    for i in range(len(sequences)-n_steps-1):
        X.append(sequences[i:i+n_steps])#i+n_steps])
        y.append(sequences[i+n_steps])
    return (np.array(X), np.array(y), tokenizer, vocab_size)

n_steps = 4
(X, y, tokenizer, vocab_size ) = prepare_sequence(data, n_steps=n_steps)
#y = to_categorical(y, num_classes=vocab_size)
print ( X.shape, y.shape)

NameError: name 'Tokenizer' is not defined

In [None]:
# train-test split
from sklearn.model_selection import train_test_split

X_train_, X_test, y_train_, y_test = train_test_split(X, y, test_size=0.2, random_state=8848)
X_train,  X_val,  y_train , y_val = train_test_split(X_train_, y_train_,
                                                     test_size=0.25, random_state=8848)
44xc
print (f"X_train.shape: {X_train.shape} y_train.shape:{y_train.shape}" )
print (f"X_test.shape: {X_test.shape} y_test.shape:{y_test.shape}" )
print (f"X_val.shape: {X_val.shape} y_val.shape:{y_val.shape}" )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    # max_df and min_df is to filter out too frequent and too rare words
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),
                                 max_df=0.9, min_df=5,
                                 token_pattern='(\S+)' )
    
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    return (X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_ )