In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def tokenize_and_pad(cleaned_df):
    """Turn the ``'text'`` column of a DataFrame into next-word training data.

    A fresh ``Tokenizer`` is fitted on the column. Every text is then expanded
    into all of its prefixes of length >= 2 (in tokens), the prefixes are
    left-padded to a common length, and the final token of each padded row is
    split off as the one-hot label.

    Args:
        cleaned_df: DataFrame with a ``'text'`` column.

    Returns:
        Tuple ``(predictors, label, tokenizer, input_sequences, max_sequence_len)``:
        the padded rows without their last column, the one-hot encoded last
        column, the fitted tokenizer, the full padded array, and the length of
        the longest prefix.
    """
    texts = cleaned_df.loc[:, 'text']

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    # +1 because the one-hot width must also cover index 0 (used by the padding).
    vocab_size = len(tokenizer.word_index) + 1

    # Collect every prefix with at least two tokens: [t0, t1], [t0, t1, t2], ...
    input_sequences = []
    for text in texts:
        encoded = tokenizer.texts_to_sequences([text])[0]
        input_sequences.extend(
            encoded[:end] for end in range(2, len(encoded) + 1)
        )

    # Left-pad so that the label is always the last column.
    max_sequence_len = max(map(len, input_sequences))
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    input_sequences = np.array(padded)

    # Everything but the last token predicts the last token.
    predictors = input_sequences[:, :-1]
    label = ku.to_categorical(input_sequences[:, -1], num_classes=vocab_size)

    return predictors, label, tokenizer, input_sequences, max_sequence_len

In [3]:
def create_and_compile_model(tokenizer, input_sequences):
    """Build and compile the next-word prediction network.

    Layers, in order: Embedding (100 dims) -> Bidirectional LSTM (150 units,
    returning sequences) -> Dropout (0.2) -> LSTM (100 units) -> Dense with
    ReLU and L2 regularisation (half the vocabulary size) -> Dense softmax
    over the vocabulary. Compiled with categorical cross-entropy, the
    ``'adam'`` optimizer and the accuracy metric. The model summary is printed
    as a side effect.

    Args:
        tokenizer: Fitted tokenizer; only ``len(tokenizer.word_index)`` is
            used, to size the vocabulary.
        input_sequences: Collection of token sequences; the longest one fixes
            the model input length (longest length minus one).

    Returns:
        The compiled ``Sequential`` model.
    """
    total_words = len(tokenizer.word_index) + 1
    max_sequence_len = max(len(x) for x in input_sequences)

    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(150, return_sequences = True)))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    # Integer division: a layer needs a whole number of units, and `/` would
    # hand Dense a float in Python 3.
    model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line to the output.
    model.summary()

    return model

In [4]:
def plot(history):
    """Plot the training accuracy and training loss curves, one figure each.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch lists ``'accuracy'`` and ``'loss'`` (as returned by
            ``model.fit`` in this notebook).
    """
    import matplotlib.pyplot as plt

    acc = history.history['accuracy']
    loss = history.history['loss']
    epochs = range(len(acc))

    curves = (
        (acc, 'Training accuracy', 'Training accuracy', 'Accuracy'),
        (loss, 'Training Loss', 'Training loss', 'Loss'),
    )
    for values, label, title, ylabel in curves:
        # One explicit figure/axes per curve so that each gets its own legend
        # and axis labels (a single trailing plt.legend() only reaches the
        # figure that is current at that point).
        _, ax = plt.subplots()
        ax.plot(epochs, values, 'b', label=label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()

    plt.show()

In [5]:
def predict_text(model, tokenizer, max_sequence_len, seed_text="", next_words=20):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On every step the current text is tokenized, left-padded (or truncated) to
    ``max_sequence_len - 1`` tokens, fed to ``model.predict``, and the word
    with the highest score is appended after a single space.

    Args:
        model: Model exposing ``predict``; its output is arg-maxed over the
            last axis to obtain a word index.
        tokenizer: Tokenizer exposing ``texts_to_sequences`` and ``word_index``.
        max_sequence_len: Sequence length used at training time, including the
            label token.
        seed_text: Text to start from.
        next_words: Number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words. If a predicted index
        has no entry in ``tokenizer.word_index``, an empty word is appended
        (i.e. only the separating space).
    """
    # Build the index -> word table once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 keeps Keras from printing a progress bar for every word.
        scores = model.predict(token_list, verbose=0)
        # The batch holds one sequence; take its arg-max as a plain int so the
        # lookup does not rely on comparing an int with a 1-element array.
        predicted = int(np.argmax(scores, axis=-1)[0])
        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [6]:
def save_model(model_name,model):
    """Save ``model`` by calling ``model.save`` with ``model_name`` rendered as a string."""
    destination = f'{model_name}'
    model.save(destination)

In [7]:
def save_tokenizer(tokenizer_name, tokenizer):
    """Pickle ``tokenizer`` to the file ``<tokenizer_name>_tokenizer.pkl``."""
    import pickle

    target_path = f'{tokenizer_name}_tokenizer.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(tokenizer, handle)

In [8]:
# Kaggle input data: the same pre-processed corpus at increasing sizes.
# Every file is read with identical options and then cast to strings.
read_opts = dict(low_memory=False, lineterminator='\n', index_col=0)

top10k = pd.read_csv('../input/nlp-final-project/pre_processed_top10k.csv', **read_opts).astype('str')
top50k = pd.read_csv('../input/nlp-final-project/pre_processed_top50k.csv', **read_opts).astype('str')
top100k = pd.read_csv('../input/nlp-final-project/pre_processed_top100k.csv', **read_opts).astype('str')
top1m = pd.read_csv('../input/nlp-final-project/pre_processed_top1m.csv', **read_opts).astype('str')

# Not loaded: pre_processed_top5m.csv and pre_processed_top10m.csv are too
# resource heavy and break the cloud session.

In [9]:
# (name, DataFrame) pairs, smallest corpus first; the name is reused as the
# file name stem when saving models and tokenizers.
# 'top5m' and 'top10m' are left out because they break the cloud session.
dfs = [
    ('top10k', top10k),
    ('top50k', top50k),
    ('top100k', top100k),
    ('top1m', top1m),
]

In [10]:
# Peek at the first five rows of the smallest dataset.
dfs[0][1].head()

In [11]:
# Train one model per dataset, then persist the model and its tokenizer under
# the dataset name. The maximum sequence length of each dataset is collected
# because it is needed again when generating text.
max_seq_list = []
for name, frame in tqdm(dfs):
    predictors, labels, tokenizer, input_sequences, max_sequence_len = tokenize_and_pad(frame)
    model = create_and_compile_model(tokenizer, input_sequences)
    history = model.fit(predictors, labels, epochs=300, verbose=0)
    plot(history)
    save_model(name, model)
    save_tokenizer(name, tokenizer)
    max_seq_list.append(max_sequence_len)

Inference

In [12]:
import tensorflow as tf

# Reload the saved models in the same order as `dfs`, so the two lists can be
# zipped together later. './top5m' and './top10m' do not exist because those
# datasets break the cloud session.
models = [
    tf.keras.models.load_model(path)
    for path in ('./top10k', './top50k', './top100k', './top1m')
]
loadedtop10k, loadedtop50k, loadedtop100k, loadedtop1m = models

In [32]:
# Prompt that each loaded model is asked to continue during inference.
seed_text = "no to war yes to peace"

In [33]:
import pickle

# For every trained model: restore its tokenizer, continue the seed text by
# 20 words and print the result under the dataset name.
for (name, _frame), model, max_seq in zip(dfs, models, max_seq_list):
    # The pickle files were written by save_tokenizer earlier in this
    # notebook; never unpickle files from an untrusted source.
    with open(f'{name}_tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)
    predicted_text = predict_text(model, tokenizer, max_seq, seed_text=seed_text, next_words=20)
    print(f'{name}\n{predicted_text}\n\n')

In [15]:
import json

# Persist the dataset-name -> max sequence length mapping next to the models.
names = [dataset_name for dataset_name, _frame in dfs]
with open('seq_dict.json', 'w') as f:
    json.dump(dict(zip(names, max_seq_list)), f)

In [16]:
cd /kaggle/working

In [17]:
# Recursively archive the current directory into outputs.zip.
! zip -r outputs.zip .