# Session 8 - Language modelling with RNNs (Text Generation)

In [1]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# suppress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(
    action = 'ignore', 
    category = FutureWarning)

2023-03-30 13:09:24.789398: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-30 13:09:24.836591: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-30 13:09:24.837964: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Some helper functions

In [2]:
def clean_text(txt):
    """Normalise a text: strip punctuation, lowercase, drop non-ASCII characters.

    Parameters
    ----------
    txt : str
        The raw text to clean.

    Returns
    -------
    str
        ``txt`` without any character listed in ``string.punctuation``,
        converted to lowercase, and with every non-ASCII character removed.
    """
    # keep only the characters that are not ASCII punctuation marks
    kept_chars = [char for char in txt if char not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # round-trip through UTF-8 bytes: decoding those bytes as ASCII with
    # errors ignored silently discards every non-ASCII character
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", "ignore")

def get_sequence_of_tokens(tokenizer, corpus):
    """Turn every text into the list of its growing token-id prefixes.

    Parameters
    ----------
    tokenizer : object
        Anything exposing ``texts_to_sequences(list_of_texts)``; it is called
        once per text with a one-element list.
    corpus : iterable of str
        The texts to convert.

    Returns
    -------
    list of list of int
        For a text tokenised as ``[a, b, c]`` the prefixes ``[a, b]`` and
        ``[a, b, c]`` are emitted. Texts with fewer than two tokens contribute
        nothing. The last element of each prefix is the word that follows the
        elements before it.
    """
    input_sequences = []
    for line in corpus:
        # integer ids for the words of this single text
        token_ids = tokenizer.texts_to_sequences([line])[0]
        # every prefix of length 2 .. len(token_ids), shortest first
        input_sequences.extend(
            token_ids[:end] for end in range(2, len(token_ids) + 1)
        )
    return input_sequences

def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences, e.g. the output of ``get_sequence_of_tokens()``.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the notebook-level
        ``total_words`` variable is used, which was the only behaviour before
        this parameter existed.

    Returns
    -------
    predictors : numpy.ndarray
        Every padded sequence without its last token.
    label : numpy.ndarray
        One-hot encoding of the last token of every padded sequence.
    max_sequence_len : int
        Length of the longest input sequence (the padded width).

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # max() would raise ValueError here anyway; give a clearer message
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        # fall back to the vocabulary size defined at notebook level
        num_classes = total_words

    # get the length of the longest sequence
    max_sequence_len = max(len(seq) for seq in input_sequences)
    # make every sequence the length of the longest one by adding zeros in front
    padded = np.array(pad_sequences(input_sequences,
                                    maxlen=max_sequence_len,
                                    padding='pre'))

    # the last column is the word to predict, everything before it is the input
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label,
                              num_classes=num_classes)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    """Build and compile a small LSTM next-word prediction model.

    Parameters
    ----------
    max_sequence_len : int
        Maximum length of a text sequence.
    total_words : int
        Total number of unique words in the vocabulary of the text corpus.

    Returns
    -------
    A compiled ``Sequential`` model: embedding -> LSTM -> dropout -> softmax.
    """
    # the model input is one token shorter than the full sequence length
    input_len = max_sequence_len - 1

    layer_stack = [
        # dense vector representation for each input word:
        # vocabulary size, 10-dimensional vector space, fixed input length
        Embedding(total_words, 10, input_length=input_len),
        # hidden layer with 100 memory cells
        LSTM(100),
        # dropout (rate 0.1) after the LSTM layer to reduce overfitting
        Dropout(0.1),
        # probability distribution over the vocabulary of possible next words
        Dense(total_words, activation='softmax'),
    ]

    # the layers are stacked one after the other, in the order listed above
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly predicting the most likely next word.

    Uses the notebook-level ``tokenizer`` to convert between words and indices.

    Parameters
    ----------
    seed_text : str
        The starting text for text generation.
    next_words : int
        The number of words to generate after the seed text.
    model : trained Keras model
        Called as ``model.predict(padded_tokens, verbose=0)``.
    max_sequence_len : int
        Maximum sequence length the model was trained on; the model input is
        padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns
    -------
    str
        The seed text followed by the generated words, in title case.
    """
    # reverse lookup (index -> word), built once instead of scanning
    # tokenizer.word_index again for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        # convert the text generated so far to tokens
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # add zeros before the sequence to match the model's input length
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        # verbose=0 keeps Keras from printing a progress bar per generated word
        probabilities = model.predict(token_list, verbose=0)
        # the batch holds a single sequence, so take its argmax as a plain int
        predicted = int(np.argmax(probabilities, axis=1)[0])
        # an index without a word (no entry in word_index) adds an empty string
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

## Load the data

In [3]:
# relative location of the news data folder: three directory levels up,
# then into "431868/news_data"
data_dir = os.path.join(os.pardir, os.pardir, os.pardir, "431868", "news_data")

In [5]:
# preview one of the article files to see which columns are available
pd.read_csv("/".join([data_dir, "ArticlesApril2017.csv"]))

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuela’s Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislature’s aut...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,,58fd41ab7c459f24986dbaa7,710,By ANDREW E. KRAMER,article,Reporting on Gays Who ‘Don’t Exist’,"['Chechnya (Russia)', 'Homosexuality and Bisex...",3,Insider,2,2017-04-24 00:07:04,Unknown,"“I see flies, I see mosquitoes,” said a Cheche...",The New York Times,News,https://www.nytimes.com/2017/04/23/insider/rus...
882,,58fd45a17c459f24986dbaaa,1230,By MATT FLEGENHEIMER and THOMAS KAPLAN,article,The Fights That Could Lead to a Government Shu...,"['Trump, Donald J', 'United States Politics an...",3,National,15,2017-04-24 00:23:53,Politics,The Trump administration wants to use the dead...,The New York Times,News,https://www.nytimes.com/2017/04/23/us/politics...
883,,58fd5c2c7c459f24986dbac3,1424,By NOEL MURRAY,article,"‘The Leftovers’ Season 3, Episode 2: Swedish P...","['Television', 'The Leftovers (TV Program)']",3,Culture,0,2017-04-24 02:00:04,Television,"For all its melancholy, “The Leftovers” rarely...",The New York Times,Review,https://www.nytimes.com/2017/04/23/arts/televi...
884,,58fd5c3d7c459f24986dbac4,1052,By BEN BRANTLEY,article,"Thinking Out Loud, But Why?","['Theater', 'The Antipodes (Play)', 'Baker, An...",3,Culture,1,2017-04-24 02:00:25,Unknown,"In this endlessly fascinating work, Annie Bake...",The New York Times,Review,https://www.nytimes.com/2017/04/23/theater/the...


We're then going to load the data files one at a time and append *only* the headlines to our list of data.

In [7]:
# Collect the "headline" column of every file in data_dir whose name contains
# 'Articles'. os.listdir() returns names in arbitrary order, so they are
# sorted to keep the order of the headlines (and everything derived from it)
# the same on every run and machine.
all_headlines = []
for filename in sorted(os.listdir(data_dir)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(data_dir, filename))
        # keep only the headlines
        all_headlines.extend(list(article_df["headline"].values))

We then clean up a little bit and see how many data points we have.

In [8]:
# drop placeholder entries: keep every headline that is not the literal "Unknown"
all_headlines = list(filter(lambda headline: headline != "Unknown", all_headlines))
# number of headlines left
len(all_headlines)

8603

We call our ```clean_text()``` function and then inspect the first 10 texts.

In [None]:
# run every headline through clean_text(), then preview the first ten results
corpus = list(map(clean_text, all_headlines))
corpus[:10]

## Tokenize

We're then going to tokenize our data, using the ```Tokenizer()``` class from ```TensorFlow```, about which you can read more [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

We then use the ```get_sequence_of_tokens()``` function we defined above, which turns every text into a sequence of tokens based on the vocabulary from the tokenizer.

In [None]:
# build the vocabulary from the cleaned headlines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# vocabulary size plus one
# NOTE(review): the extra slot presumably leaves room for index 0, which the
# padding step fills in - confirm against the Tokenizer documentation
total_words = 1 + len(tokenizer.word_index)

In [None]:
# get_sequence_of_tokens() returns a single list of n-gram sequences, so its
# result must not be unpacked into two names: doing so either raises a
# ValueError or overwrites total_words, which is already set in the cell that
# fits the tokenizer
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10]

We then want to *pad* our input sequences to make them all the same length.

In [None]:
# model inputs, one-hot next-word labels, and the padded sequence length
(predictors, label, max_sequence_len) = generate_padded_sequences(
    input_sequences=inp_sequences)

## Create model

We then use the ```create_model()``` function created above to initialize a model, telling the model the length of sequences and the total size of the vocabulary.

In [None]:
# initialise the model from the sequence length and the vocabulary size,
# then print an overview of its layers
model = create_model(
    max_sequence_len=max_sequence_len,
    total_words=total_words)
model.summary()

Model training is exactly the same as last week, but instead of document labels, we're fitting the model to predict the next word.

*NB!* This will take some time to train! It took me 35 minutes on UCloud 32xCPU.

In [None]:
# train on (input sequence, next word) pairs for 100 epochs in batches of 128
history = model.fit(
    x=predictors,
    y=label,
    batch_size=128,
    epochs=100,
    verbose=1)

When the model has trained, we can then use this to generate *new text*.

In [None]:
# generate five words that continue the seed word "danish"
print(generate_text(
    seed_text="danish",
    next_words=5,
    model=model,
    max_sequence_len=max_sequence_len))

## Using pre-trained word embeddings

Instead of having the embedding layer as a trainable parameter, we can use a *pretrained word embedding* model like ```word2vec```.

In the following examples, we're using [GloVe embeddings](https://nlp.stanford.edu/projects/glove/). These are trained a little differently from ```word2vec``` but they behave in the same way.

In [None]:
# placeholder path: point this at the GloVe vectors file
path_to_glove_file = os.path.join("path/to/glove/vectors")

# Map every word in the file to its vector. Each line is split once on
# whitespace: the first field is the word, the rest are its coefficients.
# The encoding is set explicitly so reading does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the vectors file is UTF-8 encoded text - confirm
embeddings_index = {}
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        # parse the space-separated coefficients into a float32 array
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

We can define some variables that we're going to use later.

With hits and misses, we're counting how many words in the corpus vocabulary have a corresponding GloVe embedding; misses are the words which appear in our vocabulary but which do not have a GloVe embedding.

In [None]:
# one row of the embedding matrix per vocabulary slot
num_tokens = total_words
# length of each word vector
# NOTE(review): presumably has to match the dimension of the loaded GloVe vectors - confirm
embedding_dim = 100
# counters for vocabulary words with / without a pretrained vector
hits, misses = 0, 0

In [None]:
# Prepare embedding matrix
# The counters are reset here so that re-running this cell reports the same
# totals instead of adding to the counts left over from a previous run.
hits = 0
misses = 0
# every row starts as all-zeros and is overwritten only for words that have
# a pretrained vector
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the LSTM model on top of frozen pretrained embeddings.

    Running this cell replaces the ``create_model`` defined earlier in the
    notebook. The function reads the notebook-level variables
    ``embedding_dim`` and ``embedding_matrix``.

    Parameters
    ----------
    max_sequence_len : int
        Maximum length of a text sequence; the model input is one token shorter.
    total_words : int
        Size of the vocabulary, used for the embedding input dimension and
        for the width of the softmax output.

    Returns
    -------
    A compiled ``Sequential`` model: embedding -> LSTM -> dropout -> softmax.
    """
    input_len = max_sequence_len - 1
    # the embedding weights are initialised from the pretrained matrix ...
    pretrained_weights = tf.keras.initializers.Constant(embedding_matrix)

    model = Sequential()

    # Input embedding layer - notice that this is different:
    # ... and trainable=False keeps them fixed during training
    model.add(
        Embedding(
            total_words,
            embedding_dim,
            embeddings_initializer=pretrained_weights,
            trainable=False,
            input_length=input_len))

    # Hidden layer: LSTM with 500 units, followed by dropout at rate 0.2
    model.add(LSTM(500))
    model.add(Dropout(0.2))

    # Output layer: one softmax unit per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [None]:
# rebuild the model with the create_model() version that uses the pretrained
# embedding matrix, then print an overview of its layers
model = create_model(
    max_sequence_len=max_sequence_len,
    total_words=total_words)
model.summary()

In [None]:
# same training setup as before: 100 epochs in batches of 128
history = model.fit(
    x=predictors,
    y=label,
    batch_size=128,
    epochs=100,
    verbose=1)

In [None]:
# generate thirty words that continue the seed word "china"
print(generate_text(
    seed_text="china",
    next_words=30,
    model=model,
    max_sequence_len=max_sequence_len))