# Prepare Environment

In [None]:
import pandas as pd
import nltk

nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')


In [None]:
!pip install autocorrect tensorflow numpy keras regex pyyaml h5py contractions pandarallel

In [None]:
!gdown 1PJbVYUmRr0_HTwGNtplnu8lG-UCDoXZJ

# Load Data

In [89]:
%%time
# Import dataset from CSV

df = pd.read_csv('blogtext.csv').head(1000)

CPU times: user 7.42 s, sys: 600 ms, total: 8.02 s
Wall time: 8.57 s


In [90]:
print(df.shape)
df.head(10)

(500, 7)


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


# Tokenize sentences

In [91]:
%%time
df.text = df.text.transform(lambda t: nltk.sent_tokenize(t))
df.shape

CPU times: user 86.8 ms, sys: 3.22 ms, total: 90 ms
Wall time: 89.6 ms


(500, 7)

In [92]:
df = df.explode('text')
df.shape

(8435, 7)

In [93]:
#df.text.to_csv("blogtext-sentence_tokenized.csv")

# Prepearing data for training

In [94]:
from keras.src.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences, to_categorical
import re
from pandarallel import pandarallel

import numpy as np
from nltk import ngrams
from nltk.corpus import words

import spacy
from autocorrect import Speller

spacy_nlp = spacy.load("en_core_web_sm")

In [95]:
def has_url(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_url:
            return True
    return False

def has_email(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_email:
            return True
    return False

def has_phonenumber(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_num:
            return True
    return False

def autocorrect_corpus(corpus):
    speller = Speller(lang='en')
    return corpus.transform(lambda s: speller(s['text']))

def has_non_lexi_word(sentence):
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    lemmatized_doc = spacy_nlp(sentence)
    
    english_words = words.words()
    
    for token in lemmatized_doc:
        word = token.lemma_.lower()
        
        if word not in english_words:                               
            #print("Found non-english word {0}".format(word))
            return True

def clean_corpus(corpus_df, rm_sentence_phone=True, rm_sentence_email=True, rm_sentence_url=True, rm_non_lexi_word_sentence=True):
    # Remove sentences with personal details as specified by function parameters
    def remove_sentences_condition(row):
        if rm_sentence_phone and has_phonenumber(row['text']): return False
        elif rm_sentence_email and has_email(row['text']): return False
        elif rm_sentence_url and has_url(row['text']): return False
        elif rm_non_lexi_word_sentence and has_non_lexi_word(row['text']): return False
        else: return True
        
    pre_rem_size = corpus_df.shape[0]
    pandarallel.initialize()
    corpus_df = corpus_df[corpus_df.parallel_apply(remove_sentences_condition, axis=1)]
    sen_removed = pre_rem_size - corpus_df.shape[0]
    print("Removed {0} sentences because they contained email, phone, or url(s)".format(sen_removed))

    # Run autocorrect to fix text-typos
    # autocorrect_corpus(corpus_df)

    # En løsning er å fjerne alle ord som ikke eksisterer i det engelske vokabularet.

    return corpus_df


In [96]:
df = clean_corpus(df, rm_sentence_phone=False, rm_sentence_url=False, rm_sentence_email=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Removed 5292 sentences because they contained email, phone, or url(s)


In [97]:
df = df['text']

df.shape

(3143,)

In [98]:
df.head(100)

2    Seemed to be a transcript of a 'Seven Days' ar...
2                      Poorly formatted and corrupted.
2    I have added the text between 'examine under a...
2      If anyone has the full text, please distribute.
2    I am not responsible for the accuracy of this ...
                           ...                        
2                                                   8.
2       I am convinced that leukemia is psychosomatic.
2                                                   9.
2    I am aware that most vegetarians are sexually ...
2                                                  10.
Name: text, Length: 100, dtype: object

In [99]:
sentence_list = df.tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence_list)
total_words = len(tokenizer.word_index) + 1

In [100]:
total_words

5060

In [101]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'of': 6,
 'it': 7,
 'that': 8,
 'is': 9,
 'in': 10,
 'my': 11,
 'you': 12,
 'was': 13,
 'for': 14,
 'have': 15,
 'so': 16,
 'this': 17,
 'me': 18,
 'on': 19,
 'but': 20,
 'he': 21,
 'with': 22,
 'be': 23,
 'not': 24,
 'what': 25,
 'are': 26,
 'one': 27,
 'all': 28,
 'at': 29,
 'as': 30,
 'they': 31,
 'just': 32,
 'can': 33,
 'do': 34,
 'we': 35,
 'or': 36,
 'out': 37,
 'like': 38,
 'there': 39,
 'his': 40,
 "it's": 41,
 'up': 42,
 'no': 43,
 'if': 44,
 'more': 45,
 'now': 46,
 "i'm": 47,
 'about': 48,
 'will': 49,
 'time': 50,
 "'": 51,
 'when': 52,
 'her': 53,
 'from': 54,
 'get': 55,
 'think': 56,
 'your': 57,
 'then': 58,
 'am': 59,
 'how': 60,
 'would': 61,
 'them': 62,
 'people': 63,
 'she': 64,
 'know': 65,
 'an': 66,
 'had': 67,
 'really': 68,
 'were': 69,
 'only': 70,
 'some': 71,
 'never': 72,
 'good': 73,
 'see': 74,
 'by': 75,
 'too': 76,
 'here': 77,
 'him': 78,
 'has': 79,
 'could': 80,
 'way': 81,
 'into': 82,
 'been': 83

In [102]:
n_gram_list = []
n_gram_length = 4

for line in sentence_list:
    token_list = tokenizer.texts_to_sequences([line])[0]

    for n in range(2, n_gram_length):
        n_grams = ngrams(token_list, n)
        n_gram_list.extend(np.asarray([*n_grams]))

# Padding
n_gram_list = np.array(pad_sequences(
    n_gram_list,
    maxlen=n_gram_length,
    padding='pre'
))

In [103]:
X = n_gram_list[:, :-1]
y = n_gram_list[:, -1]

y = to_categorical(y, num_classes=total_words)

# Build and Train Model

In [107]:
from keras.src.layers import Embedding, GRU, Dense
from keras import Sequential
import keras

model = Sequential()
model.add(Embedding(total_words, 10, input_length=n_gram_length-1))

model.add(GRU(128, return_sequences=True))
model.add(GRU(128))

model.add(Dense(128, activation='sigmoid'))
model.add(Dense(total_words, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 3, 10)             50600     
                                                                 
 gru_4 (GRU)                 (None, 3, 128)            53760     
                                                                 
 gru_5 (GRU)                 (None, 128)               99072     
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dense_5 (Dense)             (None, 5060)              652740    
                                                                 
Total params: 872684 (3.33 MB)
Trainable params: 872684 (3.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [108]:
from datetime import datetime

logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

model.fit(X, y,
          epochs=1, verbose=1,
          callbacks=[tensorboard_callback])



<keras.src.callbacks.History at 0x3bf0b80d0>

In [None]:
import pickle

model.save("models/model_{0}.h5".format(datetime.datetime.now()).replace(" ", "_"), )

with open("models/tokenizer_{0}.pickle".format(datetime.datetime.now()).replace(" ", "_"), 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
seed_text = "Hello there, do you make the rules"
next_words = 1

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list],
        maxlen=4,
        padding='pre'
    )

    predictions = model.predict(token_list)
    pred_word = tokenizer.index_word[np.argmax(predictions)]
    seed_text += " " + pred_word

print("Next predicted words: ", seed_text)