In [2]:
import pandas as pd
import nltk

In [18]:
%%time
# Import dataset from CSV

df = pd.read_csv('blogtext.csv').head(100)

CPU times: user 7.41 s, sys: 657 ms, total: 8.07 s
Wall time: 8.36 s


In [19]:
print(df.shape)
df.head(10)

(100, 7)


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


## Tokenize sentences

In [20]:
%%time
df.text = df.text.transform(lambda t: nltk.sent_tokenize(t))
df.shape

CPU times: user 53.8 ms, sys: 2.72 ms, total: 56.5 ms
Wall time: 55.5 ms


In [22]:
df = df.explode('text')
df.shape

(2168, 7)

In [8]:
#df.text.to_csv("blogtext-sentence_tokenized.csv")

# Prepearing data for training

In [23]:
from keras.src.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences, to_categorical

import numpy as np
from nltk import ngrams

import spacy
from autocorrect import Speller

spacy_nlp = spacy.load("en_core_web_sm")


In [32]:
def has_url(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_url:
            return True
    return False
    
def has_email(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_email:
            return True
    return False

def has_phonenumber(sentence):
    doc = spacy_nlp(sentence)
    for token in doc:
        if token.like_num:
            return True
    return False

def autocorrect_corpus(corpus):
    speller = Speller(lang='en')
    return corpus.transform(lambda s: speller(s['text']))

def clean_corpus(corpus_df, rm_sentence_phone=True, rm_sentence_email=True, rm_sentence_url=True):
    # Remove sentences with personal details as specified by function parameters
    def remove_sentences_condition(row):
        if rm_sentence_phone and has_phonenumber(row['text']): return False
        elif rm_sentence_email and has_email(row['text']): return False
        elif rm_sentence_url and has_url(row['text']): return False
        else: return True
    
    pre_rem_size = corpus_df.shape[0]
    corpus_df = corpus_df[corpus_df.apply(remove_sentences_condition, axis=1)]
    sen_removed = pre_rem_size - corpus_df.shape[0]
    print("Removed {0} sentences because they contained email, phone, or url(s)".format(sen_removed))
    
    # Run autocorrect to fix text-typos
    autocorrect_corpus(corpus_df)
    
    # En løsning er å fjerne alle ord som ikke eksisterer i det engelske vokabularet.
    
    return corpus_df
    

In [34]:
df = clean_corpus(df, rm_sentence_phone=False, rm_sentence_url=False, rm_sentence_email=False)

Removed 0 sentences because they contained email, phone, or url(s)


TypeError: expected string or bytes-like object, got 'Series'

In [26]:
df = df['text']

df.shape

2168

In [11]:
df.head(100)

0               Info has been found (+/- 100 pages,...
1               These are the team members:   Drewe...
2               In het kader van kernfusie op aarde...
2    Date: 7 Feb 1994 07:41:14 GMT Organization: Th...
2    Seemed to be a transcript of a 'Seven Days' ar...
                           ...                        
2    Repeat this step until you have the required 1...
2    (Safety note: Don't put all your enriched uran...
2    Use at least two or three buckets and keep the...
2    This will prevent the premature build-up of a ...
2    Now it's time to convert your enriched uranium...
Name: text, Length: 100, dtype: object

In [12]:
sentence_list = df.tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence_list)
total_words = len(tokenizer.word_index) + 1

In [13]:
total_words

6369

In [14]:
tokenizer.word_index

{'the': 1,
 'a': 2,
 'to': 3,
 'and': 4,
 'of': 5,
 'i': 6,
 'in': 7,
 'is': 8,
 'it': 9,
 'that': 10,
 'for': 11,
 'you': 12,
 'urllink': 13,
 'but': 14,
 'on': 15,
 'my': 16,
 'was': 17,
 'as': 18,
 'this': 19,
 'have': 20,
 'are': 21,
 'or': 22,
 'be': 23,
 'not': 24,
 'so': 25,
 'they': 26,
 'here': 27,
 'one': 28,
 'at': 29,
 'from': 30,
 'with': 31,
 'there': 32,
 'if': 33,
 'like': 34,
 'he': 35,
 'korean': 36,
 "it's": 37,
 'we': 38,
 'can': 39,
 'about': 40,
 'all': 41,
 'get': 42,
 'me': 43,
 'some': 44,
 'do': 45,
 'out': 46,
 'korea': 47,
 'just': 48,
 'what': 49,
 'when': 50,
 'up': 51,
 'know': 52,
 'your': 53,
 'well': 54,
 'no': 55,
 'more': 56,
 'will': 57,
 'their': 58,
 'now': 59,
 'an': 60,
 'then': 61,
 'has': 62,
 'who': 63,
 'had': 64,
 'were': 65,
 'time': 66,
 'seoul': 67,
 'them': 68,
 'which': 69,
 'how': 70,
 'many': 71,
 'by': 72,
 'little': 73,
 'go': 74,
 'pretty': 75,
 'think': 76,
 'take': 77,
 'good': 78,
 '2': 79,
 'place': 80,
 "i'm": 81,
 'koreans':

In [45]:
n_gram_list = []
n_gram_length = 5

for line in sentence_list:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for n in range(2, n_gram_length):
        n_grams = ngrams(token_list, n)
        n_gram_list.extend(np.asarray([*n_grams]))

# Padding
n_gram_list = np.array(pad_sequences(
    n_gram_list,
    maxlen=n_gram_length,
    padding='pre'
))

In [46]:
X = n_gram_list[:, :-1]
y = n_gram_list[:, -1]

y = to_categorical(y, num_classes=total_words)

In [47]:
from keras.src.layers import Embedding, GRU, Dense
from keras import Sequential

model = Sequential()
model.add(Embedding(total_words, 10, input_length=n_gram_length-1))

model.add(GRU(128, return_sequences=True))
model.add(GRU(128))

model.add(Dense(128, activation='sigmoid'))
model.add(Dense(total_words, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 10)             63690     
                                                                 
 gru_4 (GRU)                 (None, 4, 128)            53760     
                                                                 
 gru_5 (GRU)                 (None, 128)               99072     
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dense_5 (Dense)             (None, 6369)              821601    
                                                                 
Total params: 1054635 (4.02 MB)
Trainable params: 1054635 (4.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [48]:
model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x346a1da90>

In [49]:
import datetime
import pickle

model.save(".models/model_{0}.h5".format(datetime.datetime.now()).replace(" ", "_"), )

with open(".models/tokenizer_{0}.pickle".format(datetime.datetime.now()).replace(" ", "_"), 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(


In [65]:
seed_text = "Hello there, do you make the rules"
next_words = 1

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list],
        maxlen=4,
        padding='pre'
    )
    
    predictions = model.predict(token_list)
    pred_word = tokenizer.index_word[np.argmax(predictions)]
    seed_text += " " + pred_word

print("Next predicted words: ", seed_text)

Next predicted words:  Hello there, do you make the rules of
