# Prepare Environment

In [136]:
import numpy as np
import pandas as pd
import nltk

# NLTK data packages needed by the cells below.
# Tokenizer models behind nltk.sent_tokenize / word_tokenize.
nltk.download('punkt')
# English word list, read later through nltk.corpus.words.
nltk.download('words')
# NOTE(review): no visible cell uses WordNet directly — confirm this download is still needed.
nltk.download('wordnet')
# Stop-word lists, read later through nltk.corpus.stopwords.
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
!pip install autocorrect tensorflow numpy keras regex pyyaml h5py contractions pandarallel

In [None]:
# Download blogtext.csv (700 000 blog posts)
# NOTE(review): the argument is presumably a Google Drive file id — confirm the file is still shared.
!gdown 1PJbVYUmRr0_HTwGNtplnu8lG-UCDoXZJ

# Download blogtext_cleaned.csv (10 000 blog posts, 43 500 sentences)
# NOTE: a later cell writes its own blogtext_cleaned.csv via df.to_csv, which
# overwrites this download when that cell is executed.
!gdown 1qI-TZrQ_D0S0g7O3l_qKKA7f4l70jlmf

# Load Data

In [213]:
%%time
# Import dataset from CSV

df = pd.read_csv('blogtext.csv').head(50000)

CPU times: user 5.87 s, sys: 491 ms, total: 6.36 s
Wall time: 6.94 s


In [214]:
# Sanity check: print the (rows, columns) shape, then render the first ten posts.
print(df.shape)
df.head(10)

(50000, 7)


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


# Tokenize sentences

In [215]:
%%time
df.text = df.text.transform(lambda t: nltk.sent_tokenize(t))
df.shape

CPU times: user 5.45 s, sys: 75.7 ms, total: 5.53 s
Wall time: 5.89 s


(50000, 7)

In [216]:
# One row per sentence: every element of a post's sentence list gets its own row.
df = df.explode('text')

print(df.shape)

# Keep only rows whose text is an actual string.
# NOTE(review): the non-string rows are presumably NaN placeholders produced by
# exploding posts that contained no sentences — confirm.
is_string = df['text'].map(lambda value: isinstance(value, str))
df = df[is_string]

df.shape

(674967, 7)


(674792, 7)

In [217]:
#df.text.to_csv("blogtext-sentence_tokenized.csv")

# Preparing data for training

In [218]:
from keras.src.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences, to_categorical
import re
import random
from pandarallel import pandarallel

import numpy as np
from nltk import ngrams
from nltk.corpus import words as en_words
from nltk.corpus import stopwords

import spacy
from autocorrect import Speller

# Shared spaCy pipeline; the has_url / has_email / has_phonenumber / has_non_lexi_word
# helpers defined below all run their input through it.
spacy_nlp = spacy.load("en_core_web_sm")

In [219]:
from nltk import word_tokenize


def has_url(sentence):
    """Return True when spaCy marks at least one token of `sentence` as URL-like."""
    return any(token.like_url for token in spacy_nlp(sentence))

def has_email(sentence):
    """Return True when spaCy marks at least one token of `sentence` as e-mail-like."""
    return any(token.like_email for token in spacy_nlp(sentence))

def has_phonenumber(sentence):
    """Return True when spaCy marks at least one token of `sentence` as number-like.

    NOTE(review): the check is `token.like_num`, so any number-like token
    triggers it, not only phone numbers — confirm this breadth is intended.
    """
    return any(token.like_num for token in spacy_nlp(sentence))

def autocorrect_corpus(corpus):
    """Run the English `autocorrect` speller over the 'text' entry of each item of `corpus`.

    The result of `corpus.transform` is returned; `corpus` itself is not reassigned here.

    NOTE(review): `transform` hands the callback whatever unit pandas iterates
    over for this object; the callback then looks up the key 'text' on it —
    verify that this matches the shape of the object actually passed in.
    """
    english_speller = Speller(lang='en')

    def correct_text_entry(item):
        return english_speller(item['text'])

    return corpus.transform(correct_text_entry)

# Lazily built frozenset of the NLTK English word list. Loading the corpus and
# scanning it as a list for every token of every sentence is very slow; a set
# built once gives constant-time membership tests.
_ENGLISH_VOCABULARY = None


def has_non_lexi_word(sentence):
    """Return True if `sentence` contains a word that is not in the NLTK English word list.

    Everything except ASCII letters and whitespace is stripped first, the rest
    is run through the spaCy pipeline, and each token's lower-cased lemma is
    looked up in `nltk.corpus.words`.

    Args:
        sentence: The sentence to check, as a string.

    Returns:
        True as soon as one lemma is missing from the word list, otherwise False
        (also for an empty sentence).
    """
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = frozenset(en_words.words())

    letters_only = re.sub(r'[^a-zA-Z\s]', '', sentence)

    for token in spacy_nlp(letters_only):
        # Runs of whitespace (e.g. left behind by the stripping above) become
        # tokens of their own; they are not words and must not count as unknown.
        if token.is_space:
            continue

        if token.lemma_.lower() not in _ENGLISH_VOCABULARY:
            return True

    # Explicit result instead of the implicit None the function used to fall through to.
    return False

def contains_number(sentence):
    """Return True if at least one character of `sentence` is a digit (per `str.isdigit`)."""
    for character in sentence:
        if character.isdigit():
            return True
    return False

def remove_stop_words(text):
    """Return `text` with English stop words dropped.

    The text is split with NLTK's `word_tokenize`; tokens whose lower-cased
    form is in NLTK's English stop-word list are discarded and the remaining
    tokens are joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in english_stop_words:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

def word_count(text):
    """Return the number of tokens NLTK's `word_tokenize` produces for `text`."""
    return len(word_tokenize(text))

def clean_corpus(corpus_df, rm_sentence_phone=True, rm_sentence_email=True, rm_sentence_url=True, rm_non_lexi_word_sentence=True, rm_contains_num=True, min_word_count=2, remove_stopwords=True, prob_remove_stopword=0.5):
    """Normalise the sentences in `corpus_df['text']` and drop unwanted ones.

    Steps, in order:
      1. Sentences containing an e-mail address or URL are dropped. This runs
         on the *raw* text, because the normalisation in step 2 strips the
         punctuation ('@', '.', '/', ':') those detectors rely on.
      2. Each remaining sentence is lower-cased, stripped of punctuation and,
         with probability `prob_remove_stopword`, stripped of stop words.
      3. Sentences failing the length / number / vocabulary checks are dropped.

    Args:
        corpus_df: DataFrame with a 'text' column holding one sentence (str) per row.
        rm_sentence_phone: Drop sentences for which `has_phonenumber` is true.
        rm_sentence_email: Drop sentences for which `has_email` is true.
        rm_sentence_url: Drop sentences for which `has_url` is true.
        rm_non_lexi_word_sentence: Drop sentences for which `has_non_lexi_word` is true.
        rm_contains_num: Drop sentences that contain a digit.
        min_word_count: Minimum number of tokens a cleaned sentence must have.
        remove_stopwords: Enable the (random) stop-word removal.
        prob_remove_stopword: Per-sentence probability that stop words are removed.

    Returns:
        A new DataFrame with the cleaned, filtered sentences. `corpus_df` is not
        modified (the previous version overwrote its 'text' column in place).
    """
    # Build the stop-word set once rather than once per sentence.
    stop_words = set(stopwords.words('english')) if remove_stopwords else set()

    def is_free_of_contact_details(raw_text):
        # Evaluated on the raw sentence, before punctuation is stripped.
        if rm_sentence_email and has_email(raw_text): return False
        if rm_sentence_url and has_url(raw_text): return False
        return True

    # Remove sentences with personal details as specified by function parameters
    def remove_sentences_condition(row):
        if word_count(row['text']) < min_word_count: return False
        elif rm_sentence_phone and has_phonenumber(row['text']): return False
        elif rm_contains_num and contains_number(row['text']): return False
        elif rm_non_lexi_word_sentence and has_non_lexi_word(row['text']): return False
        else: return True

    def clean_text(text):
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize
        words = word_tokenize(text)

        # Remove stop words (only for a random share of the sentences)
        if remove_stopwords:
            if random.random() < prob_remove_stopword:
                words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a sentence
        cleaned_text = ' '.join(words)

        return cleaned_text

    pre_rem_size = corpus_df.shape[0]
    pandarallel.initialize()

    working_df = corpus_df

    # Step 1: e-mail / URL filter on the untouched text.
    if (rm_sentence_email or rm_sentence_url) and not working_df.empty:
        working_df = working_df[working_df['text'].parallel_apply(is_free_of_contact_details)]

    # Step 2: normalise. assign() returns a new frame, so the caller's data stays intact.
    working_df = working_df.assign(text=working_df['text'].apply(clean_text))

    # Step 3: remaining per-sentence filters on the cleaned text.
    if not working_df.empty:
        working_df = working_df[working_df.parallel_apply(remove_sentences_condition, axis=1)]

    sen_removed = pre_rem_size - working_df.shape[0]
    print("Removed {0} sentences because they contained email, phone, or url(s)".format(sen_removed))

    # Run autocorrect to fix text-typos
    # autocorrect_corpus(corpus_df)

    # One solution is to remove all words that do not exist in the English vocabulary.

    return working_df


In [None]:
# Only the length, digit and vocabulary filters are active here; the phone, URL
# and e-mail filters are switched off. Stop words are removed from ~80 % of the sentences.
df = clean_corpus(
    df,
    rm_sentence_phone=False,
    rm_sentence_url=False,
    rm_sentence_email=False,
    remove_stopwords=True,
    rm_non_lexi_word_sentence=True,
    prob_remove_stopword=0.80,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
df

# Remove sentences with fewer than two words


In [None]:
# Persist the cleaned sentences; the next cell reads this file back in.
# NOTE(review): the index is written as well (pandas default), so re-reading
# the file yields an extra unnamed column — confirm that is acceptable.
df.to_csv("blogtext_cleaned.csv")

In [None]:
#df['text'] = df['text'].apply(remove_stop_words)

# From here on `df` is a Series holding only the first 100 cleaned sentences.
df = pd.read_csv('blogtext_cleaned.csv')['text'].head(100)

df.shape

In [None]:
df

In [None]:
# Plain Python list of the sentences selected above.
sentence_list = df.tolist()

# Fit the Keras tokenizer so that every word in these sentences gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence_list)
# NOTE(review): the +1 presumably accounts for id 0, which is used as the
# padding value and is not part of word_index — confirm.
total_words = len(tokenizer.word_index) + 1

In [None]:
import matplotlib.pyplot as plt


# Tally how often each whitespace-separated word occurs across all sentences.
c_dict = {}

for s in df:
    print(s)
    for w in s.split():
        c_dict[w] = c_dict.get(w, 0) + 1

# Re-order the tally from the rarest to the most frequent word.
c_dict = dict(sorted(c_dict.items(), key=lambda item: item[1]))

# Extract words and counts
words = list(c_dict)
counts = [c_dict[word] for word in words]

# Plot the bar chart
plt.figure(figsize=(10, 6))
plt.bar(words, counts, color='blue')
plt.title('Word Count Distribution')
plt.xlabel('Words')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.show()

c_dict

In [212]:
total_words

700

In [None]:
tokenizer.word_index

In [None]:
n_gram_list = []
n_gram_length = 3

for line in sentence_list:
    # Integer word ids of this sentence.
    token_list = tokenizer.texts_to_sequences([line])[0]

    # range() excludes its stop value, so "+ 1" is needed to also produce
    # n-grams of the full length. Without it only bigrams were generated and,
    # after padding, the first context position was always 0 — unlike at
    # prediction time, where two real tokens are fed to the model.
    for n in range(2, n_gram_length + 1):
        n_gram_list.extend(ngrams(token_list, n))

# Padding: shorter n-grams are left-padded with 0 up to n_gram_length columns.
n_gram_list = np.array(pad_sequences(
    n_gram_list,
    maxlen=n_gram_length,
    padding='pre'
))

In [None]:
# All columns but the last form the context; the last column is the word id to predict.
X, y = n_gram_list[:, :-1], n_gram_list[:, -1]

# One-hot encode the target ids over the whole vocabulary.
y = to_categorical(y, num_classes=total_words)

# Build and Train Model

In [None]:
from keras.src.layers import Embedding, GRU, Dense, LSTM
from keras import Sequential
import keras

# Embedding -> two stacked LSTMs -> dense hidden layer -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 10, input_length=n_gram_length-1),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dense(256, activation='sigmoid'),
    Dense(total_words, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [None]:
from datetime import datetime

# Take the timestamp once so the TensorBoard log directory and the checkpoint
# directory of the same run always carry the same id (two separate
# datetime.now() calls can straddle a second boundary).
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")

logdir = "logs/scalars/" + run_id
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# NOTE(review): save_freq=500 is an integer, which Keras presumably interprets
# as "every 500 batches", while the file name only contains the epoch — confirm
# that overwriting within an epoch is intended.
checkpoint_path = "checkpoints/" + run_id + "/" + "model_checkpoint_{epoch:02d}.h5"
checkpoint_callback = keras.callbacks.ModelCheckpoint(checkpoint_path, save_freq=500, verbose=1)

model.fit(X, y,
          epochs=500, verbose=1,
          callbacks=[tensorboard_callback, checkpoint_callback])

In [None]:
import pickle

import os

# One timestamp for both artifacts, so a saved model can always be paired with
# the tokenizer it was trained with. strftime also avoids the ':' characters
# that str(datetime) puts into the file name (invalid on some file systems).
save_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# The target directory may not exist yet on a fresh checkout.
os.makedirs("models", exist_ok=True)

model.save("models/model_{0}.h5".format(save_stamp))

with open("models/tokenizer_{0}.pickle".format(save_stamp), 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
seed_text = "Hello there, do you i am"
next_words = 1

# The model was built with n_gram_length - 1 input positions; derive the context
# size from that instead of hard-coding it so the two cannot drift apart.
context_length = n_gram_length - 1

for _ in range(next_words):
    # Encode the text so far and keep (or left-pad to) the last context_length ids.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list],
        maxlen=context_length,
        padding='pre'
    )

    predictions = model.predict(token_list)
    pred_index = int(np.argmax(predictions))

    # Id 0 is the padding value and has no entry in index_word; stop instead of
    # raising KeyError if the model happens to predict it.
    pred_word = tokenizer.index_word.get(pred_index)
    if pred_word is None:
        break

    seed_text += " " + pred_word

print("Next predicted words: ", seed_text)