In [3]:
# Simple example: download the Brown corpus (English language) and print its first sentences

import nltk

# Download the Brown corpus (the recorded output shows this is skipped when the package is up to date)
nltk.download('brown')

# Import the Brown corpus reader
from nltk.corpus import brown

# Access the sentences in the Brown corpus; each sentence is a sequence of token strings
sentences = brown.sents()

# Print the first few sentences, re-joining the tokens with single spaces
# (which is why punctuation appears space-separated in the output)
for sentence in sentences[:5]:
    print(' '.join(sentence))

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .
`` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .


[nltk_data] Downloading package brown to /home/klima7/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [9]:
'''
Word tokenization is the process of splitting a large piece of text into individual words or tokens. 
The goal is to convert a continuous text into a list of words that can be processed and analyzed more easily.
In most cases, tokenization is one of the initial steps in natural language processing and text analysis.

The Brown corpus in nltk is already pre-tokenized

'''

# More advanced example - including word tokenization and simple cleanup

import pickle
import os
import nltk
import string
from nltk.corpus import brown
from nltk.corpus import stopwords

# Download required resources
nltk.download('brown')      # corpus read through brown.sents()
nltk.download('stopwords')  # word list read through stopwords.words("english")
# NOTE(review): no code visible in this notebook uses a punkt-based tokenizer
# (the Brown corpus is already tokenized) — confirm whether this download is still needed.
nltk.download('punkt')


def preprocess_text(sentences):
    """Lower-case, strip punctuation from, and stop-word-filter tokenized sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of string tokens
            (for example ``brown.sents()``).

    Returns:
        A list containing one list of cleaned tokens per input sentence. A
        sentence made up only of punctuation and stop words yields an empty list.
    """
    # Removing punctuation has to be a conscious decision: it also fuses hyphenated
    # words ("term-end" -> "termend") and possessives ("atlanta's" -> "atlantas").
    table = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("english"))

    tokenized_sentences = []
    for sentence in sentences:
        cleaned_sentence = []
        for word in sentence:
            lowered = word.lower()
            # Check the stop list BEFORE stripping punctuation as well: the NLTK list
            # contains contractions such as "don't", which turn into "dont" once the
            # apostrophe is removed and would otherwise slip through the filter.
            if lowered in stop_words:
                continue
            stripped = lowered.translate(table)
            # Drop tokens that were pure punctuation (now empty) or that only become
            # a stop word after stripping.
            if stripped and stripped not in stop_words:
                cleaned_sentence.append(stripped)
        tokenized_sentences.append(cleaned_sentence)
    return tokenized_sentences


def save_tokenized_sentences(tokenized_sentences, file_path):
    """Pickle ``tokenized_sentences`` into ``file_path``, replacing any existing file."""
    with open(file_path, 'wb') as sink:
        pickle.dump(tokenized_sentences, sink)

def load_tokenized_sentences(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    Unpickling can execute arbitrary code, so only point this at files that
    were written by this notebook (see ``save_tokenized_sentences``).
    """
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Location of the optional on-disk cache of the preprocessed corpus (relative to the working directory)
file_path = 'tokenized_sentences.pkl'

if not os.path.exists(file_path):
    # No cache on disk: load the Brown corpus without tags
    raw_sentences = brown.sents()
    
    # Preprocess the text
    tokenized_sentences = preprocess_text(raw_sentences)
    
    # Save the preprocessed tokenized sentences to disk
    # NOTE(review): the call below is commented out, so this cell never writes the cache itself;
    # preprocessing is repeated on every run unless the pickle file is supplied some other way.
    # save_tokenized_sentences(tokenized_sentences, file_path)
else:
    # Load the preprocessed tokenized sentences from disk
    tokenized_sentences = load_tokenized_sentences(file_path)

# Print the first few preprocessed sentences (each one is a list of cleaned tokens)
for sentence in tokenized_sentences[:5]:
    print(sentence)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Lukasz\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lukasz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lukasz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'atlantas', 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place']
['jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', 'overall', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted']
['septemberoctober', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', 'irregularities', 'hardfought', 'primary', 'mayornominate', 'ivan', 'allen', 'jr']
['relative', 'handful', 'reports', 'received', 'jury', 'said', 'considering', 'widespread', 'interest', 'election', 'number', 'voters', 'size', 'city']
['jury', 'said', 'find', 'many', 'georgias', 'registration', 'election', 'laws', 'outmoded', 'inadequate', 'often', 'ambiguous']


In [3]:
# implementation of Word2Vec using existing solutions
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized_sentences dataset
# (100-dimensional vectors, context window of 5, words seen fewer than 5 times are dropped)
# NOTE(review): workers=24 is hard-coded and no seed is set, so results will vary between runs/machines.
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=5, workers=24, epochs=50)

# Save the model to a file
# model.save("word2vec_model.model")

# Load the model from a file
# model = Word2Vec.load("word2vec_model.model")



In [4]:
# Now we check if the model has learned proper embeddings
# We will check word similarity as well as analogies (vector arithmetic)
# (pretty pictures here: https://kawine.github.io/blog/nlp/2019/06/21/word-analogies.html)

# Find the most similar words to a given word; the result is a list of (word, similarity) pairs
similar_words = model.wv.most_similar("king", topn=5)
print("Most similar words to 'king':", similar_words)

# Perform vector arithmetic: king - man + woman
# (`positive` vectors are added, `negative` vectors are subtracted)
result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
print("Result of 'king' - 'man' + 'woman':", result)


Most similar words to 'king': [('mao', 0.4664570689201355), ('edwards', 0.46602949500083923), ('saint', 0.4623377323150635), ('dwight', 0.4546518921852112), ('senator', 0.44663161039352417)]
Result of 'king' - 'man' + 'woman': [('clergymen', 0.40512460470199585)]


In [5]:
# more advanced dataset - text8, first 100MB of the English Wikipedia

import gensim.downloader as api
from gensim.models import Word2Vec

# Download the Text8 dataset
# (the returned object is iterable and is later passed directly to Word2Vec as `sentences`)
dataset = api.load("text8")



In [6]:
# Look at the sentences in the dataset. The Dataset object is an iterable, convert it to list for easier printing
# NOTE: list(dataset) materialises the whole corpus in memory just to display five items.
# Judging by the recorded output, each "sentence" is a long run of lower-case tokens
# rather than a grammatical sentence.
sentences_list = list(dataset)

for i, sentence in enumerate(sentences_list[:5]):
    print(f"Sentence {i + 1}: {' '.join(sentence)}")

Sentence 1: anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economi

In [17]:
# Train a Word2Vec model on the Text8 dataset
# It might take some time
# NOTE: this rebinds `model`, replacing the Brown-corpus model trained earlier in the notebook.
model = Word2Vec(sentences=dataset, vector_size=100, window=5, min_count=5, workers=4, epochs=5)

# Save the model to a file
# (the word-level LSTM cell further down reloads this file name via Word2Vec.load)
model.save("word2vec_large_model.model")

# Load the model from a file
# model = Word2Vec.load("word2vec_large_model.model")



In [8]:
# Now we check again, on a better corpus, if our assumptions about embeddings were correct
# NOTE(review): the output recorded for this cell is identical to the Brown-corpus run above, and its
# execution count (8) is lower than that of the text8 training cell (17). It appears to have been
# produced before `model` was retrained on text8 — re-run this cell after training to refresh it.

# Find the most similar words to a given word
similar_words = model.wv.most_similar("king", topn=5)
print("Most similar words to 'king':", similar_words)

# Perform vector arithmetic: king - man + woman
result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
print("Result of 'king' - 'man' + 'woman':", result)


Most similar words to 'king': [('mao', 0.4664570689201355), ('edwards', 0.46602949500083923), ('saint', 0.4623377323150635), ('dwight', 0.4546518921852112), ('senator', 0.44663161039352417)]
Result of 'king' - 'man' + 'woman': [('clergymen', 0.40512460470199585)]


## LSTM

In [10]:
# simple, character-level generation
# source: https://keras.io/examples/generative/lstm_character_level_text_generation/
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io

In [11]:
# Fetch the Nietzsche text file (get_file returns the local path of the downloaded file)
path = keras.utils.get_file(
    "nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt"
)
with io.open(path, encoding="utf-8") as f:
    text = f.read().lower()
text = text.replace("\n", " ")  # We remove newlines chars for nicer display
print("Corpus length:", len(text))

# Character vocabulary plus lookup tables in both directions (char -> index, index -> char)
chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
# (a new window starts every `step` characters, so consecutive windows overlap)
maxlen = 40
step = 3
# NOTE: `sentences` is rebound here; it previously held the Brown corpus sentences.
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    # the character right after the window is the prediction target
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

# One-hot encode the data:
#   x has shape (number of windows, maxlen, vocabulary size)
#   y has shape (number of windows, vocabulary size)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Corpus length: 600893
Total chars: 56
Number of sequences: 200285


In [12]:
# Single LSTM layer over one-hot character windows, followed by a softmax over the character vocabulary.
# NOTE: this rebinds `model`, which earlier in the notebook referred to a Word2Vec model.
model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(chars))),  # maxlen time steps, one one-hot vector per step
        layers.LSTM(128),
        layers.Dense(len(chars), activation="softmax"),  # one probability per character
    ]
)
# Categorical cross-entropy pairs with the one-hot targets stored in `y`
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [13]:
# Print the layer stack with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               94720     
                                                                 
 dense (Dense)               (None, 56)                7224      
                                                                 
Total params: 101,944
Trainable params: 101,944
Non-trainable params: 0
_________________________________________________________________


In [12]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. the softmax output of the model).
        temperature: Positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    preds = np.asarray(preds).astype("float64")
    # A probability of exactly 0 legitimately maps to -inf (and back to 0 after exp),
    # so suppress the divide-by-zero warning rather than flooding the cell output.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. After normalisation this is a
    # mathematical no-op, but it keeps the largest term at exp(0) = 1 so that a small
    # temperature can no longer underflow every term to 0 and produce 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # Draw a single one-hot sample and return the position of the 1.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [15]:
epochs = 40
batch_size = 128

for epoch in range(epochs):
    # Train one epoch at a time so that sample text can be generated after each epoch
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One random seed window per epoch, shared by every diversity setting for comparison
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        # `diversity` is passed to sample() as its temperature argument
        print("...Diversity:", diversity)

        generated = ""
        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            # One-hot encode the current maxlen-character window
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Slide the window: drop the oldest character and append the new one
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "ing tone--those familiar, disagreeable c"
...Generated:  onception of the hower and and and such be the strenger and and and arting of the one the one of the one sensition of the his of the strenges and and and the his of the strangtion of the strenger and and and the strenger and and and in the have and the oness of the experience of the strenges of the one of the spirit and such self-conception and and and the world and and and and the spirit of the s

...Diversity: 0.5
...Generating with seed: "ing tone--those familiar, disagreeable c"
...Generated:  an the mast of the proces of the and fast and a the bact semate and in in the world and respesion of the hering and for the with most of which not and a the and stance be the strengs and more the probles, and inself concerness and not not be sumple to sporitity and all the strengence of perton the belief men instristition and the loghing and of the have and the 

In [16]:
# Word-level generation and prediction
# Toy problem, not using the previous corpora


# Import necessary libraries
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input
from gensim.models import Word2Vec

# Load the Word2Vec model and text corpus data
# (the model file is the one written by model.save("word2vec_large_model.model") in the text8 training cell)
word2vec_model = Word2Vec.load("word2vec_large_model.model")
corpus = ['This is the second sentence to train the model', 'This is the first sentence', 'This is the third sentence just because']
# Must match the vector_size=100 the Word2Vec model was trained with
embedding_dim = 100


def extract_target_words(corpus):
    """Return the final whitespace-separated word of every sentence in ``corpus``."""
    return [sentence.split()[-1] for sentence in corpus]

# The last word of each sentence is the word the LSTM is trained to predict
target_words = extract_target_words(corpus)


# Create target_vectors: one row of size embedding_dim per sentence
target_vectors = np.zeros((len(target_words), embedding_dim))

for idx, word in enumerate(target_words):
    try:
        target_vectors[idx] = word2vec_model.wv[word]
    except KeyError:
        # Word is not in the Word2Vec vocabulary: fall back to a random vector
        target_vectors[idx] = np.random.rand(embedding_dim)  # initialize randomly




# Transform each input word in the corpus into a corresponding vector representation.
# The final word of every sentence is the prediction target (see extract_target_words), so it is
# left out of the model input: otherwise the network is handed the answer during training, while
# generate_text only ever supplies the words that precede the one being predicted.
# NOTE(review): lookups are case-sensitive, and the text8 text printed earlier is all lower-case,
# so capitalised words such as "This" will most likely take the random fallback — confirm.
corpus_vectors = []
for sentence in corpus:
    sentence_vectors = []
    for word in sentence.split()[:-1]:
        try:
            sentence_vectors.append(word2vec_model.wv[word])
        except KeyError:
            # Out-of-vocabulary word: use a random vector of the embedding size
            sentence_vectors.append(np.random.rand(embedding_dim))  # initialize randomly
    corpus_vectors.append(sentence_vectors)

# Pad each sentence in the corpus to a fixed length, as required by the LSTM model
'''
Padding is performed by adding filler values (usually zeros) to the sequences until they reach the desired length. 
Padding is necessary for LSTMs (Long Short-Term Memory) that process input sequences of a fixed length.


There are two common padding strategies:
Pre-padding: Filler values are added to the beginning of the sequence.
Post-padding: Filler values are added to the end of the sequence.

The max_len variable is set to the length of the longest sequence in the corpus.
By padding all sequences to this length, we ensure that the input to the LSTM model is
standardized and can be processed correctly.

It's important to note that padding can affect model performance. 
Very short sequences may be dominated by the padding values, which can introduce 
noise and make it harder for the model to learn meaningful patterns. 
On the other hand, excessively long padding can lead to increased computational 
requirements and memory consumption. 
Choosing an appropriate padding length based on the dataset and problem is crucial for achieving good performance.
'''


# Longest sequence (in words) determines the number of time steps
max_len = max(len(sentence) for sentence in corpus_vectors)
corpus_vectors_padded = np.zeros((len(corpus_vectors), max_len, embedding_dim), dtype='float32')

# Post-padding: vectors are written from position 0, the remaining positions stay zero
for i, sentence_vectors in enumerate(corpus_vectors):
    for j, vector in enumerate(sentence_vectors):
        corpus_vectors_padded[i, j, :] = vector


# Build an LSTM model in Keras with appropriate input and output layers
# NOTE: embedding_dim is re-assigned to the same value it already has in this cell, and
# `model` is rebound again (it previously referred to the character-level LSTM).
embedding_dim = 100
model = Sequential()
model.add(Input(shape=(max_len, embedding_dim)))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Linear output of size embedding_dim: the network regresses the target word's embedding (MSE loss)
model.add(Dense(embedding_dim))
model.compile(loss='mse', optimizer='adam')

# Train the LSTM model on the prepared data
# NOTE(review): with only three sentences, validation_split=0.1 leaves very little data on
# either side of the split — confirm this is intended for the toy example.
model.fit(corpus_vectors_padded, target_vectors, epochs=50, batch_size=32, validation_split=0.1)

In [24]:
def preprocess_input_text(text, max_len):
    """Convert ``text`` into a zero-padded batch of word vectors.

    Args:
        text: Whitespace-separated input words.
        max_len: Number of time steps in the returned array.

    Returns:
        A float32 array of shape ``(1, max_len, embedding_dim)``. Word vectors are
        written from position 0 and the unused tail stays zero (post-padding).
        When ``text`` has more than ``max_len`` words only the last ``max_len``
        words are used.
    """
    # Keep only the most recent max_len words. generate_text keeps appending words to
    # its text, so without this cut the assignment below indexes past the end of the
    # buffer (IndexError) as soon as the text grows beyond max_len words.
    words = text.split()[-max_len:] if max_len > 0 else []

    input_vectors_padded = np.zeros((1, max_len, embedding_dim), dtype='float32')
    for j, word in enumerate(words):
        try:
            input_vectors_padded[0, j, :] = word2vec_model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: use a random vector of the embedding size
            input_vectors_padded[0, j, :] = np.random.rand(embedding_dim)  # initialize randomly

    return input_vectors_padded

def generate_text(model, input_text, num_words_to_generate=10):
    """Extend ``input_text`` by repeatedly predicting one more word.

    On every step the whole text produced so far is vectorised with
    ``preprocess_input_text``, ``model`` predicts an embedding vector, and the
    vocabulary word most similar to that vector is appended.

    Args:
        model: Model whose ``predict`` returns one embedding vector per input row.
        input_text: Seed text (whitespace-separated words).
        num_words_to_generate: How many words to append.

    Returns:
        The seed text followed by the generated words, separated by single spaces.
    """
    pieces = [input_text]

    for _ in range(num_words_to_generate):
        # Vectorise and pad everything generated so far
        batch = preprocess_input_text(" ".join(pieces), max_len)

        # The model outputs an embedding vector, not a vocabulary index
        predicted_vector = model.predict(batch)[0]

        # Map the predicted vector to its nearest word in the Word2Vec vocabulary
        nearest = word2vec_model.wv.most_similar(positive=[predicted_vector], topn=1)
        pieces.append(nearest[0][0])

    return " ".join(pieces)