**Model 1: Word2Vec embeddings with lemmatization**


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import random
import pickle
import os 
import re

In [None]:
from tensorflow.keras.callbacks import TensorBoard
import datetime

# Create a log directory  
import os
# Timestamped TensorBoard log directory for the first (lemmatization) run.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp + "-model1"
os.makedirs(log_dir, exist_ok=True)

In [None]:
from google.colab import drive

# Make Google Drive visible under /content/drive so the corpus can be read.
drive.mount('/content/drive')

# Read the whole corpus into memory as a single string.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale.
with open('/content/drive/MyDrive/Colab Notebooks/nietzsche.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

Mounted at /content/drive


In [None]:
# Rough vocabulary size of the raw text: whitespace-separated tokens,
# counted before any lowercasing or punctuation removal.
words = text.split()
unique_words = {*words}
num_unique_words = len(unique_words)

print("Number of unique words:", num_unique_words)

Number of unique words: 18809


In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK corpora requested by this notebook.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level Porter stemmer instance.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import string
from nltk.stem import WordNetLemmatizer, PorterStemmer
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

def preprocessing(text, lemmatize=True, stemming=False, word2vec=True, return_type='word_sequences'):
    """Clean ``text`` and derive Word2Vec features from its tokens.

    Cleaning steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, split on whitespace, drop all-digit tokens, then
    optionally lemmatize (WordNet) and/or stem (Porter). The two flags can be
    combined; lemmatization runs first.

    Unless only the tokens are requested, a new 100-dimensional Word2Vec model
    (``min_count=1``) is trained on every call, with the whole cleaned text
    passed as one sentence.

    Parameters
    ----------
    text : str
        Raw input text.
    lemmatize : bool
        Apply ``WordNetLemmatizer`` to every token.
    stemming : bool
        Apply ``PorterStemmer`` to every token.
    word2vec : bool
        Must be True for the three embedding-based return types.
    return_type : str
        * ``'word_sequences'`` - ``np.ndarray`` with one 100-d vector per token,
          in text order.
        * ``'vocab_size'`` - number of distinct tokens known to the model.
        * ``'vec_to_index'`` - dict mapping ``tuple(vector)`` to the word's
          position in ``model.wv.index_to_key``.
        * ``'tokens'`` - the cleaned token list; no model is trained.

    Raises
    ------
    ValueError
        If ``return_type`` is unknown, if an embedding-based result is
        requested with ``word2vec=False``, or if no tokens remain after
        cleaning (nothing to train on).
    """
    if return_type not in ('word_sequences', 'vocab_size', 'vec_to_index', 'tokens'):
        raise ValueError('Invalid return type')

    # Steps 1-3: lowercase, strip punctuation, drop purely numeric tokens.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = [
        token
        for token in text.lower().translate(strip_punctuation).split()
        if not token.isdigit()
    ]

    # Step 4: lemmatize. Tokens stay a list throughout, so every combination
    # of the lemmatize/stemming flags is valid.
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Step 5: stem.
    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    if return_type == 'tokens':
        return tokens

    if not word2vec:
        raise ValueError(
            "return_type %r requires word2vec=True; use return_type='tokens' "
            "to get the cleaned tokens without training a model" % (return_type,)
        )
    if not tokens:
        raise ValueError('No tokens left after preprocessing; cannot train Word2Vec')

    # Callers fetch the vectors and the vector->index table from separate calls
    # and match them by exact value, so training must be repeatable within a
    # session: workers=1 removes thread-scheduling jitter.
    # NOTE(review): the whole corpus is passed as ONE sentence; gensim is
    # understood to ignore tokens beyond its per-sentence limit (10,000) during
    # training - confirm, and consider splitting into shorter sentences.
    model = Word2Vec([tokens], min_count=1, vector_size=100, workers=1)

    # Build only the requested result; the full embedding matrix is large.
    if return_type == 'vocab_size':
        return len(model.wv)
    if return_type == 'vec_to_index':
        return {tuple(model.wv[word]): i for i, word in enumerate(model.wv.index_to_key)}
    return np.array([model.wv[token] for token in tokens])


In [None]:
# Lookup table for the lemmatized corpus: tuple(word vector) -> vocabulary
# index. It is used later to turn target vectors into class labels.
vec_to_index = preprocessing(text, lemmatize=True, stemming=False, word2vec=True, return_type='vec_to_index')

# The stemming lookup table is intentionally not built here: the stemming
# section computes it right before its first use, and every preprocessing()
# call trains a Word2Vec model from scratch.



In [None]:
# Embed the lemmatized corpus: one Word2Vec vector per token, in text order.
word_sequences = preprocessing(
    text,
    lemmatize=True,
    stemming=False,
    word2vec=True,
    return_type='word_sequences',
)

In [None]:
# Vocabulary size of the lemmatized corpus; sets the width of the softmax layer.
vocab_size = preprocessing(
    text,
    lemmatize=True,
    stemming=False,
    word2vec=True,
    return_type='vocab_size',
)
print(vocab_size)

10534


In [None]:
# Build next-word training pairs: each target vector is paired with the four
# vectors that immediately precede it.
context_len = 4
# NOTE: the range stops at len - 4, so the last four tokens are never used as
# targets.
target_positions = range(context_len, len(word_sequences) - context_len)
prev_words = [word_sequences[pos - context_len:pos] for pos in target_positions]
next_words = [word_sequences[pos] for pos in target_positions]

# Stack the windows and targets into arrays for the LSTM model.
X_word2vec = np.array(prev_words)
y_word2vec = np.array(next_words)


In [None]:
# Sanity check: (samples, context length, embedding size).
X_word2vec.shape

(98607, 4, 100)

In [None]:
# Sanity check: (samples, embedding size) - one target vector per window.
y_word2vec.shape

(98607, 100)

In [None]:
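# NOTE: leftover reshapes, kept for reference only. The hard-coded 98611 does
# not match the sample count shown above (98607), so they would fail if
# uncommented as-is.
# X_word2vec = X_word2vec.reshape(98611, 4, 100)
# y_word2vec = y_word2vec.reshape(98611, 100)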


In [None]:
# Turn each target vector into its vocabulary index so the model can be trained
# with sparse categorical cross-entropy. The lookup is by exact vector value.
target_ids = [vec_to_index[tuple(target_vec)] for target_vec in y_word2vec]
y_word2vec_indices = np.array(target_ids)


In [None]:
import tensorflow as tf

# Reset Keras' global state before the model below is built.
tf.keras.backend.clear_session()


In [None]:

# Import layers and callbacks from tensorflow.keras only: Sequential, the
# optimizer and the TensorBoard callback used below already come from
# tensorflow.keras, and mixing in the standalone `keras` package can break
# when the two installations differ.
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint


# Save the model to Drive whenever validation accuracy reaches a new maximum.
filepath='/content/drive/MyDrive/Colab Notebooks/best_model_word2vec_lemmatize.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Hyperparameters.
num_layers = 2  # not read below; the two LSTM layers are added explicitly
lstm_units_0 = 128
recurrent_dropout_0 = 0.2
lstm_units_1 = 128
recurrent_dropout_1 = 0.1
dropout_rate = 0.1
learning_rate = 0.005977728042983696

# Input: windows of 4 word vectors of size 100.
# Output: a softmax over the vocabulary (next-word classification).
model_lstm = Sequential()
model_lstm.add(Input(shape=(4, 100)))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model_lstm.add(LSTM(lstm_units_0, return_sequences=True, recurrent_dropout=recurrent_dropout_0))
model_lstm.add(LSTM(lstm_units_1, recurrent_dropout=recurrent_dropout_1))
model_lstm.add(Dropout(dropout_rate))
model_lstm.add(Dense(vocab_size, activation='softmax'))

# Targets are integer vocabulary indices, hence the sparse loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the model; validation_split=0.2 holds out 20% of the samples for validation.
model_lstm.fit(X_word2vec, y_word2vec_indices, epochs=5, batch_size=128, validation_split=0.2, callbacks=[tensorboard_callback,checkpoint])









Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.09320, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_lemmatize.h5
Epoch 2/5
Epoch 2: val_accuracy improved from 0.09320 to 0.09375, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_lemmatize.h5
Epoch 3/5
Epoch 3: val_accuracy improved from 0.09375 to 0.09563, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_lemmatize.h5
Epoch 4/5
Epoch 4: val_accuracy did not improve from 0.09563
Epoch 5/5
Epoch 5: val_accuracy improved from 0.09563 to 0.09674, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_lemmatize.h5


<keras.callbacks.History at 0x7fbee02df160>

In [None]:
# Upload the lemmatization run's logs (log_dir) to tensorboard.dev.
# The uploader's own notice states the resulting board is visible to everyone.
# NOTE(review): tensorboard.dev appears to have been discontinued - confirm this
# command still works, or inspect the logs locally instead.
!tensorboard dev upload --logdir {log_dir} --name "Word2vecprojectlemmatization" --description "Training results from https://colab.research.google.com" --one_shot



***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

logs/fit/20230514-004112-model1

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) yes

To sign in with the TensorBoard uploader:

1. On your computer or phone, visit:

   https://www.google.com/device

2. Sign in with your Google account, then enter:

   PZZD-RXZX



New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/Md4q009YTbWJi9j2M1RXmQ/

[1m[2023-05-14T00:45:50][0m Started scanning logdir.
[1m[2023-05-14T00:45:5

**Model 2: Word2Vec embeddings with stemming**

In [None]:
# Timestamped TensorBoard log directory for the second (stemming) run.
run_stamp_model2 = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir2 = "logs/fit/" + run_stamp_model2 + "-model2"
os.makedirs(log_dir2, exist_ok=True)

In [None]:
# Lookup table for the stemmed corpus: tuple(word vector) -> vocabulary index.
vec_to_index_stemming = preprocessing(
    text,
    lemmatize=False,
    stemming=True,
    word2vec=True,
    return_type='vec_to_index',
)




In [None]:
# Embed the stemmed corpus: one Word2Vec vector per token, in text order.
word_sequences_stemming = preprocessing(
    text,
    lemmatize=False,
    stemming=True,
    word2vec=True,
    return_type='word_sequences',
)
word_sequences_stemming.shape

(98615, 100)

In [None]:
# Vocabulary size of the stemmed corpus; sets the width of the softmax layer.
vocab_size_stemming = preprocessing(
    text,
    lemmatize=False,
    stemming=True,
    word2vec=True,
    return_type='vocab_size',
)
print(vocab_size_stemming)

8059


In [None]:
# Build next-word training pairs for the stemmed corpus: each target vector is
# paired with the four vectors that immediately precede it.
context_len = 4
# NOTE: the range stops at len - 4, so the last four tokens are never used as
# targets.
target_positions = range(context_len, len(word_sequences_stemming) - context_len)
prev_words_stemming = [word_sequences_stemming[pos - context_len:pos] for pos in target_positions]
next_words_stemming = [word_sequences_stemming[pos] for pos in target_positions]

# Stack the windows and targets into arrays for the LSTM model.
X_word2vec_stemming = np.array(prev_words_stemming)
y_word2vec_stemming = np.array(next_words_stemming)

In [None]:
# Sanity check: (samples, context length, embedding size).
X_word2vec_stemming.shape

(98607, 4, 100)

In [None]:
# Turn each target vector into its vocabulary index so the model can be trained
# with sparse categorical cross-entropy. The lookup is by exact vector value.
y_word2vec_indices_stemming = np.array(
    [vec_to_index_stemming[tuple(vec)] for vec in y_word2vec_stemming]
)
# vocab_size_stemming is not recomputed here: an earlier cell in this section
# already sets it, and each preprocessing() call trains Word2Vec from scratch.


In [None]:
import tensorflow as tf

# Reset Keras' global state before the second model is built.
tf.keras.backend.clear_session()


In [None]:

# Import layers and callbacks from tensorflow.keras only: Sequential, the
# optimizer and the TensorBoard callback used below already come from
# tensorflow.keras, and mixing in the standalone `keras` package can break
# when the two installations differ.
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to Drive whenever validation accuracy reaches a new maximum.
filepath='/content/drive/MyDrive/Colab Notebooks/best_model_word2vec_stemming.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Hyperparameters (same values as the lemmatization model).
num_layers = 2  # not read below; the two LSTM layers are added explicitly
lstm_units_0 = 128
recurrent_dropout_0 = 0.2
lstm_units_1 = 128
recurrent_dropout_1 = 0.1
dropout_rate = 0.1
learning_rate = 0.005977728042983696

# Input: windows of 4 word vectors of size 100.
# Output: a softmax over the stemmed vocabulary (next-word classification).
model_lstm = Sequential()
model_lstm.add(Input(shape=(4, 100)))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model_lstm.add(LSTM(lstm_units_0, return_sequences=True, recurrent_dropout=recurrent_dropout_0))
model_lstm.add(LSTM(lstm_units_1, recurrent_dropout=recurrent_dropout_1))
model_lstm.add(Dropout(dropout_rate))
model_lstm.add(Dense(vocab_size_stemming, activation='softmax'))

# Targets are integer vocabulary indices, hence the sparse loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
tensorboard_callback2 = TensorBoard(
    log_dir=log_dir2,
    histogram_freq=1,
    embeddings_freq=1,
    write_graph=True,
    write_images=True,
    update_freq='epoch',
)
# Train the model; validation_split=0.2 holds out 20% of the samples for validation.
model_lstm.fit(X_word2vec_stemming, y_word2vec_indices_stemming, epochs=5, batch_size=128, validation_split=0.2, callbacks=[checkpoint, tensorboard_callback2])



Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.09264, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_stemming.h5
Epoch 2/5
Epoch 2: val_accuracy improved from 0.09264 to 0.09375, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_stemming.h5
Epoch 3/5
Epoch 3: val_accuracy did not improve from 0.09375
Epoch 4/5
Epoch 4: val_accuracy improved from 0.09375 to 0.09664, saving model to /content/drive/MyDrive/Colab Notebooks/best_model_word2vec_stemming.h5
Epoch 5/5
Epoch 5: val_accuracy did not improve from 0.09664


<keras.callbacks.History at 0x7fbee64b59f0>

**Show results in TensorBoard**

In [None]:
# Upload the stemming run's logs (log_dir2) to tensorboard.dev.
# NOTE(review): uploads to tensorboard.dev are public, and the service appears
# to have been discontinued - confirm this command still works, or inspect the
# logs locally instead.
!tensorboard dev upload --logdir {log_dir2} --name "Word2vecprojectstemming" --description "Training results from https://colab.research.google.com" --one_shot



New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/L6uE3kLRSWed19ySYxqCoQ/

[1m[2023-05-14T00:48:42][0m Started scanning logdir.
[1m[2023-05-14T00:48:44][0m Total uploaded: 30 scalars, 40 tensors (28.7 kB), 1 binary objects (691.2 kB)
[1m[2023-05-14T00:48:44][0m Done scanning logdir.


Done. View your TensorBoard at https://tensorboard.dev/experiment/L6uE3kLRSWed19ySYxqCoQ/
