In [1]:
!pip install tensorflow keras

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/e4/14/d795bb156f8cc10eb1dcfe1332b7dbb8405b634688980aa9be8f885cc888/tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting keras
  Obtaining dependency information for keras from https://files.pythonhosted.org/packages/8d/44/c604ecc5c9993b6574a681f2f505e980725871a89cfd9e48597b12ccb506/keras-3.3.3-py3-none-any.whl.metadata
  Downloading keras-3.3.3-py3-none-any.whl.metadata (5.7 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.16.1 from https://files.pythonhosted.org/packages/e0/36/6278e4e7e69a90c00e0f82944d8f2713dd85a69d1add455d9e50446837ab/tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-

In [6]:
import nltk

In [7]:
# Fetch the Punkt tokenizer data (unpacked under tokenizers/ in the NLTK data
# directory); preprocess_text tokenizes with nltk.word_tokenize later on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\irina\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
# Corpora used by the preprocessing step: the English stop-word list and the
# WordNet data backing WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irina\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\irina\AppData\Roaming\nltk_data...


True

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import requests
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
#Loading Data
# Download the raw training corpus as one string.
url = 'https://drive.google.com/uc?id=1GeUzNVqiixXHnTl8oNiQ2W3CynX_lsu2'
response = requests.get(url, timeout=30)  # never hang the kernel on a stalled connection
response.raise_for_status()  # stop here instead of training on an HTTP error page
text = response.text

In [4]:
# Preview only the beginning of the corpus; displaying the whole string
# bloats the notebook and hides the narrative.
text[:500]



## Text preprocessing

In [13]:
# A set makes the per-token `word not in stop_words` test in preprocess_text
# a constant-time lookup instead of a scan over the whole list.
stop_words = set(stopwords.words('english'))
# To keep negations in the text, un-comment: stop_words.discard('not')
lemmatizer = WordNetLemmatizer()

In [14]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip digit runs, collapse whitespace runs to
    a single space, drop every character that is neither a word character nor
    whitespace, tokenize with NLTK, discard tokens found in the module-level
    ``stop_words`` and lemmatize the rest with the module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    without_punct = re.sub(r'[^\w\s]', '', single_spaced)

    lemmas = []
    for token in nltk.word_tokenize(without_punct):
        if token in stop_words:
            continue  # stop words carry no content for the model
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

In [15]:
# Keep the cleaned corpus in its own variable and preview only its beginning;
# displaying the whole string floods the notebook output.
# NOTE(review): the tokenizer cells below still read the raw `text`, so the
# cleaned version is not what the model is trained on - confirm that is intended.
text_clean = preprocess_text(text)
text_clean[:500]



In [16]:
#Initializing Tokenizer and Creating a Dictionary
# The tokenizer is fitted on the raw `text` (passed as a one-document list),
# building the word -> integer-id mapping exposed as `tokenizer.word_index`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because, per the Keras Tokenizer docs, word ids start at 1 and id 0 is
# reserved; total_words sizes the Embedding input and the softmax output.
total_words = len(tokenizer.word_index) + 1

In [17]:
#Convert text into sequences
# Every '.'-delimited chunk of the corpus contributes all of its prefixes of
# length >= 2: the leading ids are the context, the last id the word to predict.
input_sequences = []
for sentence in text.split('.'):
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [18]:
#Reducing all sequences to the same length and creating input and output data
# Left-pad ('pre') so the target word always sits in the last column.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences = np.array(padded)

In [19]:
# All columns but the last form the context; the last column is the target id.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [20]:
#Converting y to one-hot encoding
# Derive the targets from input_sequences rather than from `y` itself, so that
# re-running this cell does not one-hot encode an already one-hot array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [22]:
#Creating Model
# Embedding -> single recurrent layer -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(total_words, 100),
    SimpleRNN(150),
    Dense(total_words, activation='softmax'),
])

In [23]:
#Compiling model
# categorical_crossentropy matches the one-hot targets produced above.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [24]:
#Model training
# `history` keeps the per-epoch loss/accuracy reported by fit.
history = model.fit(
    x=X,
    y=y,
    epochs=30,
    verbose=1,
)

Epoch 1/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 63ms/step - accuracy: 0.0780 - loss: 6.7301
Epoch 2/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 64ms/step - accuracy: 0.1460 - loss: 5.4761
Epoch 3/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 64ms/step - accuracy: 0.1739 - loss: 4.9437
Epoch 4/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 64ms/step - accuracy: 0.1976 - loss: 4.5064
Epoch 5/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 64ms/step - accuracy: 0.2277 - loss: 4.1136
Epoch 6/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 64ms/step - accuracy: 0.2626 - loss: 3.7785
Epoch 7/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 64ms/step - accuracy: 0.3082 - loss: 3.4336
Epoch 8/30
[1m3406/3406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 64ms/step - accuracy: 0.3527 - loss: 3.1520
