<a href="https://colab.research.google.com/github/kailasa-Nischal/GRU_implementation_in_C/blob/main/GRU_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir txt

In [16]:
import shutil

# Zip the folder
shutil.make_archive('/content/txt', 'zip', '/content/txt')
from google.colab import files

# Download the zipped folder
files.download('/content/txt.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [148]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset

# User inputs for dimensions
time_frames = int(input("Enter the number of time frames per sequence (words in a sequence): "))
input_size = int(input("Enter the input size (embedding dimension): "))
hidden_size = int(input("Enter the hidden size: "))

# Load the AG News dataset
dataset = load_dataset('ag_news')

# Extract texts from the training set
train_dataset = dataset['train']
text_sequences = [example['text'] for example in train_dataset.take(3)]


# Print the first 10 data points
print("First 10 data points (texts) from the AG News dataset:")
for i, text in enumerate(text_sequences):
    print(f"Text {i+1}: {text}")

Enter the number of time frames per sequence (words in a sequence): 8
Enter the input size (embedding dimension): 9
Enter the hidden size: 7
First 10 data points (texts) from the AG News dataset:
Text 1: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Text 2: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
Text 3: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.


In [149]:
tokenizer = Tokenizer()
text_sequences = [example['text'] for example in train_dataset.take(1000)]
tokenizer.fit_on_texts(text_sequences)

print("Word Index:")
print(tokenizer.word_index['monday'])

Word Index:
23


In [150]:
# Tokenize and convert to sequences
tokenizer = Tokenizer()
text_sequences = [example['text'] for example in train_dataset.take(1000)]
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

vocabulary_size = len(tokenizer.word_index) + 1  # Vocabulary size
padded_sequences = pad_sequences(sequences, maxlen=time_frames, padding='post')

print("\nPadded Sequences of Tokens:")
for i, padded_seq in enumerate(padded_sequences):
    print(f"Sequence {i+1}: {padded_seq}")

output_dim = vocabulary_size



Padded Sequences of Tokens:
Sequence 1: [1270    4 2635 2636   22 1059 1271  332]
Sequence 2: [  19 4061    7  385  334    4    1  118]
Sequence 3: [  81  201    1 1274    4    1  600 1581]
Sequence 4: [ 217 1277   18   41  281   24    7  266]
Sequence 5: [4072   89  601  158    1  101  783 1582]
Sequence 6: [530  21 120 532 388  63 388 337]
Sequence 7: [2644 4076    1 1273   69 2645   24  180]
Sequence 8: [   1  165   12 2648   21    3 4081 4082]
Sequence 9: [   1 4090  785   21  228 2010  105  922]
Sequence 10: [1270    4 2635 2636   22 1059 1271  332]
Sequence 11: [  81  201    1 1274    4    1  600 1581]
Sequence 12: [  24  266 1281   10   80   42  694  926]
Sequence 13: [  80 1062   72 2660 4096   24    7   27]
Sequence 14: [4097    3 4098   21   30   26 1589 1065]
Sequence 15: [2017  605  606    2 2665    1  535  604]
Sequence 16: [  31 4104   46  608    1  243  928  689]
Sequence 17: [ 362   33  424  247    3  363 1286 2668]
Sequence 18: [ 539   64    1 2672   71 4107   16  365

In [151]:
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

# Convert y to categorical
y = tf.keras.utils.to_categorical(y, num_classes=vocabulary_size)

In [152]:
print(X.shape)
print(y.shape)
print(padded_sequences.shape)

(1000, 7)
(1000, 7817)
(1000, 8)


In [156]:
model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=input_size, input_length=time_frames))
model.add(GRU(hidden_size, return_sequences=False))  # We use return_sequences=False for prediction of next word
model.add(Dense(vocabulary_size, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X, y, epochs=10,batch_size=32, verbose=1)

Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.0109 - loss: 8.9598
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.0119 - loss: 8.8342
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.0157 - loss: 8.3513
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0187 - loss: 7.9343
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.0197 - loss: 7.6058
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0129 - loss: 7.3735
Epoch 7/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0282 - loss: 7.1757
Epoch 8/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0240 - loss: 7.0025
Epoch 9/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79eb0520de10>

In [157]:
# Get user input sentence
user_input_sentence = input("Enter a sentence to autocomplete: ")
user_sequence = tokenizer.texts_to_sequences([user_input_sentence])
user_sequence = pad_sequences(user_sequence, maxlen=time_frames-1, padding='post')  # Adjust padding for input to GRU
print(user_sequence)
print(user_sequence.shape)

file_path = '/content/txt/inputs.txt'  # file path
with open(file_path, 'w') as file:
    for sequence in user_sequence:
        # Converting each sequence to a string of integers
        sequence_str = ' '.join(map(str, sequence))
        # Write the string to the file
        file.write(sequence_str+" ")

# Predict the next word
predicted_probs = model.predict(user_sequence)
predicted_word_index = np.argmax(predicted_probs, axis=-1)[0]
print(predicted_word_index)
if predicted_word_index in tokenizer.index_word:
    predicted_word = tokenizer.index_word[predicted_word_index]
    print(f"Predicted next word: {predicted_word}")
else:
    print("Predicted next word is out of vocabulary (likely padding or a special token)")
    predicted_word = "error in predicting"

#print(f"Predicted next word: {predicted_word}")

Enter a sentence to autocomplete: mui name is
[[708  12   0   0   0   0   0]]
(1, 7)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step
27
Predicted next word: sunday


In [114]:
gru_layer = model.layers[1]  # Assuming GRU is the second layer
weights = gru_layer.get_weights()

U = weights[1]  # Input-to-hidden weights
W = weights[0]  # Hidden-to-hidden weights
b = weights[2]  #biases

# Split weights into components
U_z, U_r, U_h = np.split(U, 3, axis=1)
W_z, W_r, W_h = np.split(W, 3, axis=1)
b_z, b_r, b_h = np.split(b, 3,axis=1)

print(U_z.shape)
print(W_z.shape)
print(b_z.shape)

# Save weights to text files
np.savetxt('./txt/U_z.txt', U_z)
np.savetxt('./txt/U_r.txt', U_r)
np.savetxt('./txt/U_h.txt', U_h)
np.savetxt('./txt/W_z.txt', W_z)
np.savetxt('./txt/W_r.txt', W_r)
np.savetxt('./txt/W_h.txt', W_h)
np.savetxt('./txt/b_z.txt', b_z)
np.savetxt('./txt/b_r.txt', b_r)
np.savetxt('./txt/b_h.txt', b_h)

(7, 7)
(14, 7)
(2, 7)
