<a href="https://colab.research.google.com/github/kailasa-Nischal/GRU_implementation_in_C/blob/main/GRU_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense,Embedding
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# User inputs for dimensions
num_sequences = int(input("Enter the number of sequences: "))
time_frames = int(input("Enter the number of time frames per sequence: ")) #number of words in a sequence
input_size = int(input("Enter the input size: ")) #representation of one word
hidden_size = int(input("Enter the hidden size: "))

# Create a dummy input tensor to fit the model to the user-defined dimensions
# dummy_input = np.random.rand(num_sequences, time_frames, input_size)

text_sequences = []
for i in range(num_sequences):
    sequence = input(f"Enter words for sequence {i+1}: ")
    text_sequences.append(sequence)

# Step 2: Tokenize and Convert Words to Sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

print("Word Index (Tokenization):")
for word, index in tokenizer.word_index.items():
    print(f"Word: {word} - Token: {index}")

print("\nSequences of Tokens:")
for i, seq in enumerate(sequences):
    print(f"Sequence {i+1}: {seq}")

# Pad sequences to ensure they have the same length (number of time frames)
vocabulary_size = len(tokenizer.word_index) + 1  # Vocabulary size
padded_sequences = pad_sequences(sequences, maxlen=time_frames, padding='post')

print("\nPadded Sequences of Tokens:")
for i, padded_seq in enumerate(padded_sequences):
    print(f"Sequence {i+1}: {padded_seq}")

print("\nVocabulary Size:", vocabulary_size)

file_path = '/content/txt/inputs.txt'  # Replace with your actual file path

# Append padded sequences to the existing text file
with open(file_path, 'w') as file:
    for sequence in padded_sequences:
        # Convert each sequence to a space-separated string of integers
        sequence_str = ' '.join(map(str, sequence))
        # Write the string to the file
        file.write(sequence_str+" ")

# Define the output dimension
output_dim= vocabulary_size
# Step 3: Define the GRU Model
model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=input_size, input_length=time_frames))
model.add(GRU(hidden_size, return_sequences=True))  # `hidden_size` should be defined based on your GRU requirements
model.add(Dense(output_dim, activation='softmax'))  # `output_dim` should be defined based on your task

# Compile and fit the model (dummy data used here)
model.compile(optimizer='adam', loss='categorical_crossentropy')
dummy_labels = np.random.randint(0, output_dim, size=(num_sequences, time_frames,output_dim)) #need to keep original labels for real training
model.fit(padded_sequences, dummy_labels, epochs=5)

# Extract and save the weights as before
gru_layer = model.layers[1]
weights = gru_layer.get_weights()
# Save the weights as shown in part (a)

# Unpack weights into the corresponding matrices
U = weights[0]  # U_z, U_r, U_h combined
W = weights[1]  # W_z, W_r, W_h combined
b = weights[2]  # b_z, b_r, b_h combined


# Split weights into individual components
U_z, U_r, U_h = np.split(U, 3, axis=1)
W_z, W_r, W_h = np.split(W, 3, axis=1)
b_z, b_r, b_h = np.split(b, 3, axis=1)

# Save each component to a separate text file
np.savetxt('./txt/U_z.txt', U_z)
np.savetxt('./txt/U_r.txt', U_r)
np.savetxt('./txt/U_h.txt', U_h)
np.savetxt('./txt/W_z.txt', W_z)
np.savetxt('./txt/W_r.txt', W_r)
np.savetxt('./txt/W_h.txt', W_h)
np.savetxt('./txt/b_z.txt', b_z)
np.savetxt('./txt/b_r.txt', b_r)
np.savetxt('./txt/b_h.txt', b_h)

Enter the number of sequences: 2
Enter the number of time frames per sequence: 7
Enter the input size: 4
Enter the hidden size: 5
Enter words for sequence 1: erripappa
Enter words for sequence 2: /content/txt
Word Index (Tokenization):
Word: erripappa - Token: 1
Word: content - Token: 2
Word: txt - Token: 3

Sequences of Tokens:
Sequence 1: [1]
Sequence 2: [2, 3]

Padded Sequences of Tokens:
Sequence 1: [1 0 0 0 0 0 0]
Sequence 2: [2 3 0 0 0 0 0]

Vocabulary Size: 4
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 7.7250
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 7.7261
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 7.7273
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 7.7284
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 7.7296


In [None]:
!mkdir txt

In [None]:
import shutil

# Zip the folder
shutil.make_archive('/content/txt', 'zip', '/content/txt')
from google.colab import files

# Download the zipped folder
files.download('/content/txt.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>