In [186]:
# import pandas as pd

# # Specify the path to your JSON file
# json_file_path = '/content/drive/MyDrive/Datasets/hinglish_upload_v1.json'
# csv_file_path = '/content/drive/MyDrive/Datasets/hinglish.csv'
# # Read the JSON file into a DataFrame
# df = pd.read_json(json_file_path, lines=True)
# new_df = df['translation'].apply(pd.Series)

# new_df[['hi_ng', 'en']].to_csv(csv_file_path, index=False)

In [187]:
vocab_size = 10000
sequence_length = 30
batch_size = 128
validation_split = 0.15
embed_dim = 256
latent_dim = 256
num_heads = 8
epochs = 30 # Number of Epochs to train
is_training = False


In [188]:
import os
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

data = pd.read_csv('hinglish.csv')[:5000]
data = data.rename(columns={'hi_ng': 'english', 'en': 'spanish'})


In [189]:
data.head()

Unnamed: 0,english,spanish
0,film ka kya naam hai,What's the name of the movie
1,"namaste, sada hua tomatoes score mahaan hai, l...","Hi, the rotten tomatoes score is great but the..."
2,kya aapako lagata hai ki aapako film pasand aa...,Do you think you will like the movie
3,yah kis tarah kee philm hai,What kind of movie is it
4,film kab banee thee?,when was the movie made?


In [190]:
len(data)

5000

In [191]:
# combined_text = ' '.join(col for col in data['spanish'])
# len(set(combined_text.split(' ')))

In [192]:
# from google.colab import drive
# drive.mount('/content/drive')

In [193]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import pathlib
import random
import string
import re
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import sklearn
from sklearn.model_selection import train_test_split
print(f"Tensorflow Version:{tf.__version__}")

Tensorflow Version:2.10.0


In [194]:
data["spanish"] = data["spanish"].apply(lambda item: "[start] " + item + " [end]")

In [195]:
data.head()

Unnamed: 0,english,spanish
0,film ka kya naam hai,[start] What's the name of the movie [end]
1,"namaste, sada hua tomatoes score mahaan hai, l...","[start] Hi, the rotten tomatoes score is great..."
2,kya aapako lagata hai ki aapako film pasand aa...,[start] Do you think you will like the movie [...
3,yah kis tarah kee philm hai,[start] What kind of movie is it [end]
4,film kab banee thee?,[start] when was the movie made? [end]


In [196]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "").replace("]", "")
print(strip_chars)
def spanish_standardize(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]"%re.escape(strip_chars), "")
english_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
spanish_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=spanish_standardize,
)
english_vectorization.adapt(list(data["english"]))
spanish_vectorization.adapt(list(data["spanish"]))

!"#$%&'()*+,-./:;<=>?@\^_`{|}~¿


In [197]:
english_vectorization_model = tf.keras.Sequential([english_vectorization])
spanish_vectorization_model = tf.keras.Sequential([spanish_vectorization])

# Adapt the vectorization layers
english_vectorization_model.layers[0].adapt(list(data["english"]))
spanish_vectorization_model.layers[0].adapt(list(data["spanish"]))

# Save the models
english_vectorization_model.save("english_vectorization_model")
spanish_vectorization_model.save("spanish_vectorization_model")






INFO:tensorflow:Assets written to: english_vectorization_model\assets


INFO:tensorflow:Assets written to: english_vectorization_model\assets






INFO:tensorflow:Assets written to: spanish_vectorization_model\assets


INFO:tensorflow:Assets written to: spanish_vectorization_model\assets


In [198]:
from tensorflow.keras.models import load_model

english_vectorization_model = load_model("english_vectorization_model")
spanish_vectorization_model = load_model(
    "spanish_vectorization_model",
    custom_objects={'spanish_standardize': spanish_standardize}
)

# Extract the TextVectorization layers
english_vectorization = english_vectorization_model.layers[0]
spanish_vectorization = spanish_vectorization_model.layers[0]









In [199]:
english_vectorization("film ka kya naam hai")

<tf.Tensor: shape=(30,), dtype=int64, numpy=
array([ 52,  13,   9, 149,   2,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int64)>

In [200]:
def preprocess(english, spanish):
    english = english_vectorization(english)
    spanish = spanish_vectorization(spanish)
    return ({"encoder_inputs": english, "decoder_inputs": spanish[:, :-1]}, spanish[:, 1:])
def make_dataset(df, batch_size, mode):
    dataset = tf.data.Dataset.from_tensor_slices((list(df["english"]), list(df["spanish"])))
    if mode == "train":
       dataset = dataset.shuffle(batch_size * 4)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    dataset = dataset.prefetch(tf.data.AUTOTUNE).cache()
    return dataset

In [201]:
train, valid = train_test_split(data, test_size=validation_split, random_state=42)
train.shape, valid.shape

((4250, 2), (750, 2))

In [202]:
train_ds = make_dataset(train, batch_size=batch_size, mode="train")
valid_ds = make_dataset(valid, batch_size=batch_size, mode="valid")

In [203]:
for batch in train_ds.take(1):
    print(batch)

({'encoder_inputs': <tf.Tensor: shape=(128, 30), dtype=int64, numpy=
array([[ 643,  200,   17, ...,    0,    0,    0],
       [6725,    0,    0, ...,    0,    0,    0],
       [ 536, 5881,   65, ...,    0,    0,    0],
       ...,
       [ 103,  155,   21, ...,    0,    0,    0],
       [ 141,  288,  308, ...,    0,    0,    0],
       [  25,   16,   48, ...,    0,    0,    0]], dtype=int64)>, 'decoder_inputs': <tf.Tensor: shape=(128, 30), dtype=int64, numpy=
array([[   2, 2109,    6, ...,    0,    0,    0],
       [   2, 4823,    3, ...,    0,    0,    0],
       [   2,   75,  208, ...,    0,    0,    0],
       ...,
       [   2,   75,   37, ...,    0,    0,    0],
       [   2,  131,    5, ...,  958,    5,   23],
       [   2,    5,   23, ...,    0,    0,    0]], dtype=int64)>}, <tf.Tensor: shape=(128, 30), dtype=int64, numpy=
array([[2109,    6,    9, ...,    0,    0,    0],
       [4823,    3,    0, ...,    0,    0,    0],
       [  75,  208,   28, ...,    0,    0,    0],
       .

In [204]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [205]:
import tensorflow as tf
from tensorflow.keras import layers

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            mask_zero=True
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length,
            output_dim=embed_dim,
            mask_zero=True
        )

        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        embedded_tokens = self.token_embeddings(inputs)
        positions = tf.range(start=0, limit=self.sequence_length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = self.token_embeddings.compute_mask(inputs)
        return mask


In [206]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential([
            layers.Dense(latent_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask = None):
        causal_mask = self.get_causal_attention_mask(inputs)
        padding_mask = None
        if mask != None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask
        )

        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask
        )

        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        out = self.layernorm_3(out_2 + proj_output)

        return out

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0
        )
        return tf.tile(mask, mult)

In [207]:
def edit_distance(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    y_pred = tf.argmax(y_pred, axis=-1, output_type=y_true.dtype)
    y_true_tensor =  tf.sparse.from_dense(
        y_true
    )
    y_pred_tensor = tf.sparse.from_dense(
        y_pred
    )
    metric = 1 - tf.edit_distance(y_true_tensor, y_pred_tensor, normalize=True)
    return metric

def get_transformer():
    encoder_inputs = tf.keras.Input(shape=(sequence_length, ), name="encoder_inputs")
    print("Shape of encoder_inputs:", encoder_inputs.shape)

    x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
    print("Shape of positional_encoder_inputs:", x.shape)
    encoder_outputs = TransformerEncoder(embed_dim, num_heads, latent_dim)(x)
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
    x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
    x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
    )
    '''transformer.compile(
        "adam", loss="sparse_categorical_crossentropy", metrics=["accuracy", edit_distance]
    )'''
    transformer.compile(
        "adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return transformer
transformer = get_transformer()
transformer.summary()
keras.utils.plot_model(transformer, show_shapes=True)

Shape of encoder_inputs: (None, 30)
Shape of positional_encoder_inputs: (None, 30, 256)
Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 30)]         0           []                               
                                                                                                  
 positional_embedding_18 (Posit  (None, 30, 256)     2567680     ['encoder_inputs[0][0]']         
 ionalEmbedding)                                                                                  
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  


In [210]:
transformer.fit(train_ds, epochs=100, validation_data=valid_ds)

Epoch 1/100


KeyboardInterrupt



In [211]:
# Save the model weights
transformer.save_weights("transformer_weights.h5")


In [212]:
# # Recreate the model architecture
# transformer = get_transformer()  # Make sure to redefine the get_transformer function as needed

# # Load the weights
# transformer.load_weights("transformer_weights.h5")


In [213]:
transformer.evaluate(valid_ds)



[3.2109458446502686, 0.32739168405532837]

In [214]:
spanish_vocab = spanish_vectorization.get_vocabulary()
spanish_index_lookup = dict(zip(range(len(spanish_vocab)), spanish_vocab))
def remove_start_and_end_token(sentence):
    return sentence.replace("[start] ", "").replace(" [end]", "")
def decode_sequence(transformer, input_sentence):
    tokenized_input_sentence = english_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(sequence_length):
        tokenized_target_sentence = spanish_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spanish_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return remove_start_and_end_token(decoded_sentence)

In [228]:
reference_sentences =[]
decoded_sentences = []  # list of decoded Spanish sentences
for i in np.random.choice(len(data), 10):
    item = data.iloc[i]
    translated = decode_sequence(transformer, item["english"])
    print("English:", remove_start_and_end_token(item["english"]))
    print("Spanish:", remove_start_and_end_token(item["spanish"]))
    print("Translated:", translated)
    reference_sentences.append(remove_start_and_end_token(item["spanish"]))
    decoded_sentences.append(translated)

English: are tuje pasand nahi tho kya tu koi recommend nahi kartha war movies ko.. ya dusra kya reason hey yar
Spanish: are you  not recommending it just because you personally don't like war movies or for another reason?
Translated: are you not recommending it just because you personally dont like war movies or for another reason
English: muje patha nahi hein. lekin voh comedian type ka actor hein
Spanish: I am not sure about that. He seems to be more of a comedian type actor. 
Translated: i am not sure about that he seems to be more of a comedian type actor
English: hello, kaise ho aap? kya aapko Iron man movie pasand hein?
Spanish: Hello how are you? How did you like the movie Iron Man? 
Translated: hello how are you how did you like the movie iron man
English: sad hein yar
Spanish: That is sad.
Translated: that is sad
English: ye wali kafi achi hai but kafi visual hai
Spanish: That one is really good but its really visual 
Translated: that one is really good but its really visual
E

In [223]:
translated = decode_sequence(transformer, "I am late today")
print(translated)

im doing well


In [216]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
corpus_score = corpus_bleu([[ref] for ref in reference_sentences], decoded_sentences)

print(f"Corpus BLEU score: {corpus_score*100:.2f}")

Corpus BLEU score: 77.42
