### Importing the dataset

In [None]:
import pandas as pd
# Parallel English/Hindi corpus; the preview below shows the columns
# `english_sentence` and `hindi_sentence` plus two leftover index columns.
DATASET_CSV = '/content/newdata[2].csv'
data = pd.read_csv(DATASET_CSV)

# Show the last rows as a quick sanity check of the load.
data.tail()

Unnamed: 0.1,Unnamed: 0,english_sentence,hindi_sentence
177601,49994,He asked you saw tiger or not .,उसने पूछा टाइगर देखा या नहीं ।
177602,49995,These words pricked like an arrow .,उसके यह शब्द तीर की तरह चुभ गए ।
177603,49996,I slowly said no .,"मैंने धीरे से कहा , नहीं ।"
177604,49997,There was no permission to take bike inside th...,पार्क में बाइक ले जाने की अनुमति नहीं थी ।
177605,49998,Somebody advised us that we should take safari...,हमें किसी ने सलाह दी कि आप अंदर के लिए सफारी ल...


### Loading the tokenizer and model

In [None]:
from transformers import MarianTokenizer, TFMarianMTModel
# The tokenizer and the TensorFlow Marian model are both loaded from the same
# pretrained English->Hindi checkpoint, so the name is stated once.
checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(checkpoint)
model = TFMarianMTModel.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# English side of the corpus, selected as a single column (a pandas Series).
de = data.loc[:, 'english_sentence']

In [None]:
# Display the English column; the notebook repr abbreviates the middle rows.
de

Unnamed: 0,english_sentence
0,politicians do not have permission to do what ...
1,"I'd like to tell you about one such child,"
2,This percentage is even greater than the perce...
3,what we really mean is that they're bad at not...
4,.The ending portion of these Vedas is called U...
...,...
177601,He asked you saw tiger or not .
177602,These words pricked like an arrow .
177603,I slowly said no .
177604,There was no permission to take bike inside th...


In [None]:
# Hindi side of the corpus, selected as a single column (a pandas Series).
dh = data.loc[:, 'hindi_sentence']

In [None]:
# Display the Hindi column; the notebook repr abbreviates the middle rows.
dh

Unnamed: 0,hindi_sentence
0,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...
177601,उसने पूछा टाइगर देखा या नहीं ।
177602,उसके यह शब्द तीर की तरह चुभ गए ।
177603,"मैंने धीरे से कहा , नहीं ।"
177604,पार्क में बाइक ले जाने की अनुमति नहीं थी ।


### Preprocessing the data

In [None]:
from transformers import MarianTokenizer

# Load the pretrained English->Hindi Marian tokenizer.
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# A handful of English sentences used to eyeball the tokenizer output.
# Several of them are cut off mid-word.
english_sentences = [
    "politicians do not have permission to do what",
    "I'd like to tell you about one such child",
    "This percentage is even greater than the perce",
    "what we really mean is that they're bad at not",
    ".The ending portion of these Vedas is called U",
    "He asked you saw tiger or not .",
    "These words pricked like an arrow .",
    "I slowly said no .",
    "There was no permission to take b"
]

# Encode the whole batch at once: every sentence is truncated/padded to exactly
# 128 token ids and returned as TensorFlow tensors (`input_ids` plus an
# `attention_mask` that is 0 on the padded positions).
tokenized_inputs = tokenizer(
    english_sentences,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors="tf",
)

print(tokenized_inputs)


{'input_ids': <tf.Tensor: shape=(9, 128), dtype=int32, numpy=
array([[21770,   110,    36, ..., 61949, 61949, 61949],
       [   56,    70,   232, ..., 61949, 61949, 61949],
       [  239, 16995,    23, ..., 61949, 61949, 61949],
       ...,
       [ 1055,   562, 28615, ..., 61949, 61949, 61949],
       [   56, 12214,   149, ..., 61949, 61949, 61949],
       [  547,    80,   177, ..., 61949, 61949, 61949]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(9, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}


In [None]:
# Translate the tokenized English sentences to Hindi.
#
# Decoding `tokenized_inputs['input_ids']` directly only turns the English ids
# back into English text. A translation requires running the model: `generate`
# produces target-language token ids, and those are what get decoded.
sample_output_ids = model.generate(
    input_ids=tokenized_inputs['input_ids'],
    # The inputs are padded to a fixed length, so pass the mask so the padded
    # positions are ignored by the encoder.
    attention_mask=tokenized_inputs['attention_mask'],
    max_length=128,
)
tokenized_hindi = tokenizer.batch_decode(sample_output_ids, skip_special_tokens=True)

# Print each source sentence next to its generated translation.
for english_sentence, hindi_translation in zip(english_sentences, tokenized_hindi):
    print(f"English: {english_sentence}")
    print(f"Hindi: {hindi_translation}")
    print()


English: politicians do not have permission to do what
Hindi: politicians do not have permission to do what

English: I'd like to tell you about one such child
Hindi: I'd like to tell you about one such child

English: This percentage is even greater than the perce
Hindi: This percentage is even greater than the perce

English: what we really mean is that they're bad at not
Hindi: what we really mean is that they're bad at not

English: .The ending portion of these Vedas is called U
Hindi: .The ending portion of these Vedas is called U

English: He asked you saw tiger or not .
Hindi: He asked you saw tiger or not.

English: These words pricked like an arrow .
Hindi: These words pricked like an arrow.

English: I slowly said no .
Hindi: I slowly said no.

English: There was no permission to take b
Hindi: There was no permission to take b



In [None]:
# Tokenize the full corpus (`data` is the pandas DataFrame loaded above).
# `.astype(str)` guards against non-string cells (e.g. missing values) that the
# tokenizer would otherwise reject.
#
# Tensors are requested as TensorFlow ("tf") tensors because the rest of the
# notebook is a TensorFlow pipeline; this also avoids needing PyTorch installed.
inputs = tokenizer(
    data['english_sentence'].astype(str).tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors="tf",
)

# The Hindi side must be passed as `text_target`: Marian tokenizers keep
# separate source/target SentencePiece models, and a plain call would segment
# the Hindi text with the English (source) model.
# NOTE(review): `text_target` needs a reasonably recent transformers release —
# confirm the installed version supports it.
targets = tokenizer(
    text_target=data['hindi_sentence'].astype(str).tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors="tf",
)


### Converting to TensorFlow tensors

In [None]:
import tensorflow as tf
# Encoder side: token ids and attention mask of the English sentences.
input_ids, attention_masks = (
    tf.convert_to_tensor(inputs[field]) for field in ('input_ids', 'attention_mask')
)

# Decoder side: token ids of the Hindi sentences, exactly as tokenized.
decoder_input_ids = tf.convert_to_tensor(targets['input_ids'])

### Shifting decoder_input_ids by one position to the right for teacher forcing

In [None]:
# Build the teacher-forcing pair from the tokenized Hindi targets:
#   labels[t]            = target[t]
#   decoder_input_ids[t] = target[t-1], with the decoder start token at t = 0
# so that at every step the decoder sees the previous token and must predict
# the current one, starting from the same start token `generate` uses.
#
# Reading from `targets` (rather than from `decoder_input_ids`) keeps this cell
# idempotent when it is re-run.
target_ids = tf.convert_to_tensor(targets['input_ids'])

# Shift right by dropping the last position and prepending the start token.
# (`tf.roll` is not suitable here: it wraps the displaced token around to the
# other end of the sequence instead of discarding it.)
start_column = tf.fill(
    tf.shape(target_ids[:, :1]),
    tf.cast(model.config.decoder_start_token_id, target_ids.dtype),
)
decoder_input_ids = tf.concat([start_column, target_ids[:, :-1]], axis=1)

# Mask padding in the labels with -100. The padding id must come from the
# tokenizer: it is not 0 for this checkpoint (the padded ids printed earlier
# end in 61949), so comparing against 0 would leave the padding unmasked.
# -100 is the ignore index used by the Transformers built-in loss; a plain
# Keras sparse cross-entropy does not skip it.
labels = tf.where(
    target_ids == tokenizer.pad_token_id,
    tf.cast(-100, target_ids.dtype),
    target_ids,
)

### Creating a TensorFlow dataset

In [None]:
# Model inputs, keyed by the argument names the model's call signature expects.
features = {
    'input_ids': input_ids,
    'attention_mask': attention_masks,
    'decoder_input_ids': decoder_input_ids,
}

# One (features, labels) element per sentence pair, shuffled through a
# 1000-element buffer and grouped into batches of 16.
dataset = (
    tf.data.Dataset.from_tensor_slices((features, labels))
    .shuffle(1000)
    .batch(16)
)

### Compiling the model

In [None]:
# Small learning rate for fine-tuning the pretrained checkpoint.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Compile WITHOUT an explicit Keras loss so that the model's built-in
# sequence-to-sequence loss is used. That loss skips label positions set to
# -100, which is how the labels in this notebook mark tokens to ignore; a plain
# `SparseCategoricalCrossentropy` treats -100 as an (invalid) class index.
# NOTE(review): relies on Transformers' Keras integration forwarding the
# dataset's label tensor to the model as `labels` — confirm for the installed
# transformers version.
model.compile(optimizer=optimizer)

### Fitting the model

In [None]:
# Fine-tune for a single pass (one epoch) over the batched dataset.
model.fit(dataset, epochs=1)



KeyboardInterrupt: 

### Translating English to Hindi

In [None]:
# Sentence to translate.
text = "Neha Seirah is a good girl"

# Encode the sentence into a batch of one sequence of token ids.
input_ids = tokenizer.encode(text, return_tensors="tf")

# Beam search with 4 beams, at most 128 generated tokens.
generated_ids = model.generate(
    input_ids,
    max_length=128,
    num_beams=4,
    early_stopping=True,
)

# Only one sentence was submitted, so the first row is the translation.
best_sequence = generated_ids[0]
translated_text = tokenizer.decode(best_sequence, skip_special_tokens=True)
print("Translated Hindi text: ", translated_text)