In [20]:
import numpy as np
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd
import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
import csv

with open("rus.txt", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    df = pd.DataFrame([row for row in reader], columns=[n for n in range(3)])

df.head()

Unnamed: 0,0,1,2
0,Go.,Марш!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Иди.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Идите.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Здравствуйте.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,Привет!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [3]:
df = df.drop(2, axis=1)

df.head()


Unnamed: 0,0,1
0,Go.,Марш!
1,Go.,Иди.
2,Go.,Идите.
3,Hi.,Здравствуйте.
4,Hi.,Привет!


In [4]:
df.shape

(392305, 2)

In [15]:
df = df.rename(columns={0: "output", 1: "input"})
df.tail()

Unnamed: 0,output,input
392300,"In today's world, we have to equip all our kid...",В современном мире перед нами стоит задача дат...
392301,Death is something that we're often discourage...,"Смерть - это зачастую то, разговоры или даже м..."
392302,"At a moment when our economy is growing, our b...","В тот момент, когда наша экономика растёт, наш..."
392303,Since there are usually multiple websites on a...,"Поскольку сайтов, посвящённых какой-либо теме,..."
392304,Doubtless there exists in this world precisely...,"Несомненно, для каждого мужчины в этом мире гд..."


In [16]:
def preprocess_sentence(w):

    w = w.lower().strip()

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [17]:
def create_dataset(df):
    word_pairs = [[preprocess_sentence(w) for w in df[['input', 'output']].values[i]] for i in range(len(df))]
    return zip(*word_pairs)


In [None]:
en, ru = create_dataset(df)
print(en[10])
print(ru[10])

In [6]:
from transformers import MarianMTModel, MarianTokenizer


model_name = "Helsinki-NLP/opus-mt-ru-en"  # replace with the desired model name
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# prepare the data for fine-tuning
#train_data = []
#for i, row in df.iterrows():
    #train_data.append({
        #'input_text': row['rus'],
        #'output_text': row['eng']
    #})
    





In [9]:
inputs=df['input'].tolist()
outputs=df['output'].tolist()
print(len(inputs), len(outputs))

392305 392305


In [10]:
inputs_encoded = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
outputs_encoded = tokenizer(outputs, return_tensors="pt", padding=True, truncation=True)

In [11]:
print(inputs_encoded)

{'input_ids': tensor([[ 6063,  1492,    56,  ..., 62517, 62517, 62517],
        [  137,   955,     3,  ..., 62517, 62517, 62517],
        [  137,  4069,    30,  ..., 62517, 62517, 62517],
        ...,
        [   49,  1066,  2114,  ..., 62517, 62517, 62517],
        [ 3987,    21, 32832,  ..., 62517, 62517, 62517],
        [26173,     2,    40,  ...,    60,     3,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [14]:
model.train(inputs_encoded)

ValueError: training mode is expected to be boolean

In [None]:
# evaluate the performance
test_data = [
    {'input_text': 'привет', 'target_text': 'hello'},
    {'input_text': 'книга', 'target_text': 'book'},
    # add more test data
]
test_df = pd.DataFrame(test_data)
inputs = tokenizer(test_df['input_text'].tolist(), return_tensors="pt", padding=True, truncation=True)
outputs = tokenizer(test_df['target_text'].tolist(), return_tensors="pt", padding=True, truncation=True)
predictions = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
accuracy = (test_df['target_text'] == decoded_preds).mean()
print(f"Accuracy: {accuracy}")

In [65]:
list(df['eng'].values[:10])

['Go.', 'Go.', 'Go.', 'Hi.', 'Hi.', 'Hi.', 'Hi.', 'Hi.', 'Run!', 'Run!']

In [7]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2)

In [8]:
train_seq = [(rus, eng) for rus, eng in train_df[["rus", "eng"]].values]
test_seq = [(rus, eng) for rus, eng in test_df[["rus", "eng"]].values]

In [9]:
train_seq[:5]

[('Том не мог надеть свои носки.', "Tom couldn't put his socks on."),
 ('Я хотел бы с тобой как-нибудь спеть.',
  "I'd like to sing with you sometime."),
 ('Я поехал туда на поезде.', 'I went there by train.'),
 ('До летних каникул всего неделя.',
  'The summer vacation is only a week away.'),
 ('Этому дереву около трёхсот лет.', 'This tree is about 300 years old.')]

In [10]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import os

In [11]:
# Load the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")

# Prepare the dataset
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")




In [12]:
# Tokenize the data
train_encoded = tokenizer(train_seq, return_tensors="pt", padding=True, truncation=True)
test_encoded = tokenizer(test_seq, return_tensors="pt", padding=True, truncation=True)

In [14]:
train_encoded.keys()

dict_keys(['input_ids', 'attention_mask'])

In [24]:
train_encoded['input_ids'][0]

tensor([ 1118,    46,    26,  1732,   730,    30,   320,   570,   217,  4215,
            3,  1089,  5717,    21,  3590,  3672,   136,   189,    18,    45,
          562,  6825,  3861,  2674,   144,   573, 11100,    23,    25,     3,
            0, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517,
        62517, 62517, 62517, 62517, 62517, 62517, 62517, 62517, 

In [32]:
print(len(train_encoded["input_ids"]))
print(len(test_encoded["input_ids"]))


313844
78461


In [56]:
# Define the training arguments for the Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    save_total_limit = 1,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
)

# Define the Seq2SeqTrainer and train the model
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                                 train_encodings['attention_mask']
                                                 ),
    eval_dataset=torch.utils.data.TensorDataset(test_encodings['input_ids'],
                                                test_encodings['attention_mask']
                                               )
)
                                               

trainer.train()



TypeError: vars() argument must have __dict__ attribute

In [8]:
from transformers import pipeline

# Create a pipeline for translation Russian to English

translator = pipeline("translation_ru_to_en", model="Helsinki-NLP/opus-mt-ru-en")



# Train the pipeline on the df, limiting the output translation to 40 characters
translator.fit(df["rus"], df["eng"], max_length=40)

# Print the pipeline
print(translator)


Downloading pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



AttributeError: 'TranslationPipeline' object has no attribute 'fit'

In [23]:
from transformers import pipeline

# Create a pipeline for translation Russian to English
translator = pipeline("translation_ru_to_en", model="Helsinki-NLP/opus-mt-ru-en")

# Translate some text
translated_text = translator("Привет, как дела?")

# Print the translated text
print(translated_text)



[{'translation_text': "Hey, how's it going?"}]
