In [66]:
## Initializations
%pip install transformers



In [67]:
## Load Data
import pandas as pd

# Number embedded in the name of every file written by the save cell.
sample_count = 2

# Input table; row 0 of the CSV supplies the column names.
csv_file_path = 'init_input.csv'
data = pd.read_csv(csv_file_path, header=0)

# Show which columns were loaded.
print(data.columns)

Index(['ID', 'Type', 'Source'], dtype='object')


In [68]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# The tokenizer and the translation model are loaded from the same checkpoint.
m2m100_checkpoint = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(m2m100_checkpoint)
model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)

In [69]:
import torch
## Translate data
# Translate every input sentence from `language_source` into each target
# language and keep two summaries of the encoder output per translated row.
language_source = 'en'
# language_target_names = ['German', 'Greek', 'English', 'Spanish', 'Persian',
#                          'French', 'Hindi', 'Croatian', 'Italian', 'Korean',
#                          'Dutch', 'Romanian', 'Russian', 'Turkish', 'Chinese']
# language_target_labels = ['de', 'el', 'en', 'es', 'fa', 'fr', 'hi', 'hr', 'it',
#                           'ko', 'nl', 'ro', 'ru', 'tr', 'zh']
language_target_names = ['Persian', 'Dutch']
language_target_labels = ['fa', 'nl']

# One row per (sentence, target language):
# [row index, label, target code, target name, source sentence, translation]
result_fw = []
# One entry per *translated* row; rows whose target equals the source language
# add nothing here, so these lists can be shorter than result_fw.
hiddens_fw_avg = []
hiddens_fw_last = []

# The source language is the same for every row, so set it once.
tokenizer.src_lang = language_source

# Inference only: without no_grad the explicit encoder call would attach an
# autograd graph to every hidden vector kept in the lists above.
with torch.no_grad():
  for i, r in data.iterrows():
    # Positional access: columns 1 and 2 ('Type' and 'Source' in the printed header).
    label = r.iloc[1]
    sentence = r.iloc[2]

    # The encoder only sees the source sentence, so its input ids and hidden
    # states do not depend on the target language; compute them once per row.
    input_ids = tokenizer(sentence, return_tensors="pt").input_ids
    hidden_states = model.get_encoder()(input_ids).last_hidden_state
    # Batch of one: average over dim 1, and separately take the final position.
    hidden_avg = torch.mean(hidden_states, dim=1)[0,:]
    hidden_last = hidden_states[0,-1,:]

    for l_name, l_label in zip(language_target_names, language_target_labels):
      if l_label != language_source:
        # forced_bos_token_id selects the output language. output_hidden_states
        # is not requested: generate() returns only token ids here, so the
        # extra states would be computed and thrown away.
        generated_tokens = model.generate(input_ids,
                                          forced_bos_token_id=tokenizer.get_lang_id(l_label))
        output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        hiddens_fw_avg.append(hidden_avg)
        hiddens_fw_last.append(hidden_last)
      else:
        # Nothing to translate when the target is the source language.
        output = sentence
      # `output` is a one-element list (batch_decode) or a plain string; join
      # yields the same text either way.
      result_fw.append([i, label, l_label, l_name, sentence, ''.join(output) ])
df_result_fw = pd.DataFrame(result_fw)
print(df_result_fw)

    0  1   2        3                                                  4  \
0   0  2  fa  Persian  Andrei and Danny picked up the yellow chair an...   
1   0  2  nl    Dutch  Andrei and Danny picked up the yellow chair an...   
2   1  2  fa  Persian   Andrei looked at Danny putting down a yellow bag   
3   1  2  nl    Dutch   Andrei looked at Danny putting down a yellow bag   
4   2  3  fa  Persian             Andrei approached Danny; he held a bag   
5   2  3  nl    Dutch             Andrei approached Danny; he held a bag   
6   3  1  fa  Persian              Andrei and Danny moved a yellow chair   
7   3  1  nl    Dutch              Andrei and Danny moved a yellow chair   
8   4  1  fa  Persian                     Yevgeni left Andrei; Danny too   
9   4  1  nl    Dutch                     Yevgeni left Andrei; Danny too   
10  5  3  fa  Persian  Danny put down the bag and the chair; it was g...   
11  5  3  nl    Dutch  Danny put down the bag and the chair; it was g...   

           

In [70]:
import torch
## Translate backward data
# Translate each forward translation in df_result_fw back into
# `language_source` and keep two summaries of the encoder output per row.

# One row per forward row:
# [df_result_fw row index, label, language code, language name,
#  forward translation, back-translation]
result_bw = []
# One entry per back-translated row (rows already in the source language add nothing).
hiddens_bw_avg = []
hiddens_bw_last = []

# Inference only: without no_grad the explicit encoder call would attach an
# autograd graph to every hidden vector kept in the lists above.
with torch.no_grad():
  for i, r in df_result_fw.iterrows():
    # df_result_fw has positional columns; see the forward cell for the layout.
    label = r.iloc[1]
    l_label = r.iloc[2]
    l_name = r.iloc[3]
    sentence = r.iloc[5]

    # The text to encode is now in the target language of the forward pass.
    tokenizer.src_lang = l_label
    if l_label != language_source:
      input_ids = tokenizer(sentence, return_tensors="pt").input_ids
      # output_hidden_states is not requested: generate() returns only token
      # ids here, so the extra states would be computed and thrown away.
      generated_tokens = model.generate(input_ids,
                                        forced_bos_token_id=tokenizer.get_lang_id(language_source))
      output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

      hidden_states = (model.get_encoder()(input_ids).last_hidden_state)
      # Batch of one: average over dim 1, and separately take the final position.
      hidden_avg = torch.mean(hidden_states, dim=1)[0,:]
      hidden_last = hidden_states[0,-1,:]
      hiddens_bw_avg.append(hidden_avg)
      hiddens_bw_last.append(hidden_last)
    else:
      # Row was never translated, so there is nothing to translate back.
      output = sentence
    # `output` is a one-element list (batch_decode) or a plain string; join
    # yields the same text either way.
    result_bw.append([i, label, l_label, l_name, sentence, ''.join(output) ])
df_result_bw = pd.DataFrame(result_bw)
print(df_result_bw)

     0  1   2        3                                                  4  \
0    0  2  fa  Persian          آندری و دنی صندلی زرد و کیسه را برداشتند.   
1    1  2  nl    Dutch  Andrei en Danny nemen de gele stoel en de koff...   
2    2  2  fa  Persian   آندری به دنی نگاه کرد و چمدان زرد را پایین آورد.   
3    3  2  nl    Dutch   Andrei keek Danny naar beneden met een gele tas.   
4    4  3  fa  Persian       آندری به دانی نزدیک شد؛ او چمدان را نگه داشت   
5    5  3  nl    Dutch          Andrei keek naar Danny; hij hield een tas   
6    6  1  fa  Persian                    آندری و دانی صندلی زرد می پوشند   
7    7  1  nl    Dutch         Andrei en Danny verplaatsen een gele stoel   
8    8  1  fa  Persian                  یوجینی آندری را ترک کرد؛ دانی نیز   
9    9  1  nl    Dutch                  Yevgeni verlaat Andrei; Danny ook   
10  10  3  fa  Persian        دانی کیسه و صندلی را پایین انداخت؛ سبز بود.   
11  11  3  nl    Dutch  Danny legde de tas en de stoel neer; het was g...   

In [71]:
## Save output
# Write both translation tables and the four hidden-state collections to disk,
# then bundle all six files into one zip archive.
import pandas as pd
import zipfile


def _output_name(tag, extension):
  """Return the output filename for `tag` (e.g. 'fw', 'bw_last') with `extension`."""
  return "language_translate_" + tag + "_output_sample" + str(sample_count) + extension


def _stack_hidden(vectors):
  """Return `vectors` as one tensor without autograd history.

  Accepts either the list of per-row vectors built by the translation cells or
  an already stacked tensor (when this cell is re-run), so the cell gives the
  same result every time. An empty list becomes an empty tensor instead of
  raising inside torch.stack.
  """
  if isinstance(vectors, torch.Tensor):
    return vectors.detach()
  if len(vectors) == 0:
    return torch.empty(0)
  return torch.stack(vectors).detach()


# Every file written below is recorded here, so the archive always contains
# exactly what was saved.
files_to_zip = []

for tag, frame in [("fw", df_result_fw), ("bw", df_result_bw)]:
  path = _output_name(tag, ".csv")
  frame.to_csv(path, index=False, encoding='utf-8')
  files_to_zip.append(path)

# Rebind the cell-level names to stacked tensors, as the original cell did;
# real errors now surface instead of being reported as "already stacked".
hiddens_fw_avg = _stack_hidden(hiddens_fw_avg)
hiddens_fw_last = _stack_hidden(hiddens_fw_last)
hiddens_bw_avg = _stack_hidden(hiddens_bw_avg)
hiddens_bw_last = _stack_hidden(hiddens_bw_last)

for tag, tensor in [("fw_avg", hiddens_fw_avg),
                    ("fw_last", hiddens_fw_last),
                    ("bw_avg", hiddens_bw_avg),
                    ("bw_last", hiddens_bw_last)]:
  path = _output_name(tag, ".pth")
  torch.save(tensor, path)
  files_to_zip.append(path)

# Path where the zip file will be created
zip_file_path = "language_translate_output_sample"+str(sample_count)+".zip"

# Create a new zip file and add each saved file to it
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
  for file in files_to_zip:
    zipf.write(file)

print("Zip file created successfully:", zip_file_path)


Zip file created successfully: language_translate_output_sample2.zip
