In [None]:
## Initializations
%pip install transformers



In [None]:
## Load Data
import pandas as pd
# Location of the input sentences, relative to the notebook's working directory.
csv_file_path = 'sentences.csv'

# header=0: the first line of the file supplies the column labels.
df = pd.read_csv(filepath_or_buffer=csv_file_path, header=0)

# Quick sanity check: list the column labels that were read.
column_labels = df.columns
print(column_labels)

Index(['type', 'underspecified sentence', 'control sentence 1',
       'control sentence 2', 'continuation of control sentence 1',
       'continuation of control sentence 2'],
      dtype='object')


In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Model and tokenizer are both loaded from the same pretrained checkpoint,
# so name it once.
m2m100_checkpoint = "facebook/m2m100_418M"

# The first call downloads the checkpoint files (see the progress bars in the
# cell output).
model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)
tokenizer = M2M100Tokenizer.from_pretrained(m2m100_checkpoint)

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

In [None]:
# Smoke test of the translation model on two fixed example sentences.
hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

# --- Hindi -> French ---
# The tokenizer must know the source language before it encodes the text.
tokenizer.src_lang = "hi"
encoded_hi = tokenizer(hi_text, return_tensors="pt")
# The target language is chosen by forcing its language id as the first
# generated token.
french_lang_id = tokenizer.get_lang_id("fr")
generated_tokens = model.generate(forced_bos_token_id=french_lang_id, **encoded_hi)
output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(output)
# prints: ['La vie est comme une boîte de chocolat.']

# --- Chinese -> English ---
tokenizer.src_lang = "zh"
encoded_zh = tokenizer(chinese_text, return_tensors="pt")
english_lang_id = tokenizer.get_lang_id("en")
generated_tokens = model.generate(forced_bos_token_id=english_lang_id, **encoded_zh)
output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(output)
# prints: ['Life is like a box of chocolate.']


['La vie est comme une boîte de chocolat.']
['Life is like a box of chocolate.']


In [None]:
## Sample data
# Draw up to `sample_count` sentences per sentence type, then shuffle the rows.
import random

sample_count = 2
# Fixed seed so that Restart & Run All reproduces the same sample.
sample_seed = 0

# Only the first two columns are needed downstream.
df2 = df.iloc[:, :2]

def sample_n_from_group(group, random_state=None):
    """Return up to `sample_count` rows of `group`, drawn without replacement.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one sentence type.
    random_state : optional
        Forwarded to `DataFrame.sample`; pass a value for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        `min(sample_count, len(group))` rows of `group`, so groups smaller
        than `sample_count` are returned in full instead of raising.
    """
    return group.sample(n=min(sample_count, len(group)), replace=False,
                        random_state=random_state)

# Derive one integer seed per draw from a single seeded generator. The generator
# is rebuilt on every run of this cell, so re-running the cell is idempotent.
seed_source = random.Random(sample_seed)

# Iterate over the groups explicitly instead of using groupby(...).apply(...):
# newer pandas versions stop passing the grouping column to the applied
# function, which would drop 'type' from the result and break the positional
# access (r.iloc[0]) in the translation cell.
sampled_groups = [
    sample_n_from_group(group, random_state=seed_source.randrange(2**32))
    for _, group in df2.groupby('type')
]
# pd.concat raises on an empty list, so fall back to an empty frame that keeps
# the column layout.
data = pd.concat(sampled_groups) if sampled_groups else df2.iloc[0:0]

# Shuffle so the rows are no longer ordered by type, and renumber them 0..n-1.
data = data.sample(frac=1, random_state=seed_source.randrange(2**32)).reset_index(drop=True)

print(data)

   type                            underspecified sentence
0     2  Andrei and Danny picked up the yellow chair an...
1     2   Andrei looked at Danny putting down a yellow bag
2     3             Andrei approached Danny; he held a bag
3     1              Andrei and Danny moved a yellow chair
4     1                     Yevgeni left Andrei; Danny too
5     3  Danny put down the bag and the chair; it was g...


In [None]:
## Translate data
# Translate every sampled sentence from `language_source` into each target
# language and collect one result row per (sentence, language) pair.
language_source = 'en'
language_target_names = ['German', 'Greek', 'English', 'Spanish', 'Persian',
                         'French', 'Hindi', 'Croatian', 'Italian', 'Korean',
                         'Dutch', 'Romanian', 'Russian', 'Turkish', 'Chinese']
language_target_labels = ['de', 'el', 'en', 'es', 'fa', 'fr', 'hi', 'hr', 'it',
                          'ko', 'nl', 'ro', 'ru', 'tr', 'zh']

# The two lists are paired positionally; zip() would silently drop trailing
# entries if they ever got out of sync.
assert len(language_target_names) == len(language_target_labels), \
    "language_target_names and language_target_labels must have the same length"

# The source language is the same for every sentence, so set it once instead
# of on every inner iteration.
tokenizer.src_lang = language_source

result = []
for i, r in data.iterrows():
  # `data` holds the sentence type in column 0 and the sentence in column 1.
  label = r.iloc[0]
  sentence = r.iloc[1]
  for l_name, l_label in zip(language_target_names, language_target_labels):
    if l_label != language_source:
      encoded = tokenizer(sentence, return_tensors="pt")
      generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(l_label))
      # batch_decode returns a list with one string per input sequence. A single
      # sentence was encoded, so take its only element. Otherwise translated
      # rows would hold a list while the source-language row holds a plain
      # string, and the CSV would contain "['...']" literals.
      output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    else:
      # No translation needed when the target equals the source language.
      output = sentence
    # Row layout: sentence index, type, language code, language name,
    # two '?' placeholders (meaning not visible here; presumably filled in
    # later, TODO confirm), original sentence, translation.
    result.append([i, label, l_label, l_name, '?', '?', sentence, output ])
df_result = pd.DataFrame(result)
print(df_result)

    0  1   2         3  4  5  \
0   0  2  de    German  ?  ?   
1   0  2  el     Greek  ?  ?   
2   0  2  en   English  ?  ?   
3   0  2  es   Spanish  ?  ?   
4   0  2  fa   Persian  ?  ?   
.. .. ..  ..       ... .. ..   
85  5  3  nl     Dutch  ?  ?   
86  5  3  ro  Romanian  ?  ?   
87  5  3  ru   Russian  ?  ?   
88  5  3  tr   Turkish  ?  ?   
89  5  3  zh   Chinese  ?  ?   

                                                    6  \
0   Andrei and Danny picked up the yellow chair an...   
1   Andrei and Danny picked up the yellow chair an...   
2   Andrei and Danny picked up the yellow chair an...   
3   Andrei and Danny picked up the yellow chair an...   
4   Andrei and Danny picked up the yellow chair an...   
..                                                ...   
85  Danny put down the bag and the chair; it was g...   
86  Danny put down the bag and the chair; it was g...   
87  Danny put down the bag and the chair; it was g...   
88  Danny put down the bag and the chair; it 

In [None]:
## Save output
import pandas as pd

# The file name records how many sentences were sampled per type.
output_csv_path = f"language_init_output_sample{sample_count}.csv"

# Write the translations as UTF-8 without the DataFrame's row index.
df_result.to_csv(output_csv_path, index=False, encoding='utf-8')