In [1]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers.adapters.composition import Stack
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoAdapterModel
from transformers import AdapterConfig
from transformers import TrainingArguments, AdapterTrainer
from datasets import concatenate_datasets
import numpy as np
from transformers import EvalPrediction

def compute_accuracy(p: EvalPrediction):
    """Compute classification accuracy from evaluation predictions.

    Args:
        p: object exposing ``predictions`` (scores, with the class scores
            laid out along axis 1) and ``label_ids`` (reference class ids).

    Returns:
        A dict with the single key ``"acc"``: the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    is_correct = predicted_classes == p.label_ids
    return {"acc": is_correct.mean()}

def get_data(lang, data):
    """Select the rows of ``data`` that belong to a single language.

    Args:
        lang: value to match against the ``language`` column.
        data: pandas DataFrame containing a ``language`` column.

    Returns:
        The subset of ``data`` whose ``language`` equals ``lang`` (original
        index labels are kept).
    """
    language_mask = data["language"] == lang
    return data[language_mask]

def get_languages(data):
    """Return the distinct values found in the ``language`` column of ``data``."""
    language_column = data["language"]
    return language_column.unique()

def encode_batch(examples, max_length=100):
    """Tokenize a batch of premise/hypothesis pairs with the module-level ``tokenizer``.

    Each premise and its hypothesis are encoded as two separate sequences,
    both padded/truncated to ``max_length``, and the two rows are then
    flattened into a single vector per example.

    Args:
        examples: mapping with parallel ``"premise"`` and ``"hypothesis"``
            sequences of strings (the batch format passed by a batched
            ``Dataset.map``).
        max_length: per-sequence token budget, so every example yields
            ``2 * max_length`` ids. Defaults to 100, the value that used to
            be hard-coded.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` lists holding one
        flattened entry per example.
    """
    all_encoded = {"input_ids": [], "attention_mask": []}
    for premise, hypothesis in zip(examples["premise"], examples["hypothesis"]):
        # encode separately to maintain CLS and SEP tokens
        encoded = tokenizer.batch_encode_plus(
            [premise, hypothesis],
            add_special_tokens=True,
            # `pad_to_max_length=True` is deprecated; this is its documented
            # replacement and pads every sequence to exactly `max_length`.
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
            truncation=True,
        )

        all_encoded["input_ids"].append(encoded["input_ids"].flatten())
        all_encoded["attention_mask"].append(encoded["attention_mask"].flatten())

    return all_encoded

def preprocess_dataset(dataset):
    """Tokenize ``dataset`` and expose only the columns used for training.

    Runs ``encode_batch`` over the dataset in batched mode, renames the
    ``gold_label`` column to ``labels`` and restricts the output format to
    ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        The renamed dataset with the restricted output format applied.
    """
    model_columns = ["input_ids", "attention_mask", "labels"]

    # Encode the input data
    tokenized = dataset.map(encode_batch, batched=True)
    # The transformers model expects the target class column to be named "labels"
    prepared = tokenized.rename_column("gold_label", "labels")
    # Only output the required columns (no tensor `type` is requested here)
    prepared.set_format(columns=model_columns)
    return prepared

In [2]:
def make_dataset(df, lang):
    """Build the preprocessed dataset for a single language.

    Rows of ``df`` for ``lang`` are selected with ``get_data``, rows with
    missing values are dropped, the string labels in ``gold_label`` are mapped
    to integer ids, and the frame is converted to a ``Dataset`` and passed
    through ``preprocess_dataset``.

    Args:
        df: pandas DataFrame with ``language``, ``premise``, ``hypothesis``
            and ``gold_label`` columns. It is not modified.
        lang: language code to select.

    Returns:
        The preprocessed dataset with the ``language``, ``premise`` and
        ``hypothesis`` columns removed.
    """
    label_to_id = {"entailment": 0, "neutral": 1}
    other_label_id = 2  # every remaining label, e.g. "contradiction"

    lang_frame = get_data(lang, df).dropna()
    print("Size of data for language", lang, lang_frame.shape)

    # assign() builds a new frame instead of writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    lang_frame = lang_frame.assign(
        gold_label=[
            label_to_id.get(label, other_label_id)
            for label in lang_frame["gold_label"]
        ]
    )

    # preserve_index=False keeps the pandas index out of the dataset, so there
    # is no "__index_level_0__" column whose presence we would have to rely on
    # when removing columns below.
    lang_dataset = Dataset.from_pandas(lang_frame, preserve_index=False)
    lang_dataset = preprocess_dataset(lang_dataset)

    return lang_dataset.remove_columns(["language", "premise", "hypothesis"])

In [3]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Example sentences ("Life is like a box of chocolates.") for trying out the translator.
hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

# Single source of truth for the checkpoint so the model and tokenizer cannot drift apart.
M2M100_CHECKPOINT = "facebook/m2m100_418M"

# NOTE: the recorded run of this cell was interrupted during the download
# (KeyboardInterrupt), so run it to completion before using `model`/`tokenizer`.
model = M2M100ForConditionalGeneration.from_pretrained(M2M100_CHECKPOINT)
tokenizer = M2M100Tokenizer.from_pretrained(M2M100_CHECKPOINT)

Downloading config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.80G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# translate Hindi to French
# NOTE(review): `tokenizer` is the same module-level object that `encode_batch`
# reads, so this `src_lang` assignment persists for any later tokenization —
# confirm that is intended.
tokenizer.src_lang = "hi"
# Tokenize the Hindi example sentence, requesting PyTorch tensors.
encoded_hi = tokenizer(hi_text, return_tensors="pt")
# Generate with the id of the target language ("fr") passed as `forced_bos_token_id`.
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
# Decode the generated ids to text; as the last expression, this is the cell's output.
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [6]:
# Load the tab-separated file data/train.tsv (path is relative to the working directory).
df = pd.read_csv("data/train.tsv", sep="\t")

In [8]:
# Show the rows whose language is not "en" (displayed only; `df` itself is unchanged).
df[df["language"] != "en"]

Unnamed: 0,gold_label,premise,hypothesis,language
100993,neutral,"Và, uh, nếu nó tăng và cứ tiếp tục tăng như vậ...","Nếu có sự thay đổi trong dòng điện, sẽ rất ngu...",vi
100994,contradiction,"phải, tôi cũng không hiểu điều gì khiến, anh b...",Nó đạt tỉ lệ 99%.,vi
100995,neutral,Anh ấy quay lại với Chúa Julian.,Anh muốn cầu xin Chúa Julian tha cho vợ mình.,vi
100996,entailment,"Giờ đến lượt cô đang tự bào chữa, giọng cô run...",The woman was angry and defensive.,vi
100997,neutral,Espinosa đã thu thập nhiều chuyện tình từ nhữn...,Espinosa bán lại những tác phẩm chuyện tình lã...,vi
...,...,...,...,...
114988,contradiction,"Dans l'ordre juridique postbellum, le même rés...",L'ordre légal parabellum avait des résultats t...,fr
114989,neutral,"Et euh, si ça montait brusquement et que ça co...","S'il y a une surtension d'électricité, c'est t...",fr
114990,entailment,C'était toujours une zone culturelle mais la b...,La plus grande partie de la région était en ba...,fr
114991,contradiction,Ce que je pense de vous peut n'avoir que très ...,Tu devrais vraiment t'intéresser à ce que je p...,fr
