In [None]:
from itertools import combinations
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and clean dataset functions
def load_data(filepath):
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filepath.endswith('.xlsx'):
        return pd.read_excel(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")

def clean_dataset(df):
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")

    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label
    df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]  # Ensure non-empty strings
    df = df[df['label'].apply(lambda x: isinstance(x, (int, np.integer)) or str(x).isdigit())]  # Ensure valid labels

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    print(f"Texts to tokenize: {texts[:5]}")  # Print the first 5 texts for debugging
    print(f"Labels: {labels[:5]}")            # Print the corresponding labels

    # Filter out any potential empty strings in the texts
    valid_data = [(text, label) for text, label in zip(texts, labels) if text.strip()]

    # Separate valid texts and labels
    valid_texts, valid_labels = zip(*valid_data) if valid_data else ([], [])

    if len(valid_texts) == 0:
        raise ValueError("No valid texts found after filtering.")

    encoding = tokenizer(
        valid_texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    dataset = DatasetWrapper(encoding, valid_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / dataset_len, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    model.eval()
    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / dataset_len, np.mean(losses), all_labels, all_preds

# Main function
def main():
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2
    PATIENCE = 3  # Early stopping patience
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian', 'turkish']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx',
        'turkish': '/turkish.csv'
    }

    results = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Iterate over combinations of 2 languages
    for lang_comb in combinations(languages, 2):
        print(f"Processing combination: {lang_comb}")

        # Load and clean datasets for the two languages
        df_lang1 = load_data(file_paths[lang_comb[0]])
        df_lang1 = clean_dataset(df_lang1)

        df_lang2 = load_data(file_paths[lang_comb[1]])
        df_lang2 = clean_dataset(df_lang2)

        # Check if datasets are empty after cleaning
        if df_lang1.empty:
            print(f"Warning: Dataset for {lang_comb[0]} is empty after cleaning. Skipping this combination.")
            continue
        if df_lang2.empty:
            print(f"Warning: Dataset for {lang_comb[1]} is empty after cleaning. Skipping this combination.")
            continue

        # Split each dataset into 80% train and 20% test
        if len(df_lang1) > 1 and len(df_lang2) > 1:  # Ensure there are enough samples to split
            df_train_lang1, df_test_lang1 = train_test_split(df_lang1, test_size=TEST_SPLIT, random_state=RANDOM_SEED)
            df_train_lang2, df_test_lang2 = train_test_split(df_lang2, test_size=TEST_SPLIT, random_state=RANDOM_SEED)
        else:
            print(f"Warning: Dataset too small for {lang_comb}. Skipping.")
            continue

        # Combine train datasets of the two languages
        #df_train_combined = pd.concat([df_train_lang1, df_train_lang2], ignore_index=True)

        # Combine train datasets of the two languages
        df_train_combined = pd.concat([df_train_lang1, df_train_lang2], ignore_index=True)

        # Create DataLoaders
        train_data_loader = create_data_loader(
            df_train_combined['title'].tolist(),
            df_train_combined['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )
        test_data_loader_lang1 = create_data_loader(
            df_test_lang1['title'].tolist(),
            df_test_lang1['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )
        test_data_loader_lang2 = create_data_loader(
            df_test_lang2['title'].tolist(),
            df_test_lang2['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        # Initialize model
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)

        best_loss = float('inf')
        patience_counter = 0

        # Training loop
        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train_combined)
            )
            print(f'Train loss {train_loss}, accuracy {train_acc}')

        # Evaluate on the two test sets separately
        print(f'Evaluating on {lang_comb[0]} test set...')
        test_acc_lang1, test_loss_lang1, labels_lang1, preds_lang1 = eval_model(
            model,
            test_data_loader_lang1,
            loss_fn,
            device,
            len(df_test_lang1)
        )
        precision_lang1 = precision_score(labels_lang1, preds_lang1, average='weighted')
        recall_lang1 = recall_score(labels_lang1, preds_lang1, average='weighted')
        f1_lang1 = f1_score(labels_lang1, preds_lang1, average='weighted')

        print(f'Evaluating on {lang_comb[1]} test set...')
        test_acc_lang2, test_loss_lang2, labels_lang2, preds_lang2 = eval_model(
            model,
            test_data_loader_lang2,
            loss_fn,
            device,
            len(df_test_lang2)
        )
        precision_lang2 = precision_score(labels_lang2, preds_lang2, average='weighted')
        recall_lang2 = recall_score(labels_lang2, preds_lang2, average='weighted')
        f1_lang2 = f1_score(labels_lang2, preds_lang2, average='weighted')

        # Save results for the current combination
        results.append({
            'languages': lang_comb,
            'test_results_lang1': {
                'accuracy': test_acc_lang1.item(),
                'precision': precision_lang1,
                'recall': recall_lang1,
                'f1': f1_lang1
            },
            'test_results_lang2': {
                'accuracy': test_acc_lang2.item(),
                'precision': precision_lang2,
                'recall': recall_lang2,
                'f1': f1_lang2
            }
        })

    print(f"Final results: {results}")

if __name__ == "__main__":
    main()


Using device: cuda




Processing combination: ('hebrew', 'arabic')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0,

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.42116064030537437, accuracy 0.803964591056972
Starting epoch 2/3
Train loss 0.30755317325792764, accuracy 0.872437012937883
Starting epoch 3/3
Train loss 0.259000814068942, accuracy 0.8893092229704168
Evaluating on hebrew test set...
Evaluating on arabic test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'bangla')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.5938983955214509, accuracy 0.6796008869179601
Starting epoch 2/3
Train loss 0.417978665470022, accuracy 0.811529933481153
Starting epoch 3/3
Train loss 0.3045252685196104, accuracy 0.8725055432372506
Evaluating on hebrew test set...
Evaluating on bangla test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'chinese')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.19886607851761626, accuracy 0.9205026828579498
Starting epoch 2/3
Train loss 0.08972263023331888, accuracy 0.9679469076532053
Starting epoch 3/3
Train loss 0.035207892487426846, accuracy 0.9885625529511438
Evaluating on hebrew test set...
Evaluating on chinese test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'english')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor David Nutt gathers momentum', 'Which "Clueless" Char

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.04999170400094451, accuracy 0.981127080181543
Starting epoch 2/3
Train loss 0.016949751701004056, accuracy 0.9940242057488653
Starting epoch 3/3
Train loss 0.004865134308490445, accuracy 0.9984114977307109
Evaluating on hebrew test set...
Evaluating on english test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'german')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn Mendes hat Schmetterlinge im Bauch']
Labels: [0, 0, 0, 0, 0]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.08724306658017048, accuracy 0.9657534246575342
Starting epoch 2/3
Train loss 0.03829377601019505, accuracy 0.9859464830719
Starting epoch 3/3
Train loss 0.01535160619432978, accuracy 0.9943395556817375
Evaluating on hebrew test set...
Evaluating on german test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'indonesian')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses Bayar Lunas Kekalahan di Kejuaraan Dunia 2019', 'Sepasang Pengungsi Rohingya Tewas dalam Baku Tembak di Bangladesh', 'Jadwal Wakil Indonesia

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.4887080680982559, accuracy 0.7700155763239875
Starting epoch 2/3
Train loss 0.37563844088228376, accuracy 0.8373831775700935
Starting epoch 3/3
Train loss 0.2820742687913282, accuracy 0.8876947040498443
Evaluating on hebrew test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'romanian')
Texts to tokenize: [' זקוקים להחלפת ברך? ניתוחי החלפת ברך בשיטה הטובה ביותר', '  "אף אחד לא רוצה להיראות כאילו עבר השתלת שיער" - השיטה שאתם צריכים להכיר', 'סקר ראשון אחר הבחירות: גנץ מתחזק, רה"מ נחלש', " שנתיים לאחר מותה הטרגי בגיל 23 - נחשף שיר חדש של תמר 'טאי' עמר", 'גאוני. צפו: כך נבנה השער הדרמטי של הנוער']
Labels: [1, 1, 0, 0, 1]
Texts to tokenize: [' אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...', ' ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי', '  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"', 'אלעל פתחה עמדות צ\'ק אין עצמי בנתב"ג', 'הטעות של בני 55+']
Labels: [1, 1, 1, 0, 1]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, Huawei sau Oppo, zdrobite de Xiaomi: cum a avut atât de câștigat', 'Spotify a lansat Greenroom, o rețea de socializare audio similară cu Clubhouse', 'Telefoane

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.3959115047082805, accuracy 0.8198888073009546
Starting epoch 2/3
Train loss 0.2629180858974289, accuracy 0.8939473408161125
Starting epoch 3/3
Train loss 0.18315472752293385, accuracy 0.9318157977551663
Evaluating on hebrew test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('hebrew', 'turkish')
Processing combination: ('arabic', 'bangla')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে ত

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.47591379560905395, accuracy 0.746345851135597
Starting epoch 2/3
Train loss 0.32677481710303313, accuracy 0.8578067611123603
Starting epoch 3/3
Train loss 0.279871130263884, accuracy 0.8787946930514954
Evaluating on arabic test set...
Evaluating on bangla test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'chinese')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.2910858484062725, accuracy 0.8727675965915723
Starting epoch 2/3
Train loss 0.18711275036267672, accuracy 0.9252558266215323
Starting epoch 3/3
Train loss 0.14276475946796513, accuracy 0.9426092369946695
Evaluating on arabic test set...
Evaluating on chinese test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'english')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First D

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.15984121724258418, accuracy 0.9297996155567844
Starting epoch 2/3
Train loss 0.10513235652008096, accuracy 0.9570266213760961
Starting epoch 3/3
Train loss 0.08499231410498569, accuracy 0.9643205097822367
Evaluating on arabic test set...
Evaluating on english test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'german')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn M

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.14010760762133947, accuracy 0.9418791333421708
Starting epoch 2/3
Train loss 0.08350484234638963, accuracy 0.9673898634616234
Starting epoch 3/3
Train loss 0.05822106578267339, accuracy 0.9765660671939669
Evaluating on arabic test set...
Evaluating on german test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'indonesian')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses Bayar Lunas Kekalahan 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.44897921340359165, accuracy 0.7900069737867662
Starting epoch 2/3
Train loss 0.3392272249739275, accuracy 0.8531812774336465
Starting epoch 3/3
Train loss 0.2698279174724276, accuracy 0.8882963449152891
Evaluating on arabic test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'romanian')
Texts to tokenize: ['انقسام إيراني حول التعامل بايدن', 'تفاصيل واقعة حى الشيخ جراح بالقدس محاولات إسرائيل لإخلاء منازل الفلسطينيين', 'الأهلى يحذر لاعبيه قبل الإجازة الاختلاط ممنوع ورفع الكمامة مرفوض', 'السويد تعلن وفاة امرأة بعد تطعيمها بلقاح «أسترازينيكا»', '#شرطة_الرياض: القبض 6 وافدين اقتحموا مقار شركات وسرقوا أموالاً ومعدات.\n\n']
Labels: [1, 1, 0, 1, 1]
Texts to tokenize: ['بالفيديو – شكل ابنتي #أشرف_زكي و #روجينا حديث #الجمهور... كشفتا والدهما؟', 'الموت يفجع حجاج عبد العظيم مرتين في 4 أيام', 'تاكوبيل تعيد البطاطا، الأمور إثارة!', 'الصحة: مد فترة الحملة القومية الثانية للتطعيم ضد مرض شلل الأطفال حتى غد', 'إحباط محاولة تهريب 20 مليون قرص أمفيتامين مخدر مخبأة داخل شحنة فاكهة العنب.\n']
Labels: [1, 0, 1, 0, 0]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, Huawei sau Oppo, zdrobite de Xiaomi: cum 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.40564835645773234, accuracy 0.810251542477456
Starting epoch 2/3
Train loss 0.2854398874058222, accuracy 0.8819648789748457
Starting epoch 3/3
Train loss 0.2261807980679185, accuracy 0.9083056478405315
Evaluating on arabic test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('arabic', 'turkish')
Processing combination: ('bangla', 'chinese')
Texts to tokenize: ['ভারতীয় দলে ‘গ্রুপিং’ চরমে, অবসর নেয়ার চিন্তায় কোহলি!', 'স্বামীর মাঝে নিজের বাবাকে খুঁজে পেয়েছেন প্রিয়াংকা', 'তুইও বিক্রি হয়ে গেলি, খেলতে নেমে গেলি : সায়নীকে শ্রীলেখা', 'চলতি মৌসুমে আজ ঢাকায় সর্বোচ্চ বৃষ্টি, তলিয়ে গেছে অধিকাংশ সড়ক', 'মেয়েকে ফিরে পেতে তওবা করে অভিনয় ছাড়েন শাবানা']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.2028451325050644, accuracy 0.9170632698768197
Starting epoch 2/3
Train loss 0.09309682321436454, accuracy 0.964095744680851
Starting epoch 3/3
Train loss 0.04021895558337085, accuracy 0.9860022396416573
Evaluating on bangla test set...
Evaluating on chinese test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('bangla', 'english')
Texts to tokenize: ['ভারতীয় দলে ‘গ্রুপিং’ চরমে, অবসর নেয়ার চিন্তায় কোহলি!', 'স্বামীর মাঝে নিজের বাবাকে খুঁজে পেয়েছেন প্রিয়াংকা', 'তুইও বিক্রি হয়ে গেলি, খেলতে নেমে গেলি : সায়নীকে শ্রীলেখা', 'চলতি মৌসুমে আজ ঢাকায় সর্বোচ্চ বৃষ্টি, তলিয়ে গেছে অধিকাংশ সড়ক', 'মেয়েকে ফিরে পেতে তওবা করে অভিনয় ছাড়েন শাবানা']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor Da

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.05452053626515445, accuracy 0.9772624604728204
Starting epoch 2/3
Train loss 0.02381183109289298, accuracy 0.9906640566179793
Starting epoch 3/3
Train loss 0.011149908165165102, accuracy 0.9958214124378859
Evaluating on bangla test set...
Evaluating on english test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('bangla', 'german')
Texts to tokenize: ['ভারতীয় দলে ‘গ্রুপিং’ চরমে, অবসর নেয়ার চিন্তায় কোহলি!', 'স্বামীর মাঝে নিজের বাবাকে খুঁজে পেয়েছেন প্রিয়াংকা', 'তুইও বিক্রি হয়ে গেলি, খেলতে নেমে গেলি : সায়নীকে শ্রীলেখা', 'চলতি মৌসুমে আজ ঢাকায় সর্বোচ্চ বৃষ্টি, তলিয়ে গেছে অধিকাংশ সড়ক', 'মেয়েকে ফিরে পেতে তওবা করে অভিনয় ছাড়েন শাবানা']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn Mendes hat Schmetterlinge im Bauch']
Labels: [0, 0, 0, 0, 0]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.09006822330845195, accuracy 0.9653328611898017
Starting epoch 2/3
Train loss 0.04010188399811019, accuracy 0.9840651558073654
Starting epoch 3/3
Train loss 0.01809772559397393, accuracy 0.9928824362606231
Evaluating on bangla test set...
Evaluating on german test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('bangla', 'indonesian')
Texts to tokenize: ['ভারতীয় দলে ‘গ্রুপিং’ চরমে, অবসর নেয়ার চিন্তায় কোহলি!', 'স্বামীর মাঝে নিজের বাবাকে খুঁজে পেয়েছেন প্রিয়াংকা', 'তুইও বিক্রি হয়ে গেলি, খেলতে নেমে গেলি : সায়নীকে শ্রীলেখা', 'চলতি মৌসুমে আজ ঢাকায় সর্বোচ্চ বৃষ্টি, তলিয়ে গেছে অধিকাংশ সড়ক', 'মেয়েকে ফিরে পেতে তওবা করে অভিনয় ছাড়েন শাবানা']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses Bayar Lunas Kekalahan di Kejuaraan Dunia 2019', 'Sepasang Pengungsi Rohingya Tewas dalam B

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.49017503311910404, accuracy 0.7664301141622956
Starting epoch 2/3
Train loss 0.37201290190881336, accuracy 0.8369330453563715
Starting epoch 3/3
Train loss 0.26471382764692886, accuracy 0.8907744523295279
Evaluating on bangla test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('bangla', 'romanian')
Texts to tokenize: ['ভারতীয় দলে ‘গ্রুপিং’ চরমে, অবসর নেয়ার চিন্তায় কোহলি!', 'স্বামীর মাঝে নিজের বাবাকে খুঁজে পেয়েছেন প্রিয়াংকা', 'তুইও বিক্রি হয়ে গেলি, খেলতে নেমে গেলি : সায়নীকে শ্রীলেখা', 'চলতি মৌসুমে আজ ঢাকায় সর্বোচ্চ বৃষ্টি, তলিয়ে গেছে অধিকাংশ সড়ক', 'মেয়েকে ফিরে পেতে তওবা করে অভিনয় ছাড়েন শাবানা']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['৬ দিন পর সৌদি আরব যাওয়ার কথা, তার আগে যুবকের রহস্যজনক মৃ’ত্যু', 'এই সেই সন্তান যিনি কিনা তার মাকে গর্ভ;বতী করে ফেলেন, উপায় না পেয়ে মা;য়ের সাথে বিয়ে!', 'কারিনার থেকে তিনগুণ বেশি পারিশ্রমিক কঙ্গনার!', 'ই-কমার্স ব্যবসায় সাকিব - Techzoom.TV', '২৫ লাখ টাকার মোটরসাইকেলে বিশ্বভ্রমণে অজিত']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, Huawei sau Oppo, zdrobite de Xiaomi: cum a avut atât de câștigat', 'Spotify a lansat Greenroom, o rețea de so

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.3862784422489981, accuracy 0.8241689965827897
Starting epoch 2/3
Train loss 0.26007045032641546, accuracy 0.8929274101687895
Starting epoch 3/3
Train loss 0.16529321892877782, accuracy 0.934348141244693
Evaluating on bangla test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('bangla', 'turkish')
Processing combination: ('chinese', 'english')
Texts to tokenize: ['毁童年！《还珠格格》里的大BOSS竟然是毫不起眼的她！！', '用歌声谢谢你，昌平战疫一线的人们！', '那个在优衣库啪啪啪的女生，出道了', '理学院关于在2018级新生中组织实施“逐梦彩虹人生 启航邮理相伴”新生引航工程的通知', '如何在PPT里添加音频，并跨页播放？']
Labels: [1, 0, 1, 0, 1]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor David Nutt gathers momentum', 'Which "Clueless" Character Are You Based On Your Zodiac Sign']
Labels: [0, 0, 1, 0, 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.08281504158341152, accuracy 0.9690422361525024
Starting epoch 2/3
Train loss 0.029967423318114906, accuracy 0.9894923440550818
Starting epoch 3/3
Train loss 0.010173918818434127, accuracy 0.9966344671667866
Evaluating on chinese test set...
Evaluating on english test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('chinese', 'german')
Texts to tokenize: ['毁童年！《还珠格格》里的大BOSS竟然是毫不起眼的她！！', '用歌声谢谢你，昌平战疫一线的人们！', '那个在优衣库啪啪啪的女生，出道了', '理学院关于在2018级新生中组织实施“逐梦彩虹人生 启航邮理相伴”新生引航工程的通知', '如何在PPT里添加音频，并跨页播放？']
Labels: [1, 0, 1, 0, 1]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn Mendes hat Schmetterlinge im Bauch']
Labels: [0, 0, 0, 0, 0]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.09880512463145977, accuracy 0.9637855897733876
Starting epoch 2/3
Train loss 0.043004906927528425, accuracy 0.9853718768158048
Starting epoch 3/3
Train loss 0.01666086958468321, accuracy 0.9946252178965718
Evaluating on chinese test set...
Evaluating on german test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('chinese', 'indonesian')
Texts to tokenize: ['毁童年！《还珠格格》里的大BOSS竟然是毫不起眼的她！！', '用歌声谢谢你，昌平战疫一线的人们！', '那个在优衣库啪啪啪的女生，出道了', '理学院关于在2018级新生中组织实施“逐梦彩虹人生 启航邮理相伴”新生引航工程的通知', '如何在PPT里添加音频，并跨页播放？']
Labels: [1, 0, 1, 0, 1]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses Bayar Lunas Kekalahan di Kejuaraan Dunia 2019', 'Sepasang Pengungsi Rohingya Tewas dalam Baku Tembak di Bangladesh', 'Jadwal Wakil Indonesia di Semifinal Vietnam Open 2019']
Labels: [0, 1, 0, 0, 0]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.31526250493508573, accuracy 0.8573685041857526
Starting epoch 2/3
Train loss 0.20838590882087876, accuracy 0.9124151003001105
Starting epoch 3/3
Train loss 0.13504842103756634, accuracy 0.9460985626283367
Evaluating on chinese test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('chinese', 'romanian')
Texts to tokenize: ['毁童年！《还珠格格》里的大BOSS竟然是毫不起眼的她！！', '用歌声谢谢你，昌平战疫一线的人们！', '那个在优衣库啪啪啪的女生，出道了', '理学院关于在2018级新生中组织实施“逐梦彩虹人生 启航邮理相伴”新生引航工程的通知', '如何在PPT里添加音频，并跨页播放？']
Labels: [1, 0, 1, 0, 1]
Texts to tokenize: ['他终于承认了长达10年的地下情', '“深圳没有早睡的人”', '“加点旋转更舒服？”男朋友居然和舍友做这种不可描述的事情！', '你为什么不联系微信好友了？', '这4个坏习惯最伤肾 你有做过吗？']
Labels: [1, 0, 1, 1, 1]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, Huawei sau Oppo, zdrobite de Xiaomi: cum a avut atât de câștigat', 'Spotify a lansat Greenroom, o rețea de socializare audio similară cu Clubhouse', 'Telefoanele ieftine cu Android vor fi mult mai bune, mai rapide și mai eficiente: ce s-a schimbat']
Labels: [1, 1, 1, 0, 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.23665291754093992, accuracy 0.8990779851932598
Starting epoch 2/3
Train loss 0.13386962877737985, accuracy 0.9483126674842167
Starting epoch 3/3
Train loss 0.07359250029929446, accuracy 0.9735658809102058
Evaluating on chinese test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('chinese', 'turkish')
Processing combination: ('english', 'german')
Texts to tokenize: ['Soccer Provides Oasis in Mexican City Ravaged by Drug War', 'Guys Try Tinder', 'Five police officers injured in Naples protest over new garbage tip', 'Michael B. Jordan Got Laid The Fuck Out While Filming "Creed"', 'International experts probe deadly Ebola Reston virus outbreak in Philippine pigs']
Labels: [0, 1, 0, 1, 0]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor David Nutt gathers momentum', 'Which "Clueless" Character Are You Based On Your Zodiac Sign']
Labels: [0, 0, 1, 0, 1]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbe

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.07047657952433259, accuracy 0.9751836875585581
Starting epoch 2/3
Train loss 0.026901164189739018, accuracy 0.9907293259036442
Starting epoch 3/3
Train loss 0.009271657040652212, accuracy 0.9969179939839243
Evaluating on english test set...
Evaluating on german test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('english', 'indonesian')
Texts to tokenize: ['Soccer Provides Oasis in Mexican City Ravaged by Drug War', 'Guys Try Tinder', 'Five police officers injured in Naples protest over new garbage tip', 'Michael B. Jordan Got Laid The Fuck Out While Filming "Creed"', 'International experts probe deadly Ebola Reston virus outbreak in Philippine pigs']
Labels: [0, 1, 0, 1, 0]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor David Nutt gathers momentum', 'Which "Clueless" Character Are You Based On Your Zodiac Sign']
Labels: [0, 0, 1, 0, 1]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Ma

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.18547694614702004, accuracy 0.9153457446808511
Starting epoch 2/3
Train loss 0.12628027204121958, accuracy 0.9449202127659574
Starting epoch 3/3
Train loss 0.08848643573589915, accuracy 0.9633776595744681
Evaluating on english test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('english', 'romanian')
Texts to tokenize: ['Soccer Provides Oasis in Mexican City Ravaged by Drug War', 'Guys Try Tinder', 'Five police officers injured in Naples protest over new garbage tip', 'Michael B. Jordan Got Laid The Fuck Out While Filming "Creed"', 'International experts probe deadly Ebola Reston virus outbreak in Philippine pigs']
Labels: [0, 1, 0, 1, 0]
Texts to tokenize: ['Filipino activist arrested for disrupting Manila Cathedral mass in Reproductive Health Bill protest', 'International Board fixes soccer field size, halts technology experiments', '24 Rules For Women On A First Date With A Man', 'Political fallout from the sacking of Professor David Nutt gathers momentum', 'Which "Clueless" Character Are You Based On Your Zodiac Sign']
Labels: [0, 0, 1, 0, 1]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, H

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.11711099286368601, accuracy 0.9496690286647421
Starting epoch 2/3
Train loss 0.0686485300834715, accuracy 0.972093430146094
Starting epoch 3/3
Train loss 0.04073147940173962, accuracy 0.9838450995830053
Evaluating on english test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('english', 'turkish')
Processing combination: ('german', 'indonesian')
Texts to tokenize: ['Helene Fischer über ihre Auszeit: „Ich werde schon noch wiederkommen“ ', '"Sturm der Liebe": Ariane will die Trennung | Lässt Christoph sie\n', 'Kultstar Terence Hill kommt zur Brückeneinweihung nach Worms ', 'Norwegen: Nach Breivik-Anschlägen sollen Anti-Terrorgesetze verschärft werden', 'Bundesschiedsgericht der Piratenpartei lehnt Parteiausschluss ab']
Labels: [0, 0, 0, 1, 1]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn Mendes hat Schmetterlinge im Bauch']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses B

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.15446065379683507, accuracy 0.9337638485692281
Starting epoch 2/3
Train loss 0.09612597698824821, accuracy 0.9599354227146158
Starting epoch 3/3
Train loss 0.06019564174822288, accuracy 0.975265122341371
Evaluating on german test set...
Evaluating on indonesian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('german', 'romanian')
Texts to tokenize: ['Helene Fischer über ihre Auszeit: „Ich werde schon noch wiederkommen“ ', '"Sturm der Liebe": Ariane will die Trennung | Lässt Christoph sie\n', 'Kultstar Terence Hill kommt zur Brückeneinweihung nach Worms ', 'Norwegen: Nach Breivik-Anschlägen sollen Anti-Terrorgesetze verschärft werden', 'Bundesschiedsgericht der Piratenpartei lehnt Parteiausschluss ab']
Labels: [0, 0, 0, 1, 1]
Texts to tokenize: ['Robert Geiss auf Krücken: "Ich brauche einen Arzt!" ', '„Promi Big Brother“ 2017: Das ist der Gewinner', 'Laura Müller: Werbepartner erntet Shitstorm', 'Netflix-Serie zu Claas Relotius-Skandal geplant ', 'Shawn Mendes hat Schmetterlinge im Bauch']
Labels: [0, 0, 0, 0, 0]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condițiile de utilizare, pe limba ta: ce vor oficialii', 'Apple, Huawei sau Oppo, zdrobite de Xiaomi: cum a avut atât de câștigat', 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.11988878521302485, accuracy 0.9538382469747233
Starting epoch 2/3
Train loss 0.06447986459924956, accuracy 0.9753928576990766
Starting epoch 3/3
Train loss 0.03590503679449516, accuracy 0.9867931286891246
Evaluating on german test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('german', 'turkish')
Processing combination: ('indonesian', 'romanian')
Texts to tokenize: ['Ibu Kota Baru Akan Miliki Transportasi Berkonsep Smart City', 'Chapter 956 One Piece Ungkapkan Kematian Orang Dekat Luffy?', 'Ayu Ting Ting Enggan Rayakan Ulang Tahun Keenam Bilqis, Kenapa?   ', 'Komisi IV DPR Setujui Anggaran Kementan Rp 21 T di 2020', 'Heboh Babi Masuk Masjid: Ibu-ibu Teriak, Si Babi Mati Ditembak']
Labels: [0, 1, 1, 0, 0]
Texts to tokenize: ['Arkeolog Temukan Situs David Vs Goliath seperti Disebut Alkitab', 'Tampil Classy dengan Rok Tutu, Ini 5 Inspirasi Hijab ala Aghnia Punjabi', 'Hasil China Open 2019 - Marcus/Kevin Sukses Bayar Lunas Kekalahan di Kejuaraan Dunia 2019', 'Sepasang Pengungsi Rohingya Tewas dalam Baku Tembak di Bangladesh', 'Jadwal Wakil Indonesia di Semifinal Vietnam Open 2019']
Labels: [0, 1, 0, 0, 0]
Texts to tokenize: ['Top 10 al celor mai doriți angajatori din România este dominat de companii de tehnologie', 'Termenii și condiții

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.42449577358857926, accuracy 0.8028802010341661
Starting epoch 2/3
Train loss 0.3143414980983808, accuracy 0.8644469144155028
Starting epoch 3/3
Train loss 0.2183651244195849, accuracy 0.9137872710578456
Evaluating on indonesian test set...
Evaluating on romanian test set...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Processing combination: ('indonesian', 'turkish')
Processing combination: ('romanian', 'turkish')
Final results: [{'languages': ('hebrew', 'arabic'), 'test_results_lang1': {'accuracy': 0.8293838862559242, 'precision': 0.829517616635037, 'recall': 0.8293838862559242, 'f1': 0.8293762215607874}, 'test_results_lang2': {'accuracy': 0.8655896607431341, 'precision': 0.8691414458247566, 'recall': 0.865589660743134, 'f1': 0.8650791697953828}}, {'languages': ('hebrew', 'bangla'), 'test_results_lang1': {'accuracy': 0.8341232227488152, 'precision': 0.8365070574902124, 'recall': 0.8341232227488151, 'f1': 0.8337871449938657}, 'test_results_lang2': {'accuracy': 0.7219917012448133, 'precision': 0.7126977788123341, 'recall': 0.7219917012448133, 'f1': 0.7134814133935211}}, {'languages': ('hebrew', 'chinese'), 'test_results_lang1': {'accuracy': 0.7914691943127963, 'precision': 0.7965670190788673, 'recall': 0.7914691943127962, 'f1': 0.7904805842984762}, 'test_results_lang2': {'accuracy': 0.968187274909963

In [None]:
pip install google-cloud-storage




In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and clean dataset functions
def load_data(filepath):
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filepath.endswith('.xlsx'):
        return pd.read_excel(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")

def clean_dataset(df):
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")
    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label
    df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]  # Ensure non-empty strings
    df = df[df['label'].apply(lambda x: isinstance(x, (int, np.integer)) or str(x).isdigit())]  # Ensure valid labels

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    encoding = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    dataset = DatasetWrapper(encoding, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / dataset_len, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    model.eval()
    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / dataset_len, np.mean(losses), all_labels, all_preds

# Main function to train on all data (split Hebrew) and test on Hebrew
def main():
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2  # Split ratio for the Hebrew dataset
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian', 'turkish']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx',
        'turkish': '/turkish.csv'
    }

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Load and clean the Hebrew dataset, then split it
    df_hebrew = load_data(file_paths['hebrew'])
    df_hebrew = clean_dataset(df_hebrew)
    df_train_hebrew, df_test_hebrew = train_test_split(df_hebrew, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

    # Combine all non-Hebrew datasets for training
    train_dfs = [df_train_hebrew]  # Start with Hebrew train split
    for lang in languages:
        if lang != 'hebrew':
            df = load_data(file_paths[lang])
            df = clean_dataset(df)
            train_dfs.append(df)

    # Combine all training data
    df_train_combined = pd.concat(train_dfs, ignore_index=True)

    # Create DataLoaders for training and testing
    train_data_loader = create_data_loader(
        df_train_combined['title'].tolist(),
        df_train_combined['label'].tolist(),
        tokenizer,
        MAX_LEN,
        BATCH_SIZE
    )

    test_data_loader_hebrew = create_data_loader(
        df_test_hebrew['title'].tolist(),
        df_test_hebrew['label'].tolist(),
        tokenizer,
        MAX_LEN,
        BATCH_SIZE
    )

    # Initialize the model
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # Training loop
    for epoch in range(EPOCHS):
        print(f'Starting epoch {epoch + 1}/{EPOCHS}')
        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train_combined)
        )
        print(f'Train loss {train_loss}, accuracy {train_acc}')

    # Evaluate on the Hebrew test set
    print(f'Evaluating on Hebrew test set...')
    test_acc, test_loss, labels, preds = eval_model(
        model,
        test_data_loader_hebrew,
        loss_fn,
        device,
        len(df_test_hebrew)
    )
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    print(f'Test Accuracy: {test_acc}')
    print(f'Test Precision: {precision}')
    print(f'Test Recall: {recall}')
    print(f'Test F1: {f1}')

if __name__ == "__main__":
    main()


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.1777877322611112, accuracy 0.9228796550955177
Starting epoch 2/3
Train loss 0.11068957795370421, accuracy 0.9549914517208058
Starting epoch 3/3
Train loss 0.0742959509393722, accuracy 0.9696845808865433
Evaluating on Hebrew test set...
Test Accuracy: 0.8341232227488152
Test Precision: 0.8341606945917059
Test Recall: 0.8341232227488151
Test F1: 0.8341232227488151


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.cloud import storage

# Function to load and clean dataset
def load_data(filepath):
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filepath.endswith('.xlsx'):
        return pd.read_excel(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")

def clean_dataset(df):
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")
    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label
    df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]  # Ensure non-empty strings
    df = df[df['label'].apply(lambda x: isinstance(x, (int, np.integer)) or str(x).isdigit())]  # Ensure valid labels

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    encoding = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    dataset = DatasetWrapper(encoding, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / dataset_len, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    model.eval()
    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / dataset_len, np.mean(losses), all_labels, all_preds

# Save model to Google Cloud Storage (GCS)
#def save_model_to_gcs(model, bucket_name, model_filename):
 #   local_model_path = f"/tmp/{model_filename}"
 #   torch.save(model.state_dict(), local_model_path)

  #  client = storage.Client()
  #  bucket = client.get_bucket(bucket_name)
  #  blob = bucket.blob(model_filename)
  #  blob.upload_from_filename(local_model_path)
  #  print(f"Model saved to GCS at gs://{bucket_name}/{model_filename}")

# Main function to train on 8 languages and test on Hebrew, save models
def main():
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
   # bucket_name = "your-bucket-name"  # Replace with your Google Cloud Storage bucket name
    print(f'Using device: {device}')

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian', 'turkish']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx',
        'turkish': '/turkish.csv'
    }

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Load and clean the Hebrew dataset (for testing)
    df_hebrew = load_data(file_paths['hebrew'])
    df_hebrew = clean_dataset(df_hebrew)

    # Split Hebrew data into training and testing
    df_train_hebrew, df_test_hebrew = train_test_split(df_hebrew, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

    # Loop to train 8 times, excluding one language each time
    for excluded_lang in languages:
        if excluded_lang == 'hebrew':
            continue  # We are not excluding Hebrew, it's used for testing

        print(f"Training model excluding: {excluded_lang}")

        # Combine datasets from all languages except the excluded one
        train_dfs = [df_train_hebrew]
        for lang in languages:
            if lang != excluded_lang and lang != 'hebrew':
                df = load_data(file_paths[lang])
                df = clean_dataset(df)
                train_dfs.append(df)

        # Combine all training data
        df_train_combined = pd.concat(train_dfs, ignore_index=True)

        # Create DataLoader for training and testing
        train_data_loader = create_data_loader(
            df_train_combined['title'].tolist(),
            df_train_combined['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        test_data_loader_hebrew = create_data_loader(
            df_test_hebrew['title'].tolist(),
            df_test_hebrew['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        # Initialize the model
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)

        # Training loop
        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train_combined)
            )
            print(f'Train loss {train_loss}, accuracy {train_acc}')

        # Evaluate on the Hebrew test set
        print(f'Evaluating on Hebrew test set...')
        test_acc, test_loss, labels, preds = eval_model(
            model,
            test_data_loader_hebrew,
            loss_fn,
            device,
            len(df_test_hebrew)
        )
        precision = precision_score(labels, preds, average='weighted')
        recall = recall_score(labels, preds, average='weighted')
        f1 = f1_score(labels, preds, average='weighted')

        print(f'Test Accuracy: {test_acc}')
        print(f'Test Precision: {precision}')
        print(f'Test Recall: {recall}')
        print(f'Test F1: {f1}')

        # Save the trained model to GCS
        model_filename = f"bert_model_excluding_{excluded_lang}.pth"
       # save_model_to_gcs(model, bucket_name, model_filename)

if __name__ == "__main__":
    main()


Using device: cuda




Training model excluding: arabic


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.14669799948655596, accuracy 0.938279301745636
Starting epoch 2/3
Train loss 0.0885182198068112, accuracy 0.9638266969937793
Starting epoch 3/3
Train loss 0.05245648598935557, accuracy 0.9795086459674989
Evaluating on Hebrew test set...
Test Accuracy: 0.8578199052132702
Test Precision: 0.857961301665071
Test Recall: 0.8578199052132701
Test F1: 0.8578135179673229
Training model excluding: bangla


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.1691623266761403, accuracy 0.9274297732648489
Starting epoch 2/3
Train loss 0.10746378200727025, accuracy 0.9569746178954135
Starting epoch 3/3
Train loss 0.0723411330467237, accuracy 0.9708233737541425
Evaluating on Hebrew test set...
Test Accuracy: 0.8388625592417063
Test Precision: 0.8393792057455183
Test Recall: 0.8388625592417062
Test F1: 0.8388191181631409
Training model excluding: chinese


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.17347462150120413, accuracy 0.9254316894598701
Starting epoch 2/3
Train loss 0.11372298131517568, accuracy 0.9529147672330432
Starting epoch 3/3
Train loss 0.07957243802399701, accuracy 0.9676267440254178
Evaluating on Hebrew test set...
Test Accuracy: 0.8483412322274883
Test Precision: 0.8495119764314076
Test Recall: 0.8483412322274881
Test F1: 0.8482389811704677
Training model excluding: english


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.20743749050166785, accuracy 0.9104885812293334
Starting epoch 2/3
Train loss 0.13565948560541957, accuracy 0.9445594734077073
Starting epoch 3/3
Train loss 0.09321078142880164, accuracy 0.9627769708581847
Evaluating on Hebrew test set...
Test Accuracy: 0.8530805687203792
Test Precision: 0.8568916881914764
Test Recall: 0.8530805687203792
Test F1: 0.8526435548409166
Training model excluding: german


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.2387304182154496, accuracy 0.8939048239895697
Starting epoch 2/3
Train loss 0.16195849437323573, accuracy 0.9311495002172968
Starting epoch 3/3
Train loss 0.11564773336278845, accuracy 0.9529009126466754
Evaluating on Hebrew test set...
Test Accuracy: 0.8246445497630333
Test Precision: 0.8248836905952432
Test Recall: 0.8246445497630331
Test F1: 0.8245972682125817
Training model excluding: indonesian


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.14218489176923493, accuracy 0.941503455434456
Starting epoch 2/3
Train loss 0.08237388510862123, accuracy 0.9672143461990221
Starting epoch 3/3
Train loss 0.05303919568060689, accuracy 0.9786118167663689
Evaluating on Hebrew test set...
Test Accuracy: 0.8341232227488152
Test Precision: 0.8356521374886304
Test Recall: 0.8341232227488151
Test F1: 0.8339666121015284
Training model excluding: romanian


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.15940602812644888, accuracy 0.9321440668397877
Starting epoch 2/3
Train loss 0.1008417455350309, accuracy 0.9586701113775079
Starting epoch 3/3
Train loss 0.06872377068983439, accuracy 0.9720393972198792
Evaluating on Hebrew test set...
Test Accuracy: 0.8246445497630333
Test Precision: 0.8249308527226901
Test Recall: 0.8246445497630331
Test F1: 0.8246209153604135
Training model excluding: turkish


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Starting epoch 1/3
Train loss 0.17953420400624007, accuracy 0.9224026858941005
Starting epoch 2/3
Train loss 0.11074631710332157, accuracy 0.9543967888203375
Starting epoch 3/3
Train loss 0.07474985972121385, accuracy 0.9698580242325132
Evaluating on Hebrew test set...
Test Accuracy: 0.8436018957345972
Test Precision: 0.8444102704109273
Test Recall: 0.8436018957345972
Test F1: 0.843531612348791


In [9]:
#all langs except arabic + predictions
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to load and clean dataset
def load_data(filepath):
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filepath.endswith('.xlsx'):
        return pd.read_excel(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")

def clean_dataset(df):
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")
    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label
    df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]  # Ensure non-empty strings
    df = df[df['label'].apply(lambda x: isinstance(x, (int, np.integer)) or str(x).isdigit())]  # Ensure valid labels

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    encoding = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    dataset = DatasetWrapper(encoding, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / dataset_len, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    model.eval()
    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / dataset_len, np.mean(losses), all_labels, all_preds

# Main function to train on all languages except Arabic and test on Hebrew
def main():
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian', 'turkish']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx',
        'turkish': '/turkish.csv'
    }

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Load and clean the Hebrew dataset (for testing)
    df_hebrew = load_data(file_paths['hebrew'])
    df_hebrew = clean_dataset(df_hebrew)

    # Split Hebrew data into training and testing
    df_train_hebrew, df_test_hebrew = train_test_split(df_hebrew, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

    # Combine datasets from all languages except Arabic
    train_dfs = [df_train_hebrew]  # Hebrew is included in the training set
    for lang in languages:
        if lang != 'arabic' and lang != 'hebrew':
            df = load_data(file_paths[lang])
            df = clean_dataset(df)
            train_dfs.append(df)

    # Combine all training data
    df_train_combined = pd.concat(train_dfs, ignore_index=True)

    # Create DataLoader for training and testing
    train_data_loader = create_data_loader(
        df_train_combined['title'].tolist(),
        df_train_combined['label'].tolist(),
        tokenizer,
        MAX_LEN,
        BATCH_SIZE
    )

    test_data_loader_hebrew = create_data_loader(
        df_test_hebrew['title'].tolist(),
        df_test_hebrew['label'].tolist(),
        tokenizer,
        MAX_LEN,
        BATCH_SIZE
    )

    # Initialize the model
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # Training loop
    for epoch in range(EPOCHS):
        print(f'Starting epoch {epoch + 1}/{EPOCHS}')
        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train_combined)
        )
        print(f'Train loss {train_loss}, accuracy {train_acc}')

    # Evaluate on the Hebrew test set
    print(f'Evaluating on Hebrew test set...')
    test_acc, test_loss, labels, preds = eval_model(
        model,
        test_data_loader_hebrew,
        loss_fn,
        device,
        len(df_test_hebrew)
    )

    # Modify to use the correct order of labels and predictions in the batch
    for i in range(len(labels)):
        title = df_test_hebrew['title'].iloc[i]  # This indexing should be reconsidered
        true_label = labels[i]  # True labels from evaluation
        predicted_label = preds[i]  # Predicted labels from evaluation
        correctness = "טוב" if true_label == predicted_label else "רע"
        print(title)
     #if true_label != predicted_label:
        print(f"Title: {title}")
        print(f"True label: {true_label}, Predicted label: {predicted_label}, {correctness}")
        print("")


    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    print(f'Test Accuracy: {test_acc}')
    print(f'Test Precision: {precision}')
    print(f'Test Recall: {recall}')
    print(f'Test F1: {f1}')

if __name__ == "__main__":
    main()


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss 0.14683798568623488, accuracy 0.9385944479460689
Starting epoch 2/3
Train loss 0.08883747794400945, accuracy 0.964045929133211
Starting epoch 3/3
Train loss 0.053359373515291394, accuracy 0.9786865254446302
Evaluating on Hebrew test set...
 אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...
Title:  אל תשקיעו במערכת חימום גדולה עד שתראו את ההמצאה המהפכנית הזו...
True label: 0, Predicted label: 0, טוב

 ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי
Title:  ארגז כלים לטיפול בכאבי גב תחתון באופן עצמאי
True label: 1, Predicted label: 1, טוב

  עינב בובליל: "זה המוצר שגמל אותי ממתוקים"
Title:   עינב בובליל: "זה המוצר שגמל אותי ממתוקים"
True label: 1, Predicted label: 1, טוב

אלעל פתחה עמדות צ'ק אין עצמי בנתב"ג
Title: אלעל פתחה עמדות צ'ק אין עצמי בנתב"ג
True label: 1, Predicted label: 1, טוב

הטעות של בני 55+
Title: הטעות של בני 55+
True label: 1, Predicted label: 0, רע

 מיכאל בן דוד: 'כעסתי על אמא שלי שלא הגיעה לחתונה שלי'
Title:  מיכאל בן דוד: 'כעסתי על אמא שלי שלא הג

In [8]:
for i in range(len(labels)):
        title = df_test_hebrew['title'].iloc[i]  # This indexing should be reconsidered
        true_label = labels[i]  # True labels from evaluation
        predicted_label = preds[i]  # Predicted labels from evaluation
        correctness = "טוב" if true_label == predicted_label else "רע"
        print(true_label)
   # if true_label != predicted_label:
   #     print(f"Title: {title}")
   #     print(f"True label: {true_label}, Predicted label: {predicted_label}, {correctness}")
   #     print("")

NameError: name 'labels' is not defined

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to load and clean dataset
def load_data(filepath):
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filepath.endswith('.xlsx'):
        return pd.read_excel(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")

def clean_dataset(df):
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")
    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label
    df = df[df['title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]  # Ensure non-empty strings
    df = df[df['label'].apply(lambda x: isinstance(x, (int, np.integer)) or str(x).isdigit())]  # Ensure valid labels

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    encoding = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    dataset = DatasetWrapper(encoding, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / dataset_len, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    model.eval()
    losses = []
    correct_predictions = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return correct_predictions.double() / dataset_len, np.mean(losses), all_labels, all_preds

# Main function to train and test models
def main():
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx'
    }

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Load and clean datasets for all languages
    dfs = {lang: clean_dataset(load_data(file_paths[lang])) for lang in languages}

    # Initialize a list to store the results
    results = []

    # Loop to train on each language and test on all others
    for train_lang in languages:
        print(f"\nTraining on {train_lang}")
        df_train = dfs[train_lang]

        # Train/Test split for the training language
        df_train_lang, df_val_lang = train_test_split(df_train, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

        # Create DataLoader for training
        train_data_loader = create_data_loader(
            df_train_lang['title'].tolist(),
            df_train_lang['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        val_data_loader = create_data_loader(
            df_val_lang['title'].tolist(),
            df_val_lang['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        # Initialize the model
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)

        # Training loop
        for epoch in range(EPOCHS):
            print(f'Epoch {epoch + 1}/{EPOCHS}')
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train_lang)
            )
            print(f'Train loss: {train_loss}, accuracy: {train_acc}')

        # Evaluate on all other languages
        for test_lang in languages:
            if test_lang == train_lang:
                continue  # Skip the training language for testing

            print(f"\nEvaluating on {test_lang}")
            df_test = dfs[test_lang]

            test_data_loader = create_data_loader(
                df_test['title'].tolist(),
                df_test['label'].tolist(),
                tokenizer,
                MAX_LEN,
                BATCH_SIZE
            )

            test_acc, test_loss, labels, preds = eval_model(
                model,
                test_data_loader,
                loss_fn,
                device,
                len(df_test)
            )

            precision = precision_score(labels, preds, average='weighted')
            recall = recall_score(labels, preds, average='weighted')
            f1 = f1_score(labels, preds, average='weighted')

            # Store the result
            results.append({
                'Source Language': train_lang,
                'Target Language': test_lang,
                'Accuracy': test_acc.item(),  # Convert to float for compatibility with DataFrame
                'Precision': precision,
                'Recall': recall,
                'F1-Score': f1
            })

    # Convert the results to a DataFrame and print as a table
    results_df = pd.DataFrame(results)
    print("\nCross-Language Results:")
    print(results_df)

if __name__ == "__main__":
    main()


Using device: cuda





Training on hebrew


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.5344139430320488, accuracy: 0.7285714285714286
Epoch 2/3
Train loss: 0.31471379312141884, accuracy: 0.8726190476190477
Epoch 3/3
Train loss: 0.18415789861442908, accuracy: 0.9416666666666668

Evaluating on arabic

Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on arabic


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.4082080039222188, accuracy: 0.8072230750585764
Epoch 2/3
Train loss: 0.3063827399020047, accuracy: 0.8748485093318252
Epoch 3/3
Train loss: 0.26823153216824974, accuracy: 0.8864021976246264

Evaluating on hebrew

Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on bangla


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.5924757650641145, accuracy: 0.6721991701244814
Epoch 2/3
Train loss: 0.4838067510577499, accuracy: 0.7572614107883817
Epoch 3/3
Train loss: 0.358433407593946, accuracy: 0.8537344398340249

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on chinese


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.18551950572066384, accuracy: 0.9256229360552387
Epoch 2/3
Train loss: 0.07580692258983233, accuracy: 0.973131191834284
Epoch 3/3
Train loss: 0.031064480184527093, accuracy: 0.9905433803662564

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on english


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.036678601822732165, accuracy: 0.9878125
Epoch 2/3
Train loss: 0.009648211446710775, accuracy: 0.9971093750000001
Epoch 3/3
Train loss: 0.0027898219449662065, accuracy: 0.9992578125

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on german


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.07802034882167341, accuracy: 0.9718099286692126
Epoch 2/3
Train loss: 0.03185688894267965, accuracy: 0.988759997117948
Epoch 3/3
Train loss: 0.012535029535835357, accuracy: 0.9956769219684416

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on indonesian


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3
Train loss: 0.500509320229292, accuracy: 0.7618333333333334
Epoch 2/3
Train loss: 0.39669222151239714, accuracy: 0.825
Epoch 3/3
Train loss: 0.33304236811896165, accuracy: 0.85725

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on romanian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training on romanian


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train loss: 0.3839570510968128, accuracy: 0.8253767399056713
Epoch 2/3
Train loss: 0.2559551752090235, accuracy: 0.8937075808121477
Epoch 3/3
Train loss: 0.17417108927860253, accuracy: 0.9282181065224894

Evaluating on hebrew

Evaluating on arabic


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on bangla


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on chinese


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on english


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on german


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Evaluating on indonesian


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Cross-Language Results:
   Source Language Target Language  Accuracy  Precision    Recall  F1-Score
0           hebrew          arabic  0.514995   0.527476  0.514995  0.452858
1           hebrew          bangla  0.670539   0.732278  0.670539  0.677997
2           hebrew         chinese  0.822226   0.822998  0.822226  0.804932
3           hebrew         english  0.890062   0.893807  0.890062  0.889801
4           hebrew          german  0.457678   0.664310  0.457678  0.538242
5           hebrew      indonesian  0.691533   0.712008  0.691533  0.663056
6           hebrew        romanian  0.728720   0.727873  0.728720  0.727772
7           arabic          hebrew  0.411989   0.411209  0.411989  0.408571
8           arabic          bangla  0.451452   0.521020  0.451452  0.460844
9           arabic         chinese  0.467219   0.570224  0.467219  0.492709
10          arabic         english  0.446844   0.442482  0.446844  0.436135
11          arabic          german  0.392155   0.730122  0.3921

In [None]:
gcloud storage buckets list


SyntaxError: invalid syntax (<ipython-input-10-277ce96ec27d>, line 1)