In [None]:
# Multilingual experiment: fine-tune on every pair of languages and test on each language of the pair
from itertools import combinations
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and clean dataset functions
def load_data(filepath):
    """Read a tabular dataset from disk into a DataFrame.

    The reader is selected from the file extension. The comparison is
    case-insensitive, so ``data.CSV`` is treated like ``data.csv``.

    Args:
        filepath: Location of a ``.csv`` or ``.xlsx`` file, given as a string
            or any object whose ``str()`` is the path (e.g. ``pathlib.Path``).

    Returns:
        Whatever ``pd.read_csv`` / ``pd.read_excel`` returns for the file.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    # Only the extension test uses the normalised form; the readers still
    # receive the caller's original path object.
    normalized = str(filepath).lower()
    if normalized.endswith('.csv'):
        return pd.read_csv(filepath)
    if normalized.endswith('.xlsx'):
        return pd.read_excel(filepath)
    raise ValueError(f"Unsupported file format: {filepath}")

def _is_integer_label(value):
    """Return True if *value* can be turned into an int without losing information."""
    if isinstance(value, (int, np.integer)):
        return True
    if isinstance(value, (float, np.floating)):
        # pandas upcasts an integer column to float as soon as the file has a
        # missing value, so 1.0 must count as a valid label. NaN/inf give False.
        return float(value).is_integer()
    return str(value).isdigit()


def clean_dataset(df):
    """Return a cleaned copy of *df* with usable ``title`` and ``label`` columns.

    Rows are dropped when the title or label is missing, when the title is not
    a non-blank string, or when the label is not integer-like (an int, an
    integral float such as ``1.0``, or a string of digits).

    Args:
        df: DataFrame containing at least the columns ``title`` and ``label``.
            It is not modified.

    Returns:
        A new DataFrame holding the surviving rows, with ``title`` cast to
        ``str`` and ``label`` cast to ``int``. The original index is kept.

    Raises:
        ValueError: If ``title`` or ``label`` is not a column of *df*.
    """
    if 'title' not in df.columns or 'label' not in df.columns:
        raise ValueError("The dataframe must have 'title' and 'label' columns.")

    df = df.dropna(subset=['title', 'label'])  # Ensure no NaNs in title or label

    # Masks are built as explicit boolean arrays so that an empty frame still
    # produces a valid boolean indexer instead of an empty column selection.
    title_ok = np.array(
        [isinstance(title, str) and len(title.strip()) > 0 for title in df['title']],
        dtype=bool,
    )
    df = df[title_ok]

    label_ok = np.array([_is_integer_label(label) for label in df['label']], dtype=bool)
    # Copy before assigning so the casts below never write into a view of the
    # caller's frame (and never raise SettingWithCopyWarning).
    df = df[label_ok].copy()

    df['title'] = df['title'].astype(str)
    df['label'] = df['label'].astype(int)
    return df

# Dataset wrapper to handle input data
class DatasetWrapper(Dataset):
    """Expose tokenizer output and labels as a map-style torch ``Dataset``.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (tensors or nested lists).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of *value*."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # clone().detach() is the recommended way to copy a tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for example *idx* plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

def create_data_loader(texts, labels, tokenizer, max_len, batch_size, shuffle=True):
    """Tokenize *texts* and wrap them with *labels* in a ``DataLoader``.

    Texts that are empty after stripping whitespace are dropped together with
    their labels before tokenization.

    Args:
        texts: Sequence of strings to tokenize.
        labels: Sequence of labels aligned with *texts*.
        tokenizer: Callable invoked with the list of texts and the keyword
            arguments ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors``.
        max_len: Maximum sequence length passed to the tokenizer.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data on every pass.
            Defaults to ``True``, the behavior before this parameter existed.

    Returns:
        A ``DataLoader`` over a ``DatasetWrapper`` of the kept examples.

    Raises:
        ValueError: If no text remains after filtering.
    """
    print(f"Texts to tokenize: {texts[:5]}")  # Print the first 5 texts for debugging
    print(f"Labels: {labels[:5]}")            # Print the corresponding labels

    # Collect into real lists (not the tuples zip(*) would give) so the
    # tokenizer receives a plain list of strings.
    valid_texts = []
    valid_labels = []
    for text, label in zip(texts, labels):
        if text.strip():
            valid_texts.append(text)
            valid_labels.append(label)

    if not valid_texts:
        raise ValueError("No valid texts found after filtering.")

    encoding = tokenizer(
        valid_texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    dataset = DatasetWrapper(encoding, valid_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, dataset_len):
    """Run one optimisation pass over *data_loader*.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches, each a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        loss_fn: Unused; the loss is taken from the model output. Kept so
            existing callers do not break.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        dataset_len: Number of examples, used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a double tensor (correct
        predictions divided by *dataset_len*); mean_loss is the average batch
        loss, or NaN when the loader yields no batches.
    """
    model.train()
    losses = []
    # A tensor accumulator keeps `.double()` valid even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        # Clear gradients before the forward/backward pass so gradients
        # already stored on the parameters cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / dataset_len, mean_loss

def eval_model(model, data_loader, loss_fn, device, dataset_len):
    """Evaluate *model* on *data_loader* without tracking gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches, each a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        loss_fn: Unused; the loss is taken from the model output. Kept so
            existing callers do not break.
        device: Device the batch tensors are moved to.
        dataset_len: Number of examples, used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss, all_labels, all_preds)``: accuracy is a double
        tensor; mean_loss is the average batch loss, or NaN when the loader
        yields no batches; the two lists hold the true and predicted labels in
        the order the loader produced them.
    """
    model.eval()
    losses = []
    # A tensor accumulator keeps `.double()` valid even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / dataset_len, mean_loss, all_labels, all_preds

# Main function
def _split_train_test(df, test_fraction, generator):
    """Randomly split *df* into (train, test) frames using a seeded torch generator."""
    train_size = int(len(df) * (1 - test_fraction))
    order = torch.randperm(len(df), generator=generator).tolist()
    return df.iloc[order[:train_size]], df.iloc[order[train_size:]]


def _evaluate_language(model, data_loader, loss_fn, device, language):
    """Evaluate *model* on one language's test loader and return its metrics dict."""
    print(f'Evaluating on {language} test set...')
    accuracy, _, labels, preds = eval_model(
        model,
        data_loader,
        loss_fn,
        device,
        len(data_loader.dataset)
    )
    return {
        'accuracy': accuracy.item(),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted'),
        'f1': f1_score(labels, preds, average='weighted')
    }


# Main function
def main():
    """Fine-tune multilingual BERT on every pair of languages and report test metrics.

    Each language's dataset is loaded, cleaned and split into train/test once,
    with a seeded generator, so every pair containing a language is evaluated
    on the same held-out rows and runs are repeatable. For each pair a fresh
    model is trained on the two train splits combined for ``EPOCHS`` epochs
    (no early stopping) and evaluated separately on each language's test
    split. The collected metrics are printed at the end.
    """
    RANDOM_SEED = 42
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    TEST_SPLIT = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    # Seed the global RNGs (classifier-head init, DataLoader shuffling) and use
    # a dedicated generator for the train/test splits.
    torch.manual_seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    split_generator = torch.Generator().manual_seed(RANDOM_SEED)

    languages = ['hebrew', 'arabic', 'bangla', 'chinese', 'english', 'german', 'indonesian', 'romanian', 'turkish']
    file_paths = {
        'hebrew': '/hebrew.xlsx',
        'arabic': '/arabic.xlsx',
        'bangla': '/bangla.csv',
        'chinese': '/chinese.csv',
        'english': '/english.csv',
        'german': '/german.csv',
        'indonesian': '/indonesian.csv',
        'romanian': '/romanianL.xlsx',
        'turkish': '/turkish.csv'
    }

    results = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Load, clean and split every language exactly once instead of once per
    # pair; this also keeps each language's test set identical across pairs.
    splits = {}
    for language in languages:
        df_language = clean_dataset(load_data(file_paths[language]))
        splits[language] = _split_train_test(df_language, TEST_SPLIT, split_generator)

    # Iterate over combinations of 2 languages
    for lang_comb in combinations(languages, 2):
        print(f"Processing combination: {lang_comb}")

        df_train_lang1, df_test_lang1 = splits[lang_comb[0]]
        df_train_lang2, df_test_lang2 = splits[lang_comb[1]]

        # Combine train datasets of the two languages
        df_train_combined = pd.concat([df_train_lang1, df_train_lang2], ignore_index=True)

        # Create DataLoaders
        train_data_loader = create_data_loader(
            df_train_combined['title'].tolist(),
            df_train_combined['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )
        test_data_loader_lang1 = create_data_loader(
            df_test_lang1['title'].tolist(),
            df_test_lang1['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )
        test_data_loader_lang2 = create_data_loader(
            df_test_lang2['title'].tolist(),
            df_test_lang2['label'].tolist(),
            tokenizer,
            MAX_LEN,
            BATCH_SIZE
        )

        # Initialize model
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)

        # Training loop
        for epoch in range(EPOCHS):
            print(f'Starting epoch {epoch + 1}/{EPOCHS}')
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                # Use the loader's own dataset size so the accuracy denominator
                # matches the examples actually iterated.
                len(train_data_loader.dataset)
            )
            print(f'Train loss {train_loss}, accuracy {train_acc}')

        # Evaluate on the two test sets separately
        test_results_lang1 = _evaluate_language(
            model, test_data_loader_lang1, loss_fn, device, lang_comb[0]
        )
        test_results_lang2 = _evaluate_language(
            model, test_data_loader_lang2, loss_fn, device, lang_comb[1]
        )

        # Save results for the current combination
        results.append({
            'languages': lang_comb,
            'test_results_lang1': test_results_lang1,
            'test_results_lang2': test_results_lang2
        })

        # Drop this pair's model and optimizer state before the next pair
        # allocates its own, so two full models are never held at once.
        del model, optimizer, scheduler
        if device.type == "cuda":
            torch.cuda.empty_cache()

    print(f"Final results: {results}")

# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
