## Contradictory, My Dear Watson

Can machines determine the relationships between sentences?

Given two sentences, there are three ways they could be related:
* one could entail the other
* one could contradict the other
* they could be unrelated

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
from transformers import AdamW, pipeline

from tqdm import tqdm

import numpy as np
import pandas as pd

## Load the data

In [None]:
# Read the competition's training and test splits.
train_raw = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

# Preview: two reproducibly sampled English examples for each label.
(
    train_raw
    .loc[lambda frame: frame['lang_abv'] == 'en']
    .groupby('label')
    .sample(n=2, random_state=1)
    .loc[:, ['premise', 'hypothesis', 'label']]
    .style.hide(axis='index')
)

## Translation Augmentation

In [None]:
def translate_back_and_forth(text):
    """Translate English text to French and then French back to English.

    The two translation pipelines are built on first use and then reused,
    so calling this function once per row (e.g. through ``Series.apply``)
    does not reload the models for every row.

    Args:
        text: A single English string.

    Returns:
        The English string obtained after the English -> French -> English
        round trip.
    """
    # `pipeline` expects a CUDA device ordinal, or -1 for CPU.
    device = 0 if torch.cuda.is_available() else -1

    # The cache lives in the module namespace so that re-running the cell
    # that defines this function does not throw away loaded models.
    loaded_pipelines = globals().setdefault('_TRANSLATION_PIPELINES', {})

    translators = []
    for model_name in ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"):
        cache_key = (model_name, device)
        if cache_key not in loaded_pipelines:
            loaded_pipelines[cache_key] = pipeline("translation",
                                                   model=model_name,
                                                   device=device)
        translators.append(loaded_pipelines[cache_key])
    translator_en_to_fr, translator_fr_to_en = translators

    # Translate from English to French
    translated_text = translator_en_to_fr(text)[0]['translation_text']

    # Translate back from French to English
    back_translated_text = translator_fr_to_en(translated_text)[0]['translation_text']

    return back_translated_text

# Quick sanity check of the round trip on a single sentence.
original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = translate_back_and_forth(original_text)
print(f"Original: {original_text}", f"Augmented: {augmented_text}", sep="\n")

In [None]:
%%time
# Preview the augmentation on the first few rows.
# Only English rows are sent through the translator; every other row keeps
# its original text. (`np.select` evaluates its default argument eagerly, so
# it would back-translate every row, including the non-English ones.)
(
    train_raw
    .head()
    .assign(
        premise_aug=lambda df: df.premise.where(
            df.lang_abv.ne('en'),
            df.loc[df.lang_abv.eq('en'), 'premise'].apply(translate_back_and_forth),
        ),
        hypothesis_aug=lambda df: df.hypothesis.where(
            df.lang_abv.ne('en'),
            df.loc[df.lang_abv.eq('en'), 'hypothesis'].apply(translate_back_and_forth),
        ),
    )
    [['premise', 'premise_aug', 'hypothesis', 'hypothesis_aug']]
    .style.hide(axis='index')
)

### Create a GPU-compatible version of the above functionality

In [None]:
def translate_back_and_forth_batch(texts, batch_size=16):
    """Translate a batch of English text to French and back to English.

    Args:
        texts: A list (or other iterable) of English strings. A single
            string is also accepted and treated as a batch of one.
        batch_size: Number of texts handed to each translation model per
            forward pass.

    Returns:
        A list of back-translated English strings, one per input text and
        in the same order. An empty input yields an empty list without
        loading any model.
    """
    # Normalise the input so the pipelines always receive a list of strings.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    # Check if CUDA is available, otherwise default to CPU
    device = 0 if torch.cuda.is_available() else -1

    # Load each translation model once and reuse it on later calls. The
    # cache lives in the module namespace so that re-running the cell that
    # defines this function does not throw away loaded models.
    loaded_pipelines = globals().setdefault('_TRANSLATION_PIPELINES', {})
    translators = []
    for model_name in ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"):
        cache_key = (model_name, device)
        if cache_key not in loaded_pipelines:
            loaded_pipelines[cache_key] = pipeline("translation",
                                                   model=model_name,
                                                   device=device)
        translators.append(loaded_pipelines[cache_key])
    translator_en_to_fr, translator_fr_to_en = translators

    # Translate from English to French
    translated_texts = [
        t['translation_text']
        for t in translator_en_to_fr(texts, batch_size=batch_size)
    ]

    # Translate back from French to English
    back_translated_texts = [
        t['translation_text']
        for t in translator_fr_to_en(translated_texts, batch_size=batch_size)
    ]

    return back_translated_texts

# Quick sanity check of the batch function on a one-element batch.
# The function is documented as taking a batch, so the sentence is wrapped in
# a list rather than passed as a bare string.
original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = translate_back_and_forth_batch([original_text])
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text[0]}")

In [None]:
%%time

def aug_with_fr_en(input_df):
    """Append back-translated copies of the English rows to a dataset.

    Rows whose ``lang_abv`` equals ``'en'`` are duplicated with their
    ``premise`` and ``hypothesis`` replaced by English -> French -> English
    round-trip translations. All other columns of the duplicated rows, and
    their index labels, are carried over unchanged.

    Args:
        input_df: DataFrame with at least ``premise``, ``hypothesis`` and
            ``lang_abv`` columns.

    Returns:
        A new DataFrame containing every row of ``input_df`` followed by the
        augmented English rows.
    """
    english_rows = input_df[input_df['lang_abv'] == 'en']

    # Each text column is round-tripped in a single batched call
    # (premise first, then hypothesis).
    round_tripped = {
        column: translate_back_and_forth_batch(english_rows[column].tolist())
        for column in ('premise', 'hypothesis')
    }

    augmented_rows = english_rows.assign(**round_tripped)
    return pd.concat([input_df, augmented_rows])

# Augment the training set, then drop rows whose (premise, hypothesis) pair
# already occurs and renumber the index.
train_aug = (
    aug_with_fr_en(train_raw)
    .drop_duplicates(subset=['premise', 'hypothesis'])
    .reset_index(drop=True)
)

## Create the Hugging Face `Dataset`

In [None]:
# Wrap both DataFrames as `datasets.Dataset` objects.
train_hf, test_hf = (Dataset.from_pandas(frame) for frame in (train_aug, test))

## Tokenization

In [None]:
# Tokenizer matching the multilingual cased BERT checkpoint.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Tokenization function
def tokenize_function(examples):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with ``'premise'`` and ``'hypothesis'`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the pair, requested with
        ``padding='max_length'`` and ``truncation=True``.
    """
    premises, hypotheses = examples['premise'], examples['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True, padding='max_length')

# Tokenize the training split, then add a `labels` column that mirrors
# `label` (the original `label` column is kept).
train_tokenized = train_hf.map(tokenize_function, batched=True)
train_encoded = train_tokenized.map(
    lambda batch: {'labels': batch['label']},
    batched=True,
)

# The test split is only tokenized.
test_encoded = test_hf.map(tokenize_function, batched=True)

In [None]:
# Return torch tensors for the listed columns; only the training split
# additionally exposes `labels`.
model_input_columns = ['input_ids', 'attention_mask']
train_encoded.set_format(type='torch', columns=model_input_columns + ['labels'])
test_encoded.set_format(type='torch', columns=model_input_columns)

## Create `DataLoaders` for training and testing

In [None]:
# Training batches are reshuffled every epoch; test batches keep row order so
# predictions line up with the rows of `test`.
train_loader = DataLoader(dataset=train_encoded, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_encoded, batch_size=16, shuffle=False)

## Modeling

In [None]:
# Load BERT model for sequence classification (three output classes)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer.
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. `eps` and `weight_decay` are set explicitly to the
# defaults of the transformers class so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
model.train()
for epoch in range(3):  # 3 epochs for this example
    # The description only changes once per epoch, so set it when the bar is created.
    loop = tqdm(train_loader, desc=f'Epoch {epoch}', leave=True)
    for batch in loop:
        optimizer.zero_grad()

        # Move batch to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing `labels` makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update weights
        optimizer.step()

        # Show the latest batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

## Make Predictions on Test Dataset

In [None]:
# Inference: put the model in evaluation mode and collect one predicted
# class per test row.
model.eval()
predictions = []

# No gradients are needed for prediction.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass without labels; only the logits are used.
        logits = model(input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row.
        predicted_labels = logits.argmax(dim=-1)

        # Bring the batch of predictions back to the CPU and store them.
        predictions.extend(predicted_labels.cpu().numpy())


# Attach the predictions to the test rows and write the submission file
# with just the `id` and `prediction` columns.
test_with_preds = test.assign(prediction=predictions)

submission_columns = ['id', 'prediction']
test_with_preds[submission_columns].to_csv('submission.csv', index=False)

## Translation Experiment


```python
def translate_back_and_forth(text, tf=False):
    """Translate English text to French and then French back to English."""
    
    if tf:
        return text
    
    if not tf:
        from transformers import pipeline
        
        # Load French/English translation models
        translator_en_to_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
        translator_fr_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")

        # Translate from English to French
        translated_text = translator_en_to_fr(text)[0]['translation_text']

        # Translate back from French to English
        back_translated_text = translator_fr_to_en(translated_text)[0]['translation_text']

        return back_translated_text

# Example usage
original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = translate_back_and_forth(original_text)
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text}")

train_raw\
    .head()\
    .assign(
        premise_aug = lambda df: np.select(
            [df.lang_abv.ne('en')],
            [df.premise],
            df.premise.apply(translate_back_and_forth)
        ),
        hypothesis_aug = lambda df: np.select(
            [df.lang_abv.ne('en')],
            [df.hypothesis],
            df.hypothesis.apply(translate_back_and_forth)
        )
    )\
    [['premise', 'premise_aug', 'hypothesis', 'hypothesis_aug']]\
    .style.hide(axis='index')

def aug_with_fr_en(input_df):
    """Augment dataset with English->French->English alternatives."""
    return pd.concat([
        input_df,
        input_df\
            .loc[lambda df: df.lang_abv.eq('en')]\
            .assign(
                premise = lambda df: np.select(
                    [df.lang_abv.ne('en')],
                    [df.premise],
                    df.premise.apply(translate_back_and_forth)
                ),
                hypothesis = lambda df: np.select(
                    [df.lang_abv.ne('en')],
                    [df.hypothesis],
                    df.hypothesis.apply(translate_back_and_forth)
                )
            )\
        ])

train_aug = aug_with_fr_en(train_raw)\
    .drop_duplicates(subset=['premise', 'hypothesis'])\
    .reset_index(drop=True)

```

Note: this experiment does not work on TPU.