In [1]:
!pip install pandas
!pip install opencv-python
!pip install transformers
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_bodies = pd.read_csv('train_bodies.csv')
train_stances = pd.read_csv('train_stances.csv')

# Merge datasets based on Body ID
merged_data = pd.merge(train_stances, train_bodies, on='Body ID')

print(merged_data.head())

                                            Headline  Body ID     Stance  \
0  Police find mass graves with at least '15 bodi...      712  unrelated   
1  Hundreds of Palestinians flee floods in Gaza a...      158      agree   
2  Christian Bale passes on role of Steve Jobs, a...      137  unrelated   
3  HBO and Apple in Talks for $15/Month Apple TV ...     1034  unrelated   
4  Spider burrowed through tourist's stomach and ...     1923   disagree   

                                         articleBody  
0  Danny Boyle is directing the untitled film\n\n...  
1  Hundreds of Palestinians were evacuated from t...  
2  30-year-old Moscow resident was hospitalized w...  
3  (Reuters) - A Canadian soldier was shot at the...  
4  Fear not arachnophobes, the story of Bunbury's...  


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define special token to separate headline and body text
special_token = "[SEP]"

# Concatenate headline and body text with special token
combined_texts = merged_data['Headline'] + " " + special_token + " " + merged_data['articleBody']

input_ids = []
attention_masks = []

max_length = 128  

for text in combined_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        max_length=max_length, 
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [4]:
# Map labels to integers
label_map = {'unrelated': 0, 'discuss': 1, 'agree': 2, 'disagree': 3}
merged_data['Stance'] = merged_data['Stance'].map(label_map)
labels = torch.tensor(merged_data['Stance'].values)

train_dataset = TensorDataset(input_ids, attention_masks, labels)

batch_size = 32 

# Create DataLoader
train_dataloader = DataLoader(
    train_dataset,  
    sampler=RandomSampler(train_dataset), 
    batch_size=batch_size 
)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 20
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        
        input_ids, attention_mask, labels = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    
        loss = torch.nn.CrossEntropyLoss()(logits, labels)

        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Calculate average loss for the epoch
    avg_train_loss = total_loss / len(train_dataloader)

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Average Training Loss: {avg_train_loss:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Average Training Loss: 0.2323
Epoch 2/20
Average Training Loss: 0.0764
Epoch 3/20
Average Training Loss: 0.0413
Epoch 4/20
Average Training Loss: 0.0239
Epoch 5/20
Average Training Loss: 0.0141
Epoch 6/20
Average Training Loss: 0.0126
Epoch 7/20
Average Training Loss: 0.0086
Epoch 8/20
Average Training Loss: 0.0079
Epoch 9/20
Average Training Loss: 0.0066
Epoch 10/20
Average Training Loss: 0.0049
Epoch 11/20
Average Training Loss: 0.0046
Epoch 12/20
Average Training Loss: 0.0032
Epoch 13/20
Average Training Loss: 0.0026
Epoch 14/20
Average Training Loss: 0.0022
Epoch 15/20
Average Training Loss: 0.0019
Epoch 16/20
Average Training Loss: 0.0015
Epoch 17/20
Average Training Loss: 0.0010
Epoch 18/20
Average Training Loss: 0.0006
Epoch 19/20
Average Training Loss: 0.0007
Epoch 20/20
Average Training Loss: 0.0006


In [6]:
# Process Test Data
test_bodies = pd.read_csv('competition_test_bodies.csv')
test_stances = pd.read_csv('competition_test_stances.csv')

merged_test_data = pd.merge(test_stances, test_bodies, on='Body ID')

special_token = "[SEP]"

test_combined_texts = merged_test_data['Headline'] + " " + special_token + " " + merged_test_data['articleBody']

test_input_ids = []
test_attention_masks = []

max_length = 128  

for text in test_combined_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        max_length=max_length, 
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )
    
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

label_map = {'unrelated': 0, 'discuss': 1, 'agree': 2, 'disagree': 3}
merged_test_data['Stance'] = merged_test_data['Stance'].map(label_map)
test_labels = torch.tensor(merged_test_data['Stance'].values)

# Create DataLoader for Test Data
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)  

batch_size = 32
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()

total_loss = 0
predictions = []
all_test_labels = []
evaluation_score = 0

with torch.no_grad():
    for batch in test_dataloader:
        test_input_ids, test_attention_mask, test_labels = batch

        test_input_ids = test_input_ids.to(device)
        test_attention_mask = test_attention_mask.to(device)
        test_labels = test_labels.to(device)

        outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits

        loss = torch.nn.CrossEntropyLoss()(logits, test_labels)
        total_loss += loss.item()

        _, predicted_labels = torch.max(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())
        all_test_labels.extend(test_labels.cpu().numpy())

        for true_label, pred_label in zip(test_labels, predicted_labels):
            if true_label == pred_label:
                evaluation_score += 0.25  # Increase score for any match
                if true_label != 0:  # If the label is not 'unrelated' (index 0)
                    evaluation_score += 0.50  # Increase score further for non-'unrelated' matches
            elif true_label in [1, 2, 3] and pred_label in [1, 2, 3]:  # Check if both are related
                evaluation_score += 0.25  # Increase score for related but not exact match

# Print the total evaluation score
print('Total Evaluation Score:', evaluation_score)

# Print the total evaluation score
print('Total Evaluation Score:', evaluation_score)

# Calculate average test loss
avg_test_loss = total_loss / len(test_dataloader)

# Calculate accuracy for each class separately
accuracy_per_class = {}
for class_label in range(4):  # Assuming 4 classes
    class_indices = [i for i, label in enumerate(all_test_labels) if label == class_label]
    class_predictions = [predictions[i] for i in class_indices]
    class_labels = [all_test_labels[i] for i in class_indices]
    accuracy_per_class[class_label] = accuracy_score(class_labels, class_predictions)

classification_rep = classification_report(all_test_labels, predictions)

print(f'Average Test Loss: {avg_test_loss:.4f}')
print('Accuracy per Class:')
for class_label, accuracy in accuracy_per_class.items():
    print(f'Class {class_label}: {accuracy:.4f}')
print('Classification Report:')
print(classification_rep)
print('Evaluation Score:', evaluation_score)

Total Evaluation Score: 8844.5
Total Evaluation Score: 8844.5
Average Test Loss: 0.7676
Accuracy per Class:
Class 0: 0.9905
Class 1: 0.8448
Class 2: 0.6448
Class 3: 0.3113
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     18349
           1       0.81      0.84      0.83      4464
           2       0.66      0.64      0.65      1903
           3       0.51      0.31      0.39       697

    accuracy                           0.92     25413
   macro avg       0.74      0.70      0.71     25413
weighted avg       0.92      0.92      0.92     25413

Evaluation Score: 8844.5
