In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found beneath /kaggle/input,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-m2-models/final_trained_model/trained_model/config.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/README.md
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/tokenizer.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/tokenizer_config.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/sentence_bert_config.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/config_sentence_transformers.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/model.safetensors
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/modules.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/special_tokens_map.json
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/sentencepiece.bpe.model
/kaggle/input/nlp-m2-models/final_trained_model/trained_model/1_Pooling/config.json
/kaggle/input/nlp-m2-models/final_trained_model/final_trained_model/config.json
/kaggle/input/nlp-

In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [3]:
import ast

import pandas as pd
from transformers import AutoTokenizer

def _unwrap_list_literal(text):
    """Return the first item of a string that spells a list literal, else ``text`` unchanged.

    Only strings that start with ``[`` and end with ``]`` are parsed. Parsing
    uses ``ast.literal_eval`` so that CSV content is never executed as code.
    If the string cannot be parsed, is not a list, is an empty list, or its
    first element is not a string, the original value is returned untouched.
    """
    if not (isinstance(text, str) and text.startswith('[') and text.endswith(']')):
        return text
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid Python literal: keep the raw string as the text.
        return text
    if isinstance(parsed, list) and parsed and isinstance(parsed[0], str):
        return parsed[0]
    return text


def preprocess_test_data(csv_file, tokenizer_name='bert-base-multilingual-cased', max_length=512):
    """Load a CSV of text pairs and tokenize every pair.

    Args:
        csv_file: Path (or any source ``pd.read_csv`` accepts) of a CSV that
            has ``text_a`` and ``text_b`` columns.
        tokenizer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: Length each encoded pair is padded or truncated to.

    Returns:
        A tuple ``(encoded_data, df)``. ``encoded_data`` is a list with one
        dict per CSV row holding the flattened ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` tensors; ``df`` is the
        DataFrame that was read from ``csv_file``.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Tokenize the text pairs
    encoded_data = []
    for _, row in df.iterrows():
        # text_a may be stored as a stringified list; use its first element.
        text_a = _unwrap_list_literal(row['text_a'])
        text_b = row['text_b']

        encoding = tokenizer.encode_plus(
            text_a,
            text_b,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so
        # the per-row tensors can be stacked into batches later.
        encoded_data.append({
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        })

    return encoded_data, df

# Usage:
# test_data, original_df = preprocess_test_data('test.csv')

In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def predict_categories(model, test_data, batch_size=16):
    """Run ``model`` over pre-tokenized examples and return a label per example.

    Args:
        model: A ``torch.nn.Module`` called as
            ``model(input_ids, attention_mask=..., token_type_ids=...)`` that
            returns a 2-D tensor of per-class scores (one row per example).
            It is switched to eval mode and moved to the GPU when one is
            available.
        test_data: Sequence of dicts, each holding equally sized
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors.
        batch_size: Number of examples fed to the model per forward pass.

    Returns:
        A list of category names (``'linkage'``, ``'neutral'`` or
        ``'contradiction'``), one per item of ``test_data`` and in the same
        order. An empty ``test_data`` yields an empty list.
    """
    # torch.stack raises on an empty list, so short-circuit when there is
    # nothing to predict.
    if not test_data:
        return []

    # Stack each field across examples into one tensor per field.
    dataset = TensorDataset(
        *(
            torch.stack([item[key] for item in test_data])
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )
    )
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Set the model to evaluation mode and move it to GPU if available
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            # Move batch to the same device as the model
            input_ids, attention_mask, token_type_ids = (t.to(device) for t in batch)
            logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            # Highest-scoring class index for every row of the batch.
            predictions.extend(logits.argmax(dim=1).cpu().tolist())

    # Convert numerical predictions to category labels
    category_map = {0: 'linkage', 1: 'neutral', 2: 'contradiction'}
    return [category_map[pred] for pred in predictions]

# Usage:
# predicted_categories = predict_categories(model, test_data)

In [10]:
import torch
from torch import nn
from transformers import AutoConfig, BertModel, BertPreTrainedModel
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

class MultiBERTForSemanticSearch(BertPreTrainedModel):
    """BERT encoder followed by dropout and a three-way linear classifier."""

    def __init__(self, config):
        """Build the encoder, dropout and classification head from ``config``."""
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The head emits 3 scores per example: linkage, neutral, contradiction.
        self.classifier = nn.Linear(config.hidden_size, 3)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Score each input sequence against the three classes.

        Returns the logits alone when ``labels`` is ``None``; otherwise
        returns a ``(loss, logits)`` tuple where ``loss`` is the
        cross-entropy between the logits and ``labels``.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output is the pooled representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        if labels is None:
            return logits

        loss = nn.CrossEntropyLoss()(logits.view(-1, 3), labels.view(-1))
        return loss, logits


In [11]:
from transformers import AutoConfig
import pandas as pd

def run_predictions(test_csv_path, model_path, output_csv_path):
    """Tokenize a test CSV, classify every row and write the labelled rows to CSV.

    Args:
        test_csv_path: CSV passed to ``preprocess_test_data``.
        model_path: Directory from which both the config and the
            ``MultiBERTForSemanticSearch`` weights are loaded.
        output_csv_path: Destination of the CSV that contains the input rows
            plus a ``predicted_category`` column.
    """
    # Tokenize the input and keep the raw rows for the output file.
    encoded_rows, test_df = preprocess_test_data(test_csv_path)

    # Restore the fine-tuned classifier together with its configuration.
    model = MultiBERTForSemanticSearch.from_pretrained(
        model_path,
        config=AutoConfig.from_pretrained(model_path),
    )

    # Attach one predicted label per row, then persist without the index.
    test_df['predicted_category'] = predict_categories(model, encoded_rows)
    test_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

# Run the prediction pipeline on the test CSV with the saved model and write
# the labelled rows under /kaggle/working.
TEST_CSV_PATH = '/kaggle/input/nlp-m2-oct-2024/test.csv'
MODEL_PATH = '/kaggle/input/nlp-m2-models/best_model/best_model'
OUTPUT_CSV_PATH = '/kaggle/working/output_predictions.csv'

run_predictions(TEST_CSV_PATH, MODEL_PATH, OUTPUT_CSV_PATH)



Predictions saved to /kaggle/working/output_predictions.csv
