In [15]:
# Install necessary libraries
!pip install transformers pandas

# Import necessary libraries
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch

# Function to load the model and tokenizer either from a local directory or Hugging Face
def load_model(model_name_or_path):
    """Load a sequence-classification model together with its tokenizer.

    Args:
        model_name_or_path: Path to a local model directory, or a Hugging
            Face Hub model identifier.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # The same ``from_pretrained`` calls were issued for local directories and
    # for Hub identifiers, so the former ``os.path.isdir`` branch was pure
    # duplication; a single code path covers both cases.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    return tokenizer, model

# Function to split long sentences into chunks that fit within max_length tokens
def chunk_text(text, tokenizer, max_length):
    """Split ``text`` into pieces that each fit within ``max_length`` tokens.

    The text is tokenized without special tokens, cut into consecutive
    windows, and every window is decoded back to a string. Room for the
    special tokens that get added when a chunk is tokenized again (e.g. by a
    classification pipeline) is reserved, so a full chunk is not truncated
    afterwards.

    Args:
        text: The string to split.
        tokenizer: Tokenizer used for encoding and decoding. It must be
            callable with ``add_special_tokens``/``truncation`` keywords and
            provide ``decode`` and ``num_special_tokens_to_add``.
        max_length: Maximum number of tokens per chunk, special tokens
            included.

    Returns:
        A list with at least one string. Text that produces no tokens is
        returned unchanged as a single chunk.
    """
    # Encode without special tokens: otherwise they would be counted towards
    # the first/last chunk and then stripped again on decode.
    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]
    if len(token_ids) == 0:
        # Always hand back one chunk so callers never average over nothing.
        return [text]

    # Leave space for the special tokens a later re-tokenization will add.
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    window = max(1, max_length - reserved)

    # NOTE(review): decoding and re-encoding is not guaranteed to reproduce
    # the exact same token count for every tokenizer, so downstream
    # truncation should stay enabled as a safety net.
    return [
        tokenizer.decode(token_ids[start:start + window], skip_special_tokens=True)
        for start in range(0, len(token_ids), window)
    ]

# Map a pipeline label string to the binary class id (0 or 1)
def _binary_label(label, label2id):
    class_id = label2id.get(label) if label2id else None
    if class_id is None:
        # Label unknown to the model config: fall back to the default naming.
        return 0 if label == 'LABEL_0' else 1
    return 0 if class_id == 0 else 1


# Combine per-chunk (label, score) pairs into one label and one score
def _aggregate_chunk_predictions(chunk_results):
    # Majority vote over the chunk labels; an exact tie resolves to 0.
    positives = sum(label for label, _ in chunk_results)
    final_label = 1 if 2 * positives > len(chunk_results) else 0
    # Express every chunk's score as confidence in the FINAL label before
    # averaging, so chunks that voted the other way lower the score instead
    # of inflating it.
    # NOTE(review): ``1 - score`` assumes a two-class model whose scores sum
    # to 1 — confirm for the model in use.
    confidences = [
        score if label == final_label else 1.0 - score
        for label, score in chunk_results
    ]
    return final_label, sum(confidences) / len(confidences)


# Function to classify text with chunking
def classify_text_with_chunks(model_name_or_path, input_tsv, sentence_column, output_csv):
    """Classify every row of a TSV file and write the predictions to a CSV.

    Each text is split with ``chunk_text`` into pieces of at most 512 tokens,
    every piece is classified, and the per-chunk predictions are combined:
    the label by majority vote (ties resolve to 0) and the score as the mean
    confidence in that final label.

    Args:
        model_name_or_path: Local model directory or Hugging Face model name,
            forwarded to ``load_model``.
        input_tsv: Path of the tab-separated input file.
        sentence_column: Name of the column holding the texts to classify.
        output_csv: Path of the CSV file to write.

    Returns:
        None. The input rows plus the columns ``"Classification"`` and
        ``"Prediction Score"`` are written to ``output_csv`` and a
        confirmation line is printed.

    Raises:
        KeyError: If ``sentence_column`` is not a column of the input file.
    """
    max_tokens = 512

    # Load the model and tokenizer
    tokenizer, model = load_model(model_name_or_path)

    # Load the text data from TSV
    data = pd.read_csv(input_tsv, sep='\t')

    # Prepare the pipeline. The deprecated ``return_all_scores=False`` flag is
    # omitted: returning only the top label is already the default.
    classification_pipeline = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        truncation=True,  # Safety net in case a chunk re-tokenizes longer
        max_length=max_tokens,
        device=0 if torch.cuda.is_available() else -1,
    )

    # Use the model's own label names so custom ``id2label`` configs work.
    label2id = getattr(model.config, "label2id", None) or {}

    labels = []
    scores = []
    for sentence in data[sentence_column]:
        # Empty cells are read as NaN (a float); classify them as empty text
        # instead of crashing inside the tokenizer.
        text = "" if pd.isna(sentence) else str(sentence)

        sentence_chunks = chunk_text(text, tokenizer, max_length=max_tokens) or [text]
        chunk_results = []
        for chunk in sentence_chunks:
            result = classification_pipeline(chunk)[0]
            chunk_results.append(
                (_binary_label(result['label'], label2id), result['score'])
            )

        final_label, final_score = _aggregate_chunk_predictions(chunk_results)
        labels.append(final_label)
        scores.append(final_score)

    # Attach predictions by row position. Merging on an 'id' column required
    # that column to exist and duplicated rows whenever ids were not unique.
    output_df = data.copy()
    output_df["Classification"] = labels
    output_df["Prediction Score"] = scores

    # Save the results to CSV
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Classification results saved to {output_csv}")

# Example run: configure the four inputs below, then launch the classification.
# The model location may be a directory on disk or a Hugging Face model name.
model_name_or_path = "/scratch/bbov/aalshammari/COLING/Training/ConfliBERT/outputs/en_ar_DEEPL_binary/best_model"
# Tab-separated file holding the texts to classify
input_tsv = 'en_ar_DEEPL.tsv'
# Column of the input file that contains the sentences
sentence_column = 'en_ar_DEEPL'
# Destination file for the predictions
output_csv = 'en_ar_DEEPL_binary_predictions.csv'

# Run the classification with the settings above
classify_text_with_chunks(
    model_name_or_path=model_name_or_path,
    input_tsv=input_tsv,
    sentence_column=sentence_column,
    output_csv=output_csv,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable

[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: pip install --upgrade pip




Classification results saved to en_ar_DEEPL_binary_predictions.csv
