In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np

def load_pretrained_classifier(model_path="GiliGold/Knesset-DictaBERT", num_labels=2):
    """Load a tokenizer and a sequence-classification model from a checkpoint.

    Args:
        model_path: Hub identifier or local path handed to ``from_pretrained``
            for both the tokenizer and the model.
        num_labels: Number of output classes of the classification head.
            Defaults to 2 (binary classification).

    Returns:
        A ``(model, tokenizer)`` tuple.

    Note:
        If the checkpoint does not contain trained classification-head
        weights, transformers creates them newly initialized and warns that
        the model should be trained on a downstream task first. Predictions
        from such an untrained head are not meaningful, so check the loading
        warnings before trusting the output.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=num_labels,
    )
    return model, tokenizer

def predict_offensive_content(df, model, tokenizer, batch_size=32, device='cuda', max_length=512):
    """
    Predict offensive content for each conversation in the dataframe.

    The 'conversation' column is processed in batches. A batch that fails
    (tokenization error, model error, unexpected output shape, ...) is
    reported on stdout and filled with the defaults 0 / 0.0, so the output
    always has exactly one prediction per input row, in row order.

    Args:
        df: DataFrame containing a 'conversation' column
        model: Pretrained sequence-classification model
        tokenizer: Tokenizer matching the model
        batch_size: Number of samples to process at once (must be > 0)
        device: Computing device (cuda/cpu)
        max_length: Maximum token length; longer inputs are truncated

    Returns:
        Copy of ``df`` with two new columns: 'is_offensive_predicted'
        (argmax class index) and 'offensive_confidence' (softmax probability
        of class 1).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    model = model.to(device)
    model.eval()

    result_df = df.copy()
    conversations = df['conversation']
    total_rows = len(df)
    predictions = []
    confidence_scores = []

    for i in range(0, total_rows, batch_size):
        # Convert batch texts to list and ensure they're strings
        batch_texts = [str(text) for text in conversations.iloc[i:i+batch_size].tolist()]
        expected = len(batch_texts)

        try:
            encodings = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )

            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Compute BOTH results before touching the accumulators: if
                # either step raises, nothing has been appended yet, so the
                # fallback below cannot double-count this batch.
                batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                batch_confidences = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

            # Guard row alignment: one prediction and one score per input text.
            if len(batch_predictions) != expected or len(batch_confidences) != expected:
                raise ValueError(
                    f"model returned {len(batch_predictions)} predictions for {expected} inputs"
                )

        except Exception as e:
            print(f"Error processing batch {i}-{i+batch_size}: {str(e)}")
            # Fill with default values for failed batches
            batch_predictions = [0] * expected
            batch_confidences = [0.0] * expected

        # Single append point per batch keeps both lists aligned with the rows.
        predictions.extend(batch_predictions)
        confidence_scores.extend(batch_confidences)

        print(f"Processed {min(i+batch_size, total_rows)}/{total_rows} rows")

    # Both lists hold exactly len(df) entries by construction, so no slicing
    # is needed; a length mismatch here would raise instead of being hidden.
    result_df['is_offensive_predicted'] = predictions
    result_df['offensive_confidence'] = confidence_scores

    return result_df

def main(
    input_path="/home/gorelikk/NLP-PROJECT/Pre_Process/MICKEY/Detect/all_batches_analysis.csv",
    output_path='classified_conversations.csv',
):
    """Classify every conversation in a CSV file and write the results.

    Reads ``input_path``, runs ``predict_offensive_content`` on its
    'conversation' column, saves the augmented table to ``output_path`` and
    prints a short summary.

    Args:
        input_path: CSV file to classify; must contain a 'conversation' column.
        output_path: Destination CSV for the classified rows.

    Raises:
        ValueError: If the input has no 'conversation' column.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load and validate the data first so a bad input fails before the
    # (slow) model load.
    df = pd.read_csv(input_path)
    if 'conversation' not in df.columns:
        raise ValueError("DataFrame must contain a 'conversation' column")

    # Load model and tokenizer
    model, tokenizer = load_pretrained_classifier()

    # Make predictions
    result_df = predict_offensive_content(df, model, tokenizer, device=device)

    # Save results
    result_df.to_csv(output_path, index=False)
    print(f"\nClassified data saved to: {output_path}")

    # Print summary statistics
    total = len(result_df)
    total_offensive = result_df['is_offensive_predicted'].sum()
    total_clean = total - total_offensive

    def _percent(count):
        # An empty input has no rows to divide by; report 0% instead of
        # raising ZeroDivisionError after the results were already saved.
        return (count / total) * 100 if total else 0.0

    print("\nClassification Summary:")
    print(f"Total conversations: {total}")
    print(f"Predicted offensive: {total_offensive} ({_percent(total_offensive):.2f}%)")
    print(f"Predicted non-offensive: {total_clean} ({_percent(total_clean):.2f}%)")

# Run the classification pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GiliGold/Knesset-DictaBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed 32/2537931 rows
Processed 64/2537931 rows
Processed 96/2537931 rows
Processed 128/2537931 rows
Processed 160/2537931 rows
Processed 192/2537931 rows
Processed 224/2537931 rows
Processed 256/2537931 rows
Processed 288/2537931 rows
Processed 320/2537931 rows
Processed 352/2537931 rows
Processed 384/2537931 rows
Processed 416/2537931 rows
Processed 448/2537931 rows
Processed 480/2537931 rows
Processed 512/2537931 rows
Processed 544/2537931 rows
Processed 576/2537931 rows
Processed 608/2537931 rows
Processed 640/2537931 rows
Processed 672/2537931 rows
Processed 704/2537931 rows
Processed 736/2537931 rows
Processed 768/2537931 rows
Processed 800/2537931 rows
Processed 832/2537931 rows
Processed 864/2537931 rows
Processed 896/2537931 rows
Processed 928/2537931 rows
Processed 960/2537931 rows
Processed 992/2537931 rows
Processed 1024/2537931 rows
Processed 1056/2537931 rows
Processed 1088/2537931 rows
Processed 1120/2537931 rows
Processed 1152/2537931 rows
Processed 1184/2537931 row