# In [None]:
# BERT Project with Hugging Face - Complete Implementation
# Sentiment Analysis on IMDb Movie Reviews

import torch
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader
import logging
import warnings
# Install a blanket "ignore" filter so no warnings are shown for the rest of
# the process. NOTE(review): this also hides deprecation warnings emitted by
# the libraries imported above -- confirm that is intended.
warnings.filterwarnings('ignore')

# Set up logging
# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BERTSentimentAnalyzer:
    """Fine-tune, evaluate and query a binary sentiment classifier.

    The methods are meant to be called in this order:
    ``load_model_and_tokenizer`` -> ``prepare_dataset`` ->
    ``prepare_data_for_training`` -> ``setup_training`` -> ``train`` ->
    ``evaluate_model`` / ``debug_model_performance`` / ``predict_sentiment``.
    """

    def __init__(self, model_name="bert-base-uncased", max_length=512):
        """
        Initialize BERT Sentiment Analyzer

        Args:
            model_name (str): Hugging Face model name
            max_length (int): Maximum sequence length for tokenization
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_logs = []
        # Filled in by prepare_dataset(); declared here so that every
        # attribute exists (as None) before the pipeline has been run.
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def load_model_and_tokenizer(self):
        """Load the pre-trained tokenizer and a 2-label classification model."""
        print(f"Loading model: {self.model_name}")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2,  # Binary classification (positive/negative)
            output_attentions=False,
            output_hidden_states=False
        )

        # Some tokenizers ship without a padding token. Reuse EOS when one
        # exists instead of assigning None.
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Keep the model config in sync so the classifier knows which id is
        # padding when it receives batches with more than one sequence.
        if self.model.config.pad_token_id is None and self.tokenizer.pad_token_id is not None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        print("Model and tokenizer loaded successfully!")

    def prepare_dataset(self, dataset_name="imdb", sample_size=None):
        """
        Load the dataset and build train / validation / test splits.

        The validation split is 10% of the (optionally sub-sampled) training
        split; shuffling and splitting both use seed 42.

        Args:
            dataset_name (str): Dataset name from Hugging Face
            sample_size (int): Optional number of training examples to keep
                for faster experimentation; the test split keeps a quarter of
                that. Both are clamped to the size of the underlying split.
        """
        print(f"Loading {dataset_name} dataset...")

        dataset = load_dataset(dataset_name)
        train_dataset = dataset['train']
        test_dataset = dataset['test']

        # Sample dataset if specified (for faster experimentation)
        if sample_size:
            # Clamp so an over-large sample_size uses the full split instead
            # of indexing past its end; keep at least one test example.
            n_train = min(sample_size, len(train_dataset))
            n_test = min(max(sample_size // 4, 1), len(test_dataset))
            train_dataset = train_dataset.shuffle(seed=42).select(range(n_train))
            test_dataset = test_dataset.shuffle(seed=42).select(range(n_test))

        # Split training set for validation
        split = train_dataset.train_test_split(test_size=0.1, seed=42)
        self.train_dataset = split['train']
        self.val_dataset = split['test']
        self.test_dataset = test_dataset

        print(f"Dataset loaded - Train: {len(self.train_dataset)}, "
              f"Val: {len(self.val_dataset)}, Test: {len(self.test_dataset)}")

    def tokenize_function(self, examples):
        """Tokenize the ``text`` field of a batch, truncating to ``max_length``.

        Padding is deliberately not applied here: padding inside
        ``Dataset.map`` pads every example to the longest one in the (large)
        map batch and stores the padding. ``setup_training`` installs a
        padding collator that pads each training batch instead.
        """
        return self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=self.max_length,
        )

    def prepare_data_for_training(self):
        """Tokenize all three splits and expose them as PyTorch tensors.

        Raises:
            RuntimeError: if the tokenizer or the datasets are not loaded yet.
        """
        if self.tokenizer is None:
            raise RuntimeError("Call load_model_and_tokenizer() before tokenizing.")
        if self.train_dataset is None or self.val_dataset is None or self.test_dataset is None:
            raise RuntimeError("Call prepare_dataset() before tokenizing.")

        print("Tokenizing datasets...")

        columns = ['input_ids', 'attention_mask', 'label']
        for split_attr in ('train_dataset', 'val_dataset', 'test_dataset'):
            tokenized = getattr(self, split_attr).map(self.tokenize_function, batched=True)
            # Set format for PyTorch
            tokenized.set_format(type='torch', columns=columns)
            setattr(self, split_attr, tokenized)

        print("Tokenization completed!")

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted precision / recall / F1.

        Args:
            eval_pred: pair of (logits, labels) as passed by ``Trainer``.

        Returns:
            dict: keys ``accuracy``, ``f1``, ``precision``, ``recall``.
        """
        logits, labels = eval_pred
        # Some models return a tuple of outputs; the logits come first.
        if isinstance(logits, tuple):
            logits = logits[0]
        predicted = np.argmax(logits, axis=1)

        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predicted, average='weighted'
        )
        accuracy = accuracy_score(labels, predicted)

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def setup_training(self, output_dir="./results", **kwargs):
        """
        Set up training arguments and trainer

        Args:
            output_dir (str): Directory to save model and logs
            **kwargs: Optional overrides: ``num_train_epochs``, ``batch_size``
                (used for both train and eval), ``warmup_steps``,
                ``weight_decay``, ``learning_rate``, ``logging_steps``,
                ``eval_steps``, ``save_steps``.

        Raises:
            RuntimeError: if the model/tokenizer or datasets are not ready.
        """
        # Local imports keep this method self-contained; transformers is
        # already a dependency of this file.
        import inspect
        from transformers import DataCollatorWithPadding

        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call load_model_and_tokenizer() before setup_training().")
        if self.train_dataset is None or self.val_dataset is None:
            raise RuntimeError("Call prepare_dataset() before setup_training().")

        batch_size = kwargs.get('batch_size', 16)

        # The evaluation-strategy keyword was renamed from
        # `evaluation_strategy` to `eval_strategy` in newer transformers
        # releases; pick whichever the installed version accepts.
        arg_names = inspect.signature(TrainingArguments.__init__).parameters
        strategy_key = 'eval_strategy' if 'eval_strategy' in arg_names else 'evaluation_strategy'

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=kwargs.get('num_train_epochs', 3),
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=kwargs.get('warmup_steps', 500),
            weight_decay=kwargs.get('weight_decay', 0.01),
            learning_rate=kwargs.get('learning_rate', 2e-5),
            logging_dir=f'{output_dir}/logs',
            logging_steps=kwargs.get('logging_steps', 100),
            save_strategy="steps",
            save_steps=kwargs.get('save_steps', 500),
            eval_steps=kwargs.get('eval_steps', 500),
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            # The string "none" disables wandb/tensorboard reporting; passing
            # None does not reliably do so across transformers versions.
            report_to="none",
            save_total_limit=2,
            dataloader_num_workers=0,  # Avoid multiprocessing issues
            **{strategy_key: "steps"},
        )

        # Likewise, Trainer's `tokenizer` keyword is superseded by
        # `processing_class` in newer releases.
        trainer_names = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = 'processing_class' if 'processing_class' in trainer_names else 'tokenizer'

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            # tokenize_function does not pad, so pad per batch here.
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
            **{tokenizer_key: self.tokenizer},
        )

        print("Training setup completed!")

    def train(self):
        """Train the model, save it, and keep the trainer's log history.

        Returns:
            The value returned by ``Trainer.train()``.

        Raises:
            RuntimeError: if ``setup_training`` has not been called.
        """
        if self.trainer is None:
            raise RuntimeError("Call setup_training() before train().")

        print("Starting training...")

        train_result = self.trainer.train()

        # Save the model
        self.trainer.save_model()

        # Store training logs (consumed by plot_training_history)
        self.training_logs = self.trainer.state.log_history

        print("Training completed!")
        return train_result

    def evaluate_model(self, dataset_type="test"):
        """
        Evaluate the model on specified dataset

        Args:
            dataset_type (str): "test", "val", or "train"

        Returns:
            tuple: ``(metrics, predicted_labels, true_labels)``. Metric keys
            always carry the ``test_`` prefix, whichever split is evaluated.

        Raises:
            ValueError: if ``dataset_type`` is not one of the three names.
            RuntimeError: if ``setup_training`` has not been called.
        """
        splits = {
            "test": self.test_dataset,
            "val": self.val_dataset,
            "train": self.train_dataset,
        }
        # Fail loudly on a typo instead of silently scoring the train split.
        if dataset_type not in splits:
            raise ValueError(
                f"dataset_type must be 'test', 'val' or 'train', got {dataset_type!r}"
            )
        if self.trainer is None:
            raise RuntimeError("Call setup_training() before evaluate_model().")

        print(f"Evaluating on {dataset_type} dataset...")

        # The prefix is pinned so the "test_*" keys read below (and by
        # callers) do not depend on the library default.
        output = self.trainer.predict(splits[dataset_type], metric_key_prefix="test")
        metrics = output.metrics

        predicted_labels = np.argmax(output.predictions, axis=1)
        true_labels = output.label_ids

        # Print detailed results
        print(f"\n{dataset_type.upper()} RESULTS:")
        print(f"Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"F1-Score: {metrics['test_f1']:.4f}")
        print(f"Precision: {metrics['test_precision']:.4f}")
        print(f"Recall: {metrics['test_recall']:.4f}")

        return metrics, predicted_labels, true_labels

    def plot_confusion_matrix(self, true_labels, predicted_labels, title="Confusion Matrix"):
        """Plot a 2x2 confusion matrix with Negative/Positive tick labels."""
        # Fix the label order so the matrix stays 2x2 (and the tick labels
        # stay correct) even when one class is missing from the data.
        cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])
        class_names = ['Negative', 'Positive']

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.title(title)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    @staticmethod
    def _collect_history(logs):
        """Split trainer log records into one ``(steps, values)`` pair per curve.

        Each curve uses the ``step`` of the records it was found in, so
        training and evaluation points keep their own x-coordinates.

        Args:
            logs (list[dict]): records such as ``Trainer.state.log_history``.

        Returns:
            dict: keys ``loss``, ``eval_loss`` and ``eval_f1``, each mapping
            to a ``(steps, values)`` tuple of lists.
        """
        history = {'loss': ([], []), 'eval_loss': ([], []), 'eval_f1': ([], [])}
        for record in logs:
            step = record.get('step')
            for key, (xs, ys) in history.items():
                if key in record:
                    xs.append(step)
                    ys.append(record[key])
        return history

    def plot_training_history(self):
        """Plot training/validation loss and validation F1 against step."""
        if not self.training_logs:
            print("No training logs available!")
            return

        history = self._collect_history(self.training_logs)
        train_steps, train_loss = history['loss']
        eval_steps, eval_loss = history['eval_loss']
        f1_steps, eval_f1 = history['eval_f1']

        _, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Plot loss
        loss_ax.plot(train_steps, train_loss, label='Training Loss', color='blue')
        if eval_loss:
            loss_ax.plot(eval_steps, eval_loss, label='Validation Loss', color='red')
        loss_ax.set_xlabel('Steps')
        loss_ax.set_ylabel('Loss')
        loss_ax.set_title('Training and Validation Loss')
        loss_ax.legend()
        loss_ax.grid(True)

        # Plot F1 score
        if eval_f1:
            f1_ax.plot(f1_steps, eval_f1, label='Validation F1', color='green')
            f1_ax.set_xlabel('Steps')
            f1_ax.set_ylabel('F1 Score')
            f1_ax.set_title('Validation F1 Score')
            f1_ax.legend()
            f1_ax.grid(True)

        plt.tight_layout()
        plt.show()

    def predict_sentiment(self, texts):
        """
        Predict sentiment for new texts

        Args:
            texts (str | list): One text or a list of texts to classify

        Returns:
            list[dict]: one dict per text with keys ``text``, ``sentiment``
            ("Positive" for label 1, otherwise "Negative") and ``confidence``
            (softmax probability of the predicted label). Empty input gives
            an empty list.

        Raises:
            RuntimeError: if the model or tokenizer has not been loaded.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            return []
        if self.tokenizer is None or self.model is None:
            raise RuntimeError("Call load_model_and_tokenizer() before predicting.")

        # Tokenize texts
        encoded = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The model may have been moved to another device during training;
        # the inputs must live on the same device as its weights.
        device = next(self.model.parameters()).device
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        self.model.eval()
        with torch.no_grad():
            logits = self.model(**encoded).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

        confidences, label_ids = torch.max(probabilities, dim=-1)
        confidences = confidences.cpu().tolist()
        label_ids = label_ids.cpu().tolist()

        return [
            {
                'text': text,
                'sentiment': "Positive" if label_id == 1 else "Negative",
                'confidence': confidence,
            }
            for text, label_id, confidence in zip(texts, label_ids, confidences)
        ]

    def debug_model_performance(self, threshold=0.8):
        """
        Debug model performance and suggest improvements

        Evaluates the validation split and, when accuracy is below
        ``threshold``, prints suggestions and also evaluates the training
        split to flag a train/validation accuracy gap above 0.1.

        Args:
            threshold (float): Accuracy threshold for "good" performance
        """
        print("🔍 DEBUGGING MODEL PERFORMANCE")
        print("=" * 50)

        # Evaluate on validation set
        val_metrics, _, _ = self.evaluate_model("val")
        val_accuracy = val_metrics['test_accuracy']

        print(f"Current Validation Accuracy: {val_accuracy:.4f}")

        if val_accuracy < threshold:
            print(f"⚠️  Performance below threshold ({threshold})")
            print("\n🛠️  DEBUGGING SUGGESTIONS:")

            if val_accuracy < 0.6:
                advice = (
                    "1. CRITICAL: Very low accuracy detected!",
                    "   - Check data preprocessing and tokenization",
                    "   - Verify label encoding (0/1 for binary classification)",
                    "   - Consider using a smaller learning rate (1e-5)",
                    "   - Increase training epochs",
                )
            elif val_accuracy < 0.75:
                advice = (
                    "1. MODERATE: Below average performance",
                    "   - Try different model variants (distilbert, roberta)",
                    "   - Adjust learning rate and batch size",
                    "   - Implement data augmentation",
                    "   - Check for class imbalance",
                )
            else:
                advice = (
                    "1. MINOR: Close to threshold",
                    "   - Fine-tune hyperparameters",
                    "   - Increase training data",
                    "   - Try ensemble methods",
                )
            for line in advice:
                print(line)

            # Check for overfitting
            train_metrics, _, _ = self.evaluate_model("train")
            train_accuracy = train_metrics['test_accuracy']

            if train_accuracy - val_accuracy > 0.1:
                print("\n2. OVERFITTING DETECTED!")
                print("   - Add dropout or regularization")
                print("   - Reduce model complexity")
                print("   - Use early stopping")
                print("   - Increase validation data")

        else:
            print("✅ Model performance looks good!")

        print("\n📊 DETAILED METRICS:")
        for metric, value in val_metrics.items():
            if metric.startswith('test_'):
                clean_metric = metric.replace('test_', '').title()
                print(f"   {clean_metric}: {value:.4f}")


def main():
    """Run the complete BERT project: fine-tune, debug, evaluate, demo.

    Any exception raised inside the pipeline is caught at this top-level
    boundary, logged together with its traceback, and followed by a list of
    troubleshooting hints. It is not re-raised.
    """
    print("🚀 BERT SENTIMENT ANALYSIS PROJECT")
    print("=" * 50)

    model_name = "bert-base-uncased"
    output_dir = "./bert_sentiment_model"

    # Initialize analyzer
    analyzer = BERTSentimentAnalyzer(model_name=model_name)

    try:
        # Part 1: Fine-tuning BERT
        print("\n📋 PART 1: FINE-TUNING BERT")
        analyzer.load_model_and_tokenizer()
        analyzer.prepare_dataset(sample_size=5000)  # Use sample for faster training
        analyzer.prepare_data_for_training()

        # NOTE(review): a 5000-review sample leaves 4500 training examples
        # after the 10% validation split. At batch size 16 on a single device
        # that is about 282 steps per epoch, ~846 in total, so
        # warmup_steps=500 covers more than half of training and
        # eval_steps=500 yields a single evaluation -- early stopping
        # (patience 3) cannot trigger. Consider smaller values.
        analyzer.setup_training(
            output_dir=output_dir,
            num_train_epochs=3,
            batch_size=16,
            learning_rate=2e-5,
            warmup_steps=500,
            weight_decay=0.01,
            logging_steps=100,
            eval_steps=500,
            save_steps=500
        )

        # Train the model
        analyzer.train()

        # Part 2: Debugging Issues
        print("\n🔧 PART 2: DEBUGGING ISSUES")
        analyzer.debug_model_performance(threshold=0.85)

        # Part 3: Evaluating the Model
        print("\n📊 PART 3: MODEL EVALUATION")
        test_metrics, test_pred, test_true = analyzer.evaluate_model("test")

        # Plot results
        analyzer.plot_training_history()
        analyzer.plot_confusion_matrix(test_true, test_pred, "Test Set Confusion Matrix")

        # Part 4: Creative Application
        print("\n🎨 PART 4: CREATIVE APPLICATION")

        # Test on custom examples
        sample_texts = [
            "This movie was absolutely fantastic! I loved every minute of it.",
            "Terrible film, complete waste of time. Very disappointing.",
            "The movie was okay, nothing special but not bad either.",
            "Outstanding performance by the actors, brilliant cinematography!",
            "I fell asleep halfway through. Boring and predictable."
        ]

        predictions = analyzer.predict_sentiment(sample_texts)

        print("Sample Predictions:")
        for pred in predictions:
            print(f"Text: {pred['text'][:50]}...")
            print(f"Sentiment: {pred['sentiment']} (Confidence: {pred['confidence']:.3f})")
            print("-" * 60)

        # Final Summary
        print("\n🎯 PROJECT SUMMARY")
        print("=" * 50)
        print("✅ Model trained successfully")
        print(f"✅ Final Test Accuracy: {test_metrics['test_accuracy']:.4f}")
        print(f"✅ Final Test F1-Score: {test_metrics['test_f1']:.4f}")
        print(f"✅ Model saved to: {output_dir}")

        # Advanced techniques summary
        print("\n🔬 TECHNIQUES USED:")
        print(f"   • Pre-trained BERT ({model_name})")
        print("   • Fine-tuning with Hugging Face Trainer")
        print("   • Early stopping to prevent overfitting")
        print("   • Learning rate scheduling with warmup")
        print("   • Weight decay for regularization")
        print("   • Comprehensive evaluation metrics")
        print("   • Performance debugging and optimization")

    except Exception as e:
        print(f"❌ Error occurred: {str(e)}")
        # logger.exception records the traceback as well as the message, so
        # the failing call can be located from the log.
        logger.exception("Training failed: %s", e)

        # Debugging suggestions for common errors
        print("\n🛠️  COMMON ISSUES AND SOLUTIONS:")
        print("1. CUDA out of memory: Reduce batch_size to 8 or 4")
        print("2. Dataset loading issues: Check internet connection")
        print("3. Tokenization errors: Verify model_name is correct")
        print("4. Training crashes: Try reducing max_length to 256")

# Additional utility functions for advanced users
class AdvancedBERTFeatures:
    """Advanced features for BERT fine-tuning"""

    @staticmethod
    def data_augmentation(texts, labels, augment_factor=2, seed=None):
        """
        Augment a labelled text set with word-shuffled copies.

        Every original text is kept. Each text with more than five
        whitespace-separated words additionally gets ``augment_factor - 1``
        variants in which all words except the last are shuffled; shorter
        texts are not augmented. Variants immediately follow their original.

        Note: word shuffling is a placeholder technique -- replace it with
        proper augmentation (e.g. synonym replacement via nlpaug).

        Args:
            texts (list): Input texts
            labels (list): One label per text
            augment_factor (int): Upper bound on the size multiplier; 1 means
                "originals only"
            seed (int | None): Seed for the shuffle, for reproducible output

        Returns:
            tuple: ``(augmented_texts, augmented_labels)`` as two lists.

        Raises:
            ValueError: if the inputs differ in length or
                ``augment_factor`` is below 1.
        """
        import random  # local import: not part of the file's import block

        if len(texts) != len(labels):
            raise ValueError("texts and labels must have the same length")
        if augment_factor < 1:
            raise ValueError("augment_factor must be at least 1")

        rng = random.Random(seed)
        variants_per_text = int(augment_factor) - 1

        augmented_texts = []
        augmented_labels = []
        for text, label in zip(texts, labels):
            augmented_texts.append(text)
            augmented_labels.append(label)

            words = text.split()
            if len(words) <= 5:  # Only augment longer texts
                continue

            for _ in range(variants_per_text):
                head = words[:-1]
                rng.shuffle(head)  # Keep last word in place
                augmented_texts.append(' '.join(head + [words[-1]]))
                augmented_labels.append(label)

        return augmented_texts, augmented_labels

    @staticmethod
    def ensemble_predictions(models, texts, tokenizer):
        """
        Ensemble predictions from multiple models

        Averages the softmax probabilities of all models and returns the
        arg-max class per text.

        Args:
            models (list): List of trained models
            texts (list): Input texts
            tokenizer: Tokenizer for preprocessing (shared by all models)

        Returns:
            torch.Tensor: predicted class index per text, on the CPU.

        Raises:
            ValueError: if ``models`` is empty.
        """
        models = list(models)
        if not models:
            raise ValueError("ensemble_predictions requires at least one model")

        # Tokenization does not depend on the model, so do it once.
        encoded = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

        all_probabilities = []
        for model in models:
            # Each model may live on a different device.
            device = next(model.parameters()).device
            batch = {name: tensor.to(device) for name, tensor in encoded.items()}

            model.eval()  # disable dropout so predictions are deterministic
            with torch.no_grad():
                logits = model(**batch).logits
            # Gather on the CPU so outputs from different devices can be stacked.
            all_probabilities.append(torch.nn.functional.softmax(logits, dim=-1).cpu())

        # Average predictions
        ensemble_pred = torch.mean(torch.stack(all_probabilities), dim=0)
        return torch.argmax(ensemble_pred, dim=-1)

# Run the full pipeline only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()