# Text Sentiment Analysis
## Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the text sentiment classification dataset, then fine-tunes a DeBERTa model on it and writes a Kaggle submission file.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('default')
sns.set_palette("husl")



## 1. Data Loading and Initial Exploration

In [None]:
# Load the competition datasets from the Kaggle input directory.
DATA_DIR = '/kaggle/input/mc-datathon-2025-sentiment-analysis'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

print(" Dataset Overview")
print("="*50)
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
print("\n Training Data Info:")
train_df.info()
print("\n Test Data Info:")
test_df.info()

In [None]:
# Preview the leading rows of each split, then list the column names.
for heading, frame in (
    (" First 5 rows of Training Data:", train_df),
    ("\n First 5 rows of Test Data:", test_df),
):
    print(heading)
    print(frame.head())

train_columns = list(train_df.columns)
test_columns = list(test_df.columns)
print(f"\n Training data columns: {train_columns}")
print(f" Test data columns: {test_columns}")

In [None]:
# Per-column null counts for both splits.
print(" Missing Values Analysis:")
print("="*30)
for heading, frame in (("Training Data:", train_df), ("\nTest Data:", test_df)):
    print(heading)
    print(frame.isnull().sum())

# Count rows that are exact duplicates of an earlier row.
train_duplicates = train_df.duplicated().sum()
test_duplicates = test_df.duplicated().sum()
print(f"\n Duplicate rows in training data: {train_duplicates}")
print(f" Duplicate rows in test data: {test_duplicates}")

# Label inspection applies to the training split only.
if 'sentiment' in train_df.columns:
    sentiment_col = train_df['sentiment']
    print(f"\n Unique sentiments (before cleaning): {sentiment_col.unique()}")
    print(f"Number of unique sentiments: {sentiment_col.nunique()}")

    # Report how many rows carry no label at all.
    missing_sentiment_count = sentiment_col.isnull().sum()
    if missing_sentiment_count <= 0:
        print(" No missing sentiment values")
    else:
        print(f" Missing sentiment values: {missing_sentiment_count}")
        print("These will be handled during preprocessing...")

## 2. Target Variable Analysis (Sentiment Distribution)

In [None]:
# Class balance of the target, as raw counts and as percentage shares.
if 'sentiment' not in train_df.columns:
    print("No sentiment column found in training data")
else:
    sentiment_counts = train_df['sentiment'].value_counts()
    print(" Sentiment Distribution:")
    print(sentiment_counts)
    print(f"\nPercentages:")
    print(train_df['sentiment'].value_counts(normalize=True) * 100)

    # One palette shared by both panels.
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: count per class.
    sentiment_counts.plot(kind='bar', ax=ax1, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
    ax1.set_title('Sentiment Distribution (Count)', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Sentiment')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)

    # Right panel: share per class.
    ax2.pie(
        sentiment_counts.values,
        labels=sentiment_counts.index,
        autopct='%1.1f%%',
        colors=colors,
        startangle=90,
    )
    ax2.set_title('Sentiment Distribution (Percentage)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

## 3. Text Length Analysis

In [None]:
# Derive character-level and word-level length columns on both splits.
for frame in (train_df, test_df):
    frame['text_length'] = frame['text'].str.len()
    frame['word_count'] = frame['text'].str.split().str.len()

length_columns = ['text_length', 'word_count']

print(" Text Length Statistics:")
print("="*30)
print("Training Data:")
print(train_df[length_columns].describe())
print("\nTest Data:")
print(test_df[length_columns].describe())

In [None]:
# Visualize text length distribution by sentiment
if 'sentiment' in train_df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Skip missing labels: `sentiment == NaN` never matches any row, so a NaN
    # label would only add an empty histogram and a "nan" legend entry.
    sentiment_values = train_df['sentiment'].dropna().unique()

    # Top row: one overlaid histogram per length metric, one series per class.
    histogram_specs = [
        (axes[0, 0], 'text_length', 'Character Length Distribution by Sentiment', 'Character Length'),
        (axes[0, 1], 'word_count', 'Word Count Distribution by Sentiment', 'Word Count'),
    ]
    for ax, column, title, xlabel in histogram_specs:
        for sentiment in sentiment_values:
            subset = train_df[train_df['sentiment'] == sentiment]
            ax.hist(subset[column], alpha=0.7, label=sentiment, bins=30)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.legend()

    # Bottom row: box plots of the same two metrics.
    sns.boxplot(data=train_df, x='sentiment', y='text_length', ax=axes[1, 0])
    axes[1, 0].set_title('Character Length by Sentiment (Box Plot)')

    sns.boxplot(data=train_df, x='sentiment', y='word_count', ax=axes[1, 1])
    axes[1, 1].set_title('Word Count by Sentiment (Box Plot)')

    plt.tight_layout()
    plt.show()

## 4. Text Content Analysis

In [None]:
# Function to clean text for analysis
def clean_text_for_analysis(text):
    """Lowercase ``text`` and drop every character that is not an ASCII letter or whitespace.

    ``text`` must be a ``str``; whitespace is kept as-is (not collapsed or stripped).
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Sample some texts to see patterns
print(" Sample Texts by Sentiment:")
print("="*50)
if 'sentiment' in train_df.columns:
    # dropna(): a NaN label never equals itself, so it would only produce an
    # empty "nan Examples:" section.
    for sentiment in train_df['sentiment'].dropna().unique():
        print(f"\n{sentiment} Examples:")
        samples = train_df.loc[train_df['sentiment'] == sentiment, 'text'].head(3)
        for i, text in enumerate(samples, 1):
            # str() guards against missing texts: a NaN float cannot be sliced.
            print(f"{i}. {str(text)[:100]}...")

In [None]:
# Most common words by sentiment
def get_most_common_words(texts, n=20):
    """Return the ``n`` most frequent words across ``texts`` as ``(word, count)`` pairs.

    Each item is converted with ``str()``, normalised by
    ``clean_text_for_analysis``, and split on whitespace. Words of two
    characters or fewer are ignored.
    """
    word_counts = Counter()
    for raw in texts:
        tokens = clean_text_for_analysis(str(raw)).split()
        # Very short tokens are dropped before counting.
        word_counts.update(token for token in tokens if len(token) > 2)
    return word_counts.most_common(n)

print(" Most Common Words by Sentiment:")
print("="*40)
if 'sentiment' in train_df.columns:
    # dropna(): a NaN label matches no rows and would yield an empty list.
    for sentiment in train_df['sentiment'].dropna().unique():
        texts = train_df[train_df['sentiment'] == sentiment]['text']
        common_words = get_most_common_words(texts, 15)

        # Label each list with its class; without this heading the per-class
        # word lists run together and cannot be told apart.
        print(f"\n{sentiment}:")
        for word, count in common_words:
            print(f"  {word}: {count}")

## 5. Comparative Analysis: Train vs Test

In [None]:
# Compare the length features of the train and test splits.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
metrics = ['text_length', 'word_count']

# Top row: overlaid train/test histogram for each length metric.
histogram_panels = [
    (axes[0, 0], 'text_length', 'Character Length Distribution: Train vs Test', 'Character Length'),
    (axes[0, 1], 'word_count', 'Word Count Distribution: Train vs Test', 'Word Count'),
]
for panel, column, title, xlabel in histogram_panels:
    panel.hist(train_df[column], alpha=0.7, label='Train', bins=30, color='blue')
    panel.hist(test_df[column], alpha=0.7, label='Test', bins=30, color='red')
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')
    panel.legend()

# Summary statistics feeding the bar chart.
train_stats = train_df[metrics].describe()
test_stats = test_df[metrics].describe()
train_means = [train_stats.loc['mean', metric] for metric in metrics]
test_means = [test_stats.loc['mean', metric] for metric in metrics]

# Bottom-left: grouped bars of the per-split means.
x = np.arange(len(metrics))
width = 0.35
means_ax = axes[1, 0]
means_ax.bar(x - width/2, train_means, width, label='Train', color='blue', alpha=0.7)
means_ax.bar(x + width/2, test_means, width, label='Test', color='red', alpha=0.7)
means_ax.set_title('Mean Values: Train vs Test')
means_ax.set_xticks(x)
means_ax.set_xticklabels(metrics)
means_ax.legend()

# The fourth panel is unused, so remove it from the figure.
fig.delaxes(axes[1, 1])

plt.tight_layout()
plt.show()

In [None]:
# Mean and standard deviation of each length feature, per split.
print(" Train vs Test Statistics Comparison:")
print("="*50)
for heading, column in (("\nCharacter Length:", 'text_length'), ("\nWord Count:", 'word_count')):
    print(heading)
    print(f"Train - Mean: {train_df[column].mean():.2f}, Std: {train_df[column].std():.2f}")
    print(f"Test  - Mean: {test_df[column].mean():.2f}, Std: {test_df[column].std():.2f}")

# Flag texts at the extremes of the length range.
print("\n Data Quality Assessment:")
print("="*30)

# Texts shorter than 10 characters.
is_short_train = train_df['text_length'] < 10
is_short_test = test_df['text_length'] < 10
short_texts_train = train_df[is_short_train]
short_texts_test = test_df[is_short_test]
print(f"Very short texts (< 10 chars) - Train: {len(short_texts_train)}, Test: {len(short_texts_test)}")

# Texts longer than 500 characters.
is_long_train = train_df['text_length'] > 500
is_long_test = test_df['text_length'] > 500
long_texts_train = train_df[is_long_train]
long_texts_test = test_df[is_long_test]
print(f"Very long texts (> 500 chars) - Train: {len(long_texts_train)}, Test: {len(long_texts_test)}")

## 6. Key Insights and Recommendations

In [None]:
# Recap of the headline numbers gathered in the EDA sections.
has_sentiment = 'sentiment' in train_df.columns

print(" KEY INSIGHTS FROM EDA:")
print("="*50)
print(f"1. Dataset Size: {train_df.shape[0]} training samples, {test_df.shape[0]} test samples")

if has_sentiment:
    print(f"2. Sentiment Distribution:")
    for sentiment, count in train_df['sentiment'].value_counts().items():
        # Share is relative to all training rows.
        percentage = (count / len(train_df)) * 100
        print(f"   - {sentiment}: {count} ({percentage:.1f}%)")

train_lengths = train_df['text_length']
train_words = train_df['word_count']

print(f"\n3. Text Characteristics:")
print(f"   - Average text length: {train_lengths.mean():.1f} characters")
print(f"   - Average word count: {train_words.mean():.1f} words")
print(f"   - Text length range: {train_lengths.min()} - {train_lengths.max()} characters")

print(f"\n4. Data Quality:")
print(f"   - Missing values: {train_df.isnull().sum().sum()} in train, {test_df.isnull().sum().sum()} in test")
print(f"   - Duplicates: {train_df.duplicated().sum()} in train, {test_df.duplicated().sum()} in test")

print(f"\n5. Train vs Test Comparison:")
print(f"   - Text length difference: {abs(train_lengths.mean() - test_df['text_length'].mean()):.2f} chars")
print(f"   - Word count difference: {abs(train_words.mean() - test_df['word_count'].mean()):.2f} words")

# Static recommendations, printed one line at a time.
print(f"\n RECOMMENDATIONS:")
print("="*30)
for line in (
    "1. Text Preprocessing:",
    "   - Remove special characters and normalize text",
    "   - Handle case sensitivity",
    "   - Consider removing very short/long texts if they're outliers",
    "\n2.  Feature Engineering:",
    "   - Extract text length and word count as features",
    "   - Consider TF-IDF or word embeddings",
    "   - Analyze n-grams for sentiment patterns",
    "\n3.  Model Considerations:",
):
    print(line)

if has_sentiment:
    # Classes count as balanced when the largest is under twice the smallest.
    sentiment_counts = train_df['sentiment'].value_counts()
    is_balanced = max(sentiment_counts) / min(sentiment_counts) < 2
    if is_balanced:
        print("   - Classes are reasonably balanced")
    else:
        print("   - Handle class imbalance with sampling techniques")
print("   - Use stratified split for validation")
print("   - Consider ensemble methods for better performance")

print("\n✅ EDA Complete! Ready for model development.")

## 7. Model Development - DeBERTa for Sentiment Analysis

In [None]:
# Install required packages
!pip install transformers torch scikit-learn datasets accelerate tqdm -q

# Import additional libraries for modeling
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding,
    pipeline
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from torch.utils.data import DataLoader
import gc
from tqdm.auto import tqdm

# Report the accelerator, if any, and pick the torch device used below.
print(" Model libraries imported successfully!")
cuda_available = torch.cuda.is_available()
print(f" CUDA available: {cuda_available}")
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f" GPU: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f" GPU Memory: {gpu_total_bytes / 1024**3:.1f} GB")

# Use the GPU when one is present, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f" Using device: {device}")

In [None]:
# Data preprocessing for DeBERTa
def preprocess_text(text):
    """Return ``text`` lightly cleaned for the model; missing values become ``""``.

    The input is converted with ``str()``. URLs (``http...``) and ``@mentions``
    are removed, the ``#`` is stripped from hashtags while the word is kept,
    and runs of whitespace collapse to a single space. Case and punctuation
    are left untouched.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'#(\w+)', r'\1'),
        (r'\s+', ' '),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Preprocess the text data
print("🧹 Preprocessing text data...")
train_df['processed_text'] = train_df['text'].apply(preprocess_text)
test_df['processed_text'] = test_df['text'].apply(preprocess_text)

# Create label mapping
if 'sentiment' in train_df.columns:
    # Label names are handled as strings; missing values are excluded.
    unique_sentiments = train_df['sentiment'].dropna().unique()
    unique_sentiments = sorted([str(s) for s in unique_sentiments])  # Convert to string and sort

    print(f" Found sentiments: {unique_sentiments}")

    # Rows without a label cannot be used for supervised training.
    missing_sentiments = train_df['sentiment'].isnull().sum()
    if missing_sentiments > 0:
        print(f" Warning: {missing_sentiments} missing sentiment labels found!")
        # .copy() makes the filtered frame independent before columns are added.
        train_df = train_df.dropna(subset=['sentiment']).copy()
        print(f" After removing missing labels: {len(train_df)} samples remain")

    label2id = {label: i for i, label in enumerate(unique_sentiments)}
    id2label = {i: label for label, i in label2id.items()}

    print(f" Label mapping: {label2id}")

    # label2id is keyed by str(label), so values are stringified the same way
    # before the lookup; mapping raw non-string labels (e.g. ints) would miss
    # every key and yield NaN. No NaN remains here, so the result is cast to
    # int to give integer class ids.
    train_df['labels'] = train_df['sentiment'].map(str).map(label2id).astype(int)

    print(f" Preprocessing complete!")
    print(f" Training samples: {len(train_df)}")
    print(f" Test samples: {len(test_df)}")
else:
    print(" No sentiment column found in training data")

In [None]:
# Checkpoint and hyperparameters used for fine-tuning.
MODEL_NAME = "microsoft/deberta-v3-base"  # DeBERTa v3 base model
MAX_LENGTH = 512
BATCH_SIZE = 8  # Adjust based on GPU memory
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3

print(f" Loading {MODEL_NAME}...")

# Tokenizer belonging to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The classification head is sized from the label set, so labels are required.
if 'sentiment' not in train_df.columns:
    print(" Cannot initialize model without sentiment labels")
else:
    num_labels = len(unique_sentiments)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    ).to(device)  # place the weights on the selected device

    print(f" Model loaded with {num_labels} classes: {list(label2id.keys())}")
    print(f" Model moved to: {next(model.parameters()).device}")

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples from their ``processed_text`` field.

    Sequences are truncated to ``MAX_LENGTH`` but deliberately left unpadded.
    This notebook passes a ``DataCollatorWithPadding`` to the Trainer, which
    pads each mini-batch to its own longest sequence. Padding here as well
    would pad every ``Dataset.map`` chunk to that chunk's longest text, so
    most batches would carry padding they do not need. ``return_tensors`` is
    omitted because ``Dataset.map`` stores plain lists; the torch format is
    applied afterwards with ``set_format``.
    """
    return tokenizer(
        examples['processed_text'],
        truncation=True,
        max_length=MAX_LENGTH,
    )

# Echo the hyperparameters chosen for this run.
config_summary = [
    f" Model configuration:",
    f"   - Max length: {MAX_LENGTH}",
    f"   - Batch size: {BATCH_SIZE}",
    f"   - Learning rate: {LEARNING_RATE}",
    f"   - Epochs: {NUM_EPOCHS}",
]
print("\n".join(config_summary))

In [None]:
# GPU Verification and Optimization
print(" GPU Setup and Verification:")
print("="*35)

if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f" CUDA is available!")
    print(f" Device: {torch.cuda.get_device_name(0)}")
    print(f" Total Memory: {gpu_props.total_memory / 1024**3:.1f} GB")
    print(f" CUDA Version: {torch.version.cuda}")

    # Release cached allocator blocks before measuring free memory.
    torch.cuda.empty_cache()

    # Cap this process at 90% of device memory. The PyTorch call is
    # set_per_process_memory_fraction; torch.cuda has no `set_memory_fraction`,
    # so a hasattr() guard on that name never fires and the cap is never set.
    if hasattr(torch.cuda, 'set_per_process_memory_fraction'):
        torch.cuda.set_per_process_memory_fraction(0.9)
        print(" Memory fraction set to 90%")

    # Allow TensorFloat-32 for matmuls and cuDNN convolutions.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(" TensorFloat-32 optimizations enabled")

    # Report memory that is free right now rather than repeating the total.
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    print(f" Available Memory: {free_bytes / 1024**3:.1f} GB")
else:
    print(" CUDA not available - training will use CPU")
    print(" This will be significantly slower!")

print(f"\n Final device: {device}")
print("="*35)

In [None]:
# Hold out a stratified 20% of the labelled rows for validation, then build
# and tokenize the train / validation / test datasets.
if 'sentiment' in train_df.columns:
    all_texts = train_df['processed_text'].tolist()
    all_labels = train_df['labels'].tolist()
    X_train, X_val, y_train, y_val = train_test_split(
        all_texts,
        all_labels,
        test_size=0.2,
        random_state=42,
        stratify=train_df['labels'],
    )

    print(f" Dataset splits:")
    print(f"   - Training: {len(X_train)} samples")
    print(f"   - Validation: {len(X_val)} samples")
    print(f"   - Test: {len(test_df)} samples")

    # Wrap each split in a datasets.Dataset; the test split has no labels.
    train_dataset = Dataset.from_dict({'processed_text': X_train, 'labels': y_train})
    val_dataset = Dataset.from_dict({'processed_text': X_val, 'labels': y_val})
    test_dataset = Dataset.from_dict({'processed_text': test_df['processed_text'].tolist()})

    # Tokenize all three splits with the same function.
    print(" Tokenizing datasets...")
    train_dataset, val_dataset, test_dataset = (
        dataset_split.map(tokenize_function, batched=True)
        for dataset_split in (train_dataset, val_dataset, test_dataset)
    )

    # Expose only the model inputs (and labels where present) as torch tensors.
    labelled_columns = ['input_ids', 'attention_mask', 'labels']
    for dataset_split in (train_dataset, val_dataset):
        dataset_split.set_format(type='torch', columns=labelled_columns)
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    print(" Datasets prepared and tokenized!")

    # Class mix of the validation split, reported with label names.
    val_labels = [id2label[label_id] for label_id in y_val]
    val_dist = pd.Series(val_labels).value_counts()
    print(f"\n Validation set distribution:")
    n_val = len(y_val)
    for sentiment, count in val_dist.items():
        percentage = (count / n_val) * 100
        print(f"   - {sentiment}: {count} ({percentage:.1f}%)")

In [None]:
# Training configuration
def compute_metrics(eval_pred):
    """Return ``{'accuracy': ...}`` for an evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    of each row is the argmax of ``predictions`` along axis 1, and accuracy is
    computed against ``labels`` with ``accuracy_score``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return {'accuracy': accuracy_score(references, predicted_ids)}

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir='./results',
    logging_dir='./logs',
    report_to=[],
    # Schedule and optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    dataloader_drop_last=False,
    seed=42,
    # Logging, evaluation and checkpointing
    logging_steps=50,
    disable_tqdm=False,  # Enable tqdm progress bars
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Best-checkpoint selection: highest validation accuracy wins
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# The Trainer needs the model and datasets, which exist only with labels.
if 'sentiment' not in train_df.columns:
    print(" Cannot initialize trainer without sentiment labels")
else:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(" Trainer initialized successfully!")
    print(f" Training will run for {NUM_EPOCHS} epochs")
    print(f" Results will be saved to './results'")

In [None]:
# Fine-tune the model, report GPU memory around the run, and save the result.
def _print_gpu_memory(heading):
    """Print ``heading`` followed by the allocated and reserved CUDA memory in GB."""
    print(heading)
    print(f"   - Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"   - Cached: {torch.cuda.memory_reserved()/1024**3:.2f} GB")


if 'sentiment' not in train_df.columns:
    print(" Cannot train model without sentiment labels")
else:
    print(" Starting model training...")
    print("="*50)

    # Free cached GPU memory, then record the starting point.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        _print_gpu_memory(f" GPU Memory before training:")

    # Confirm where the weights live before starting.
    model_device = next(model.parameters()).device
    print(f" Model device: {model_device}")
    print(f" Training on: {device}")

    # Register tqdm's pandas integration.
    tqdm.pandas()

    print("\n Training Progress:")
    trainer.train()

    print("\n Training completed!")

    # Record memory use again once training has finished.
    if torch.cuda.is_available():
        _print_gpu_memory(f" GPU Memory after training:")

    # Persist the model and its tokenizer side by side.
    best_model_dir = './best_model'
    trainer.save_model(best_model_dir)
    tokenizer.save_pretrained(best_model_dir)

    print(" Model saved to './best_model'")

In [None]:
# Model evaluation
if 'sentiment' in train_df.columns:
    print(" Evaluating model on validation set...")
    print("="*40)

    # Aggregate metrics on the validation set.
    eval_results = trainer.evaluate()

    print(" Validation Results:")
    for key, value in eval_results.items():
        if isinstance(value, float):
            print(f"   - {key}: {value:.4f}")
        else:
            print(f"   - {key}: {value}")

    # Per-example predictions on the validation set.
    predictions = trainer.predict(val_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = y_val

    # Pin the full label set explicitly. Without `labels=`, scikit-learn infers
    # the classes from the data, so a class absent from both y_true and y_pred
    # makes `target_names` the wrong length and shifts the confusion-matrix
    # rows away from the class ids used to index it below.
    label_ids = list(range(len(id2label)))
    target_names = [id2label[i] for i in label_ids]

    # Detailed classification report
    print("\n Detailed Classification Report:")
    print("="*40)
    report = classification_report(y_true, y_pred, labels=label_ids, target_names=target_names)
    print(report)

    # Confusion matrix (rows: true class, columns: predicted class)
    print("\n Confusion Matrix:")
    print("="*25)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Create confusion matrix visualization
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix - DeBERTa Model')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

    # Per-class accuracy: the diagonal entry over its row total.
    print("\n Per-class Accuracy:")
    print("="*25)
    for i, sentiment in enumerate(target_names):
        class_correct = cm[i, i]
        class_total = cm[i, :].sum()
        class_accuracy = class_correct / class_total if class_total > 0 else 0
        print(f"   - {sentiment}: {class_accuracy:.3f} ({class_correct}/{class_total})")

    # Overall accuracy
    overall_accuracy = accuracy_score(y_true, y_pred)
    print(f"\n Overall Validation Accuracy: {overall_accuracy:.4f}")
else:
    print(" Cannot evaluate model without sentiment labels")

In [None]:
# Generate predictions on test set
if 'sentiment' in train_df.columns:
    print(" Generating predictions on test set...")
    print("="*40)

    # Predicted class id per test row.
    test_predictions = trainer.predict(test_dataset)
    test_pred_labels = np.argmax(test_predictions.predictions, axis=1)

    # Convert numerical predictions back to sentiment labels
    test_pred_sentiments = [id2label[pred] for pred in test_pred_labels]

    # Add predictions to test dataframe
    test_df['predicted_sentiment'] = test_pred_sentiments

    print(f" Generated {len(test_pred_sentiments)} predictions")

    # Show prediction distribution
    pred_dist = pd.Series(test_pred_sentiments).value_counts()
    print(f"\n Test Predictions Distribution:")
    for sentiment, count in pred_dist.items():
        percentage = (count / len(test_pred_sentiments)) * 100
        print(f"   - {sentiment}: {count} ({percentage:.1f}%)")

    # Show some sample predictions
    print(f"\n Sample Predictions:")
    print("="*30)
    # Never request more rows than exist (sampling without replacement would
    # raise), and seed the generator so the same rows appear on every run.
    sample_size = min(5, len(test_df))
    sample_rng = np.random.default_rng(42)
    sample_indices = sample_rng.choice(len(test_df), size=sample_size, replace=False)
    for i, idx in enumerate(sample_indices, 1):
        row = test_df.iloc[idx]
        # str() guards against missing texts: a NaN float cannot be sliced.
        text = str(row['text'])[:100]
        pred = row['predicted_sentiment']
        print(f"{i}. Text: {text}...")
        print(f"   Prediction: {pred}")
        print()
else:
    print(" Cannot generate predictions without trained model")

In [None]:
# Write the Kaggle submission file and print a short run summary.
if 'sentiment' not in train_df.columns or 'predicted_sentiment' not in test_df.columns:
    print(" Cannot create submission file without predictions")
else:
    print(" Creating Kaggle submission file...")
    print("="*35)

    # Use the test set's own ids when present; otherwise number rows from 0.
    has_id_column = 'id' in test_df.columns
    submission_ids = test_df['id'] if has_id_column else range(len(test_df))
    submission = pd.DataFrame({
        'id': submission_ids,
        'sentiment': test_df['predicted_sentiment'],
    })
    if not has_id_column:
        print(" No 'id' column found in test data, created sequential IDs")

    # Save without the DataFrame index.
    submission_filename = 'deberta_sentiment_submission.csv'
    submission.to_csv(submission_filename, index=False)

    print(f"Submission file saved as '{submission_filename}'")
    print(f" Submission shape: {submission.shape}")
    print(f"\n Submission file preview:")
    print(submission.head(10))

    # Class mix of the submitted predictions.
    print(f"\n Submission Statistics:")
    print("="*30)
    submission_dist = submission['sentiment'].value_counts()
    n_rows = len(submission)
    for sentiment, count in submission_dist.items():
        percentage = (count / n_rows) * 100
        print(f"   - {sentiment}: {count} ({percentage:.1f}%)")

    # Summary of the model and settings behind this submission.
    summary_lines = [
        f"\n Model Summary:",
        "="*20,
        f"   - Model: {MODEL_NAME}",
        f"   - Validation Accuracy: {overall_accuracy:.4f}",
        f"   - Training Epochs: {NUM_EPOCHS}",
        f"   - Max Length: {MAX_LENGTH}",
        f"   - Batch Size: {BATCH_SIZE}",
        f"   - Learning Rate: {LEARNING_RATE}",
    ]
    for line in summary_lines:
        print(line)

    print(f"\n Ready for Kaggle submission!")
    print(f" Submit file: {submission_filename}")