In [3]:
!pip install transformers[torch] datasets evaluate --upgrade

Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting transformers[torch]
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstal

In [5]:
# Google Colab Optimized Sentiment Analysis Training with RoBERTa
# Optimized for Colab environment with GPU support

# 1. Install and upgrade libraries (Colab specific)
!pip install --upgrade transformers datasets evaluate torch -q
!pip install accelerate -q  # For better GPU utilization

import os
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)
import logging
from IPython.display import clear_output

# Clear installation output
clear_output()

# Setup logging for Colab
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Set random seed for reproducibility
set_seed(42)

# Colab environment check
print("🔥 Google Colab Sentiment Analysis Training 🔥")
print("=" * 50)
print(f"📦 PyTorch version: {torch.__version__}")

# GPU setup for Colab
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🖥️  Device: {device}")
if torch.cuda.is_available():
    print(f"🚀 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("⚠️  WARNING: GPU not available. Training will be slow!")
print("=" * 50)

# 1. Model and tokenizer setup
model_name = 'roberta-base'
print(f"\n📚 Loading model and tokenizer: {model_name}")

try:
    tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
    print("✅ Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise e

# 2. Dataset loading and preprocessing
print("\n📊 Loading SST-2 dataset...")
try:
    train_dataset = load_dataset('sst2', split='train').shuffle(seed=42)
    eval_dataset = load_dataset('sst2', split='validation')
    print(f"✅ Train samples: {len(train_dataset):,}")
    print(f"✅ Validation samples: {len(eval_dataset):,}")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    raise e

def preprocess_function(examples):
    """Preprocess text data for tokenization"""
    tokenized = tokenizer(
        examples['sentence'],
        truncation=True,
        padding='max_length',
        max_length=128
    )
    tokenized['label'] = examples['label']
    return tokenized

print("\n🔄 Tokenizing datasets...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
print("✅ Tokenization completed!")

# 3. Load evaluation metrics
print("\n📈 Loading evaluation metrics...")
try:
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    print("✅ Metrics loaded successfully!")
except Exception as e:
    print(f"❌ Error loading metrics: {e}")
    raise e

def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
    f1 = f1_metric.compute(predictions=predictions, references=labels)['f1']
    precision = precision_metric.compute(predictions=predictions, references=labels)['precision']
    recall = recall_metric.compute(predictions=predictions, references=labels)['recall']

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# 4. Training arguments setup (Colab optimized)
print("\n⚙️  Setting up training configuration...")

# Colab-optimized batch sizes and settings
if torch.cuda.is_available():
    train_batch_size = 16
    eval_batch_size = 32
    dataloader_workers = 2
    use_fp16 = True
else:
    train_batch_size = 8  # Smaller for CPU
    eval_batch_size = 16
    dataloader_workers = 0
    use_fp16 = False

training_args = TrainingArguments(
    output_dir='/content/results',  # Colab path
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir='/content/logs',  # Colab path
    logging_steps=50,  # More frequent logging for Colab
    eval_strategy="steps",
    eval_steps=250,  # Evaluate more frequently
    save_strategy="steps",
    save_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    report_to="none",
    dataloader_num_workers=dataloader_workers,
    fp16=use_fp16,
    save_total_limit=2,
    remove_unused_columns=True,
    push_to_hub=False,  # Disable hub pushing in Colab
    disable_tqdm=False,  # Keep progress bars in Colab
)

print("✅ Training configuration set!")

# 5. Initialize trainer
print("\n🏃‍♂️ Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 6. Training progress callback for Colab
from transformers import TrainerCallback

class ColabProgressCallback(TrainerCallback):
    def __init__(self):
        self.training_loss = []
        self.eval_accuracy = []

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        if logs:
            if 'train_loss' in logs:
                self.training_loss.append(logs['train_loss'])
                print(f"🔥 Step {state.global_step}: Loss = {logs['train_loss']:.4f}")
            if 'eval_accuracy' in logs:
                self.eval_accuracy.append(logs['eval_accuracy'])
                print(f"🎯 Step {state.global_step}: Accuracy = {logs['eval_accuracy']:.4f}")

progress_callback = ColabProgressCallback()
trainer.add_callback(progress_callback)

# 7. Start training
print("\n" + "🚀" + "="*48 + "🚀")
print("                STARTING TRAINING")
print("🚀" + "="*48 + "🚀")
print("⏰ Estimated time:")
if torch.cuda.is_available():
    print("   - GPU (T4): ~15-20 minutes")
    print("   - GPU (V100): ~8-10 minutes")
else:
    print("   - CPU: ~2-3 hours (not recommended)")

print("\n💡 Tip: Keep this tab active to prevent Colab from disconnecting!")
print("-" * 50)

try:
    training_results = trainer.train()
    print("\n✅ Training completed successfully!")
except Exception as e:
    print(f"\n❌ Training failed: {e}")
    raise e

# 8. Final evaluation
print("\n🔍 Running final evaluation...")
final_metrics = trainer.evaluate()

# 9. Print results with emojis for Colab
print("\n" + "🏆" + "="*58 + "🏆")
print("                    FINAL RESULTS")
print("🏆" + "="*58 + "🏆")

for key, value in final_metrics.items():
    if key.startswith('eval_') and key not in ['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second']:
        metric_name = key.replace('eval_', '').replace('_', ' ').title()
        if isinstance(value, float):
            if 'accuracy' in key.lower():
                emoji = "🎯"
            elif 'f1' in key.lower():
                emoji = "⚖️"
            elif 'precision' in key.lower():
                emoji = "🔍"
            elif 'recall' in key.lower():
                emoji = "📊"
            else:
                emoji = "📈"
            print(f"{emoji} {metric_name:<15}: {value*100:.2f}%")

if 'eval_loss' in final_metrics:
    print(f"📉 {'Loss':<15}: {final_metrics['eval_loss']:.4f}")

print("🏆" + "="*58 + "🏆")

# 10. Save model to Google Drive (optional)
print("\n💾 Saving model...")
model_save_path = "/content/final_sentiment_model"

try:
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"✅ Model saved to: {model_save_path}")

    # Optional: Save to Google Drive
    print("\n💡 To save to Google Drive, run:")
    print("from google.colab import drive")
    print("drive.mount('/content/drive')")
    print("!cp -r /content/final_sentiment_model /content/drive/MyDrive/")

except Exception as e:
    print(f"❌ Error saving model: {e}")

# 11. Test predictions
print("\n" + "🧪" + "="*48 + "🧪")
print("                TESTING PREDICTIONS")
print("🧪" + "="*48 + "🧪")

test_sentences = [
    "This movie is absolutely fantastic and amazing!",
    "I hate this boring and terrible film.",
    "The acting was decent but the plot was confusing.",
    "Great story, wonderful characters!",
    "Worst movie ever, completely disappointing.",
    "The cinematography was beautiful.",
    "Not bad, but could be better."
]

model.eval()
for i, sentence in enumerate(test_sentences, 1):
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128
    )

    # Move to GPU if available
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(prediction, dim=-1).item()
        confidence = prediction[0][predicted_class].item()

    if predicted_class == 1:
        sentiment = "Positive 😊"
        emoji = "✅"
    else:
        sentiment = "Negative 😞"
        emoji = "❌"

    print(f"\n{emoji} Test {i}:")
    print(f"   📝 Text: {sentence}")
    print(f"   🎯 Prediction: {sentiment}")
    print(f"   📊 Confidence: {confidence*100:.1f}%")

# 12. Training summary
print("\n" + "📈" + "="*48 + "📈")
print("                TRAINING SUMMARY")
print("📈" + "="*48 + "📈")

try:
    if progress_callback.training_loss:
        print(f"🔥 Initial loss: {progress_callback.training_loss[0]:.4f}")
        print(f"🎯 Final loss: {progress_callback.training_loss[-1]:.4f}")

    if progress_callback.eval_accuracy:
        print(f"🏆 Best accuracy: {max(progress_callback.eval_accuracy)*100:.2f}%")

    print(f"⏱️  Total training steps: {trainer.state.global_step}")
    print(f"🔄 Epochs completed: {trainer.state.epoch}")

except Exception as e:
    print("📊 Training history not available")

print("\n🎉 CONGRATULATIONS! Training completed successfully! 🎉")
print("\n📱 Next steps:")
print("1. Test more examples above")
print("2. Save to Google Drive if needed")
print("3. Use the model for your own text classification!")

print(f"\n🔧 Model loading code for future use:")
print("```python")
print(f"from transformers import RobertaTokenizerFast, RobertaForSequenceClassification")
print(f"model = RobertaForSequenceClassification.from_pretrained('{model_save_path}')")
print(f"tokenizer = RobertaTokenizerFast.from_pretrained('{model_save_path}')")
print("```")

🔥 Google Colab Sentiment Analysis Training 🔥
📦 PyTorch version: 2.8.0+cu126
🖥️  Device: cuda
🚀 GPU: Tesla T4
💾 GPU Memory: 14.7 GB

📚 Loading model and tokenizer: roberta-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model and tokenizer loaded successfully!

📊 Loading SST-2 dataset...
✅ Train samples: 67,349
✅ Validation samples: 872

🔄 Tokenizing datasets...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

✅ Tokenization completed!

📈 Loading evaluation metrics...
✅ Metrics loaded successfully!

⚙️  Setting up training configuration...
✅ Training configuration set!

🏃‍♂️ Initializing trainer...

                STARTING TRAINING
⏰ Estimated time:
   - GPU (T4): ~15-20 minutes
   - GPU (V100): ~8-10 minutes

💡 Tip: Keep this tab active to prevent Colab from disconnecting!
--------------------------------------------------


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
250,0.3513,0.242969,0.912844,0.911628,0.942308,0.882883
500,0.2596,0.2189,0.919725,0.919169,0.943128,0.896396
750,0.25,0.26811,0.913991,0.91018,0.971867,0.855856
1000,0.2402,0.193825,0.93578,0.936652,0.940909,0.932432
1250,0.2063,0.1925,0.934633,0.936027,0.932886,0.939189
1500,0.211,0.270254,0.916284,0.921758,0.879346,0.968468
1750,0.1895,0.219779,0.936927,0.939361,0.920086,0.959459
2000,0.1835,0.186398,0.940367,0.941573,0.939462,0.943694
2250,0.1561,0.242346,0.924312,0.92779,0.902128,0.954955
2500,0.1544,0.211072,0.932339,0.934807,0.91757,0.952703


🎯 Step 250: Accuracy = 0.9128
🎯 Step 500: Accuracy = 0.9197
🎯 Step 750: Accuracy = 0.9140
🎯 Step 1000: Accuracy = 0.9358
🎯 Step 1250: Accuracy = 0.9346
🎯 Step 1500: Accuracy = 0.9163
🎯 Step 1750: Accuracy = 0.9369
🎯 Step 2000: Accuracy = 0.9404
🎯 Step 2250: Accuracy = 0.9243
🎯 Step 2500: Accuracy = 0.9323


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
250,0.3513,0.242969,0.912844,0.911628,0.942308,0.882883
500,0.2596,0.2189,0.919725,0.919169,0.943128,0.896396
750,0.25,0.26811,0.913991,0.91018,0.971867,0.855856
1000,0.2402,0.193825,0.93578,0.936652,0.940909,0.932432
1250,0.2063,0.1925,0.934633,0.936027,0.932886,0.939189
1500,0.211,0.270254,0.916284,0.921758,0.879346,0.968468
1750,0.1895,0.219779,0.936927,0.939361,0.920086,0.959459
2000,0.1835,0.186398,0.940367,0.941573,0.939462,0.943694
2250,0.1561,0.242346,0.924312,0.92779,0.902128,0.954955
2500,0.1544,0.211072,0.932339,0.934807,0.91757,0.952703


🎯 Step 2750: Accuracy = 0.9323
🔥 Step 2750: Loss = 0.2385

✅ Training completed successfully!

🔍 Running final evaluation...


🎯 Step 2750: Accuracy = 0.9404

                    FINAL RESULTS
🎯 Accuracy       : 94.04%
⚖️ F1             : 94.16%
🔍 Precision      : 93.95%
📊 Recall         : 94.37%
📉 Loss           : 0.1864

💾 Saving model...
✅ Model saved to: /content/final_sentiment_model

💡 To save to Google Drive, run:
from google.colab import drive
drive.mount('/content/drive')
!cp -r /content/final_sentiment_model /content/drive/MyDrive/

                TESTING PREDICTIONS

✅ Test 1:
   📝 Text: This movie is absolutely fantastic and amazing!
   🎯 Prediction: Positive 😊
   📊 Confidence: 99.8%

❌ Test 2:
   📝 Text: I hate this boring and terrible film.
   🎯 Prediction: Negative 😞
   📊 Confidence: 99.7%

❌ Test 3:
   📝 Text: The acting was decent but the plot was confusing.
   🎯 Prediction: Negative 😞
   📊 Confidence: 96.6%

✅ Test 4:
   📝 Text: Great story, wonderful characters!
   🎯 Prediction: Positive 😊
   📊 Confidence: 99.8%

❌ Test 5:
   📝 Text: Worst movie ever, completely disappointing.
   🎯 Predicti

In [6]:
# 포괄적 NLP 태스크 성능 평가: BERTweet vs RoBERTa 비교
# POS Tagging, NER, Sentiment Analysis, Irony Detection 모든 태스크 포함

# 필요한 라이브러리 설치
!pip install --upgrade transformers datasets evaluate torch seqeval -q
!pip install accelerate scikit-learn -q

import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizerFast, RobertaForSequenceClassification, RobertaForTokenClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback, set_seed,
    DataCollatorForTokenClassification
)
import evaluate
from sklearn.metrics import classification_report, confusion_matrix
import json
from IPython.display import clear_output, display
import warnings
warnings.filterwarnings('ignore')

clear_output()
set_seed(42)

print("🚀 포괄적 NLP 태스크 성능 평가 시스템")
print("=" * 60)
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("=" * 60)

class ComprehensiveNLPEvaluator:
    def __init__(self, model_name='roberta-base'):
        self.model_name = model_name
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        self.results = {}

        # 평가 지표 로드
        self.accuracy_metric = evaluate.load("accuracy")
        self.f1_metric = evaluate.load("f1")
        self.precision_metric = evaluate.load("precision")
        self.recall_metric = evaluate.load("recall")
        self.seqeval_metric = evaluate.load("seqeval")

    def prepare_sequence_classification_data(self, dataset_name, text_column, label_column):
        """시퀀스 분류 데이터 준비"""
        print(f"\n📊 Loading {dataset_name} dataset...")

        if dataset_name == "sst2":
            train_dataset = load_dataset('sst2', split='train')
            eval_dataset = load_dataset('sst2', split='validation')
        elif dataset_name == "tweet_eval_sentiment":
            train_dataset = load_dataset('tweet_eval', 'sentiment', split='train')
            eval_dataset = load_dataset('tweet_eval', 'sentiment', split='validation')
        elif dataset_name == "tweet_eval_irony":
            train_dataset = load_dataset('tweet_eval', 'irony', split='train')
            eval_dataset = load_dataset('tweet_eval', 'irony', split='validation')

        def preprocess_function(examples):
            tokenized = self.tokenizer(
                examples[text_column],
                truncation=True,
                padding='max_length',
                max_length=128
            )
            tokenized['labels'] = examples[label_column]
            return tokenized

        tokenized_train = train_dataset.map(preprocess_function, batched=True)
        tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

        print(f"✅ {dataset_name}: Train={len(tokenized_train)}, Eval={len(tokenized_eval)}")
        return tokenized_train, tokenized_eval

    def prepare_token_classification_data(self, dataset_name):
        """토큰 분류 데이터 준비 (POS tagging, NER)"""
        print(f"\n📊 Loading {dataset_name} dataset...")

        if dataset_name == "conll2003_ner":
            train_dataset = load_dataset('conll2003', split='train')
            eval_dataset = load_dataset('conll2003', split='validation')
            label_names = train_dataset.features['ner_tags'].feature.names

        elif dataset_name == "pos_tags":
            # Universal POS tags dataset 사용
            train_dataset = load_dataset('universal_dependencies', 'en_ewt', split='train')
            eval_dataset = load_dataset('universal_dependencies', 'en_ewt', split='validation')
            label_names = train_dataset.features['upos'].feature.names

        def tokenize_and_align_labels(examples, label_column='ner_tags'):
            tokenized_inputs = self.tokenizer(
                examples['tokens'],
                truncation=True,
                is_split_into_words=True,
                padding='max_length',
                max_length=128
            )

            labels = []
            for i, label in enumerate(examples[label_column]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []

                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx

                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        if dataset_name == "conll2003_ner":
            tokenized_train = train_dataset.map(
                lambda x: tokenize_and_align_labels(x, 'ner_tags'),
                batched=True
            )
            tokenized_eval = eval_dataset.map(
                lambda x: tokenize_and_align_labels(x, 'ner_tags'),
                batched=True
            )
        else:  # POS tagging
            tokenized_train = train_dataset.map(
                lambda x: tokenize_and_align_labels(x, 'upos'),
                batched=True
            )
            tokenized_eval = eval_dataset.map(
                lambda x: tokenize_and_align_labels(x, 'upos'),
                batched=True
            )

        print(f"✅ {dataset_name}: Train={len(tokenized_train)}, Eval={len(tokenized_eval)}")
        return tokenized_train, tokenized_eval, label_names

    def compute_classification_metrics(self, eval_pred, average='weighted'):
        """분류 태스크 평가 지표 계산"""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        accuracy = self.accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
        f1 = self.f1_metric.compute(predictions=predictions, references=labels, average=average)['f1']
        precision = self.precision_metric.compute(predictions=predictions, references=labels, average=average)['precision']
        recall = self.recall_metric.compute(predictions=predictions, references=labels, average=average)['recall']

        return {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall
        }

    def compute_token_classification_metrics(self, eval_pred, label_names):
        """토큰 분류 태스크 평가 지표 계산"""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        # -100을 제거하고 실제 라벨로 변환
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = self.seqeval_metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "accuracy": results["overall_accuracy"],
            "f1": results["overall_f1"],
            "precision": results["overall_precision"],
            "recall": results["overall_recall"]
        }

    def train_and_evaluate_classification(self, dataset_name, text_column, label_column, num_labels):
        """분류 태스크 학습 및 평가"""
        print(f"\n🔥 Training {dataset_name} Classification Model")
        print("-" * 50)

        # 데이터 준비
        train_dataset, eval_dataset = self.prepare_sequence_classification_data(
            dataset_name, text_column, label_column
        )

        # 모델 초기화
        model = RobertaForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels
        )

        # 훈련 설정
        training_args = TrainingArguments(
            output_dir=f'./results_{dataset_name}',
            learning_rate=2e-5,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            weight_decay=0.01,
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True,
            report_to="none",
            save_total_limit=2,
        )

        # Trainer 초기화
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self.compute_classification_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # 학습 실행
        trainer.train()

        # 최종 평가
        final_metrics = trainer.evaluate()

        # 결과 저장
        self.results[dataset_name] = {
            'task_type': 'classification',
            'accuracy': final_metrics['eval_accuracy'],
            'f1': final_metrics['eval_f1'],
            'precision': final_metrics['eval_precision'],
            'recall': final_metrics['eval_recall'],
            'model_path': f'./results_{dataset_name}'
        }

        print(f"✅ {dataset_name} Results:")
        print(f"   Accuracy: {final_metrics['eval_accuracy']*100:.2f}%")
        print(f"   F1: {final_metrics['eval_f1']*100:.2f}%")

        return final_metrics

    def train_and_evaluate_token_classification(self, dataset_name, num_labels):
        """토큰 분류 태스크 학습 및 평가"""
        print(f"\n🔥 Training {dataset_name} Token Classification Model")
        print("-" * 50)

        # 데이터 준비
        train_dataset, eval_dataset, label_names = self.prepare_token_classification_data(dataset_name)

        # 모델 초기화
        model = RobertaForTokenClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels
        )

        # 데이터 콜레이터
        data_collator = DataCollatorForTokenClassification(
            tokenizer=self.tokenizer,
            padding=True
        )

        # 훈련 설정
        training_args = TrainingArguments(
            output_dir=f'./results_{dataset_name}',
            learning_rate=2e-5,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            weight_decay=0.01,
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True,
            report_to="none",
            save_total_limit=2,
        )

        # Trainer 초기화
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            compute_metrics=lambda eval_pred: self.compute_token_classification_metrics(eval_pred, label_names),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # 학습 실행
        trainer.train()

        # 최종 평가
        final_metrics = trainer.evaluate()

        # 결과 저장
        self.results[dataset_name] = {
            'task_type': 'token_classification',
            'accuracy': final_metrics['eval_accuracy'],
            'f1': final_metrics['eval_f1'],
            'precision': final_metrics['eval_precision'],
            'recall': final_metrics['eval_recall'],
            'model_path': f'./results_{dataset_name}'
        }

        print(f"✅ {dataset_name} Results:")
        print(f"   Accuracy: {final_metrics['eval_accuracy']*100:.2f}%")
        print(f"   F1: {final_metrics['eval_f1']*100:.2f}%")

        return final_metrics

    def run_all_evaluations(self):
        """모든 태스크 실행"""
        print("🚀 Starting Comprehensive NLP Evaluation")
        print("=" * 60)

        tasks = [
            # 분류 태스크
            ("sst2", "sentence", "label", 2, "classification"),
            ("tweet_eval_sentiment", "text", "label", 3, "classification"),
            ("tweet_eval_irony", "text", "label", 2, "classification"),

            # 토큰 분류 태스크
            ("conll2003_ner", None, None, 9, "token_classification"),  # B-PER, I-PER, B-ORG, etc.
        ]

        for task_name, text_col, label_col, num_labels, task_type in tasks:
            try:
                if task_type == "classification":
                    self.train_and_evaluate_classification(task_name, text_col, label_col, num_labels)
                else:
                    self.train_and_evaluate_token_classification(task_name, num_labels)

            except Exception as e:
                print(f"❌ Error in {task_name}: {e}")
                self.results[task_name] = {'error': str(e)}

        self.generate_final_report()

    def generate_final_report(self):
        """최종 성능 보고서 생성"""
        print("\n" + "🏆" + "=" * 58 + "🏆")
        print("                    COMPREHENSIVE RESULTS")
        print("🏆" + "=" * 58 + "🏆")

        # 결과를 BERTweet 논문과 비교할 수 있는 형태로 정리
        comparison_data = {
            'Task': [],
            'Dataset': [],
            'Our_Accuracy': [],
            'Our_F1': [],
            'Our_Precision': [],
            'Our_Recall': [],
            'BERTweet_Accuracy': [],  # 논문 결과 (수동 입력 필요)
            'BERTweet_F1': []
        }

        for task_name, metrics in self.results.items():
            if 'error' not in metrics:
                task_type = 'POS Tagging' if 'pos' in task_name else \
                           'NER' if 'ner' in task_name else \
                           'Sentiment Analysis' if 'sentiment' in task_name else \
                           'Irony Detection' if 'irony' in task_name else 'Classification'

                comparison_data['Task'].append(task_type)
                comparison_data['Dataset'].append(task_name)
                comparison_data['Our_Accuracy'].append(f"{metrics['accuracy']*100:.2f}%")
                comparison_data['Our_F1'].append(f"{metrics['f1']*100:.2f}%")
                comparison_data['Our_Precision'].append(f"{metrics['precision']*100:.2f}%")
                comparison_data['Our_Recall'].append(f"{metrics['recall']*100:.2f}%")

                # BERTweet 결과는 논문에서 가져온 값들 (해당하는 경우만)
                if 'sentiment' in task_name:
                    comparison_data['BERTweet_Accuracy'].append('73.2%')  # SemEval2017 기준
                    comparison_data['BERTweet_F1'].append('72.8%')
                elif 'irony' in task_name:
                    comparison_data['BERTweet_Accuracy'].append('78.2%')  # SemEval2018 기준
                    comparison_data['BERTweet_F1'].append('74.6%')
                else:
                    comparison_data['BERTweet_Accuracy'].append('N/A')
                    comparison_data['BERTweet_F1'].append('N/A')

        # 데이터프레임 생성 및 출력
        df = pd.DataFrame(comparison_data)
        print("\n📊 Detailed Results Comparison:")
        display(df)

        # 성능 요약
        print(f"\n📈 Performance Summary:")
        for task_name, metrics in self.results.items():
            if 'error' not in metrics:
                print(f"📋 {task_name}:")
                print(f"   🎯 Accuracy: {metrics['accuracy']*100:.2f}%")
                print(f"   ⚖️  F1: {metrics['f1']*100:.2f}%")
                print(f"   🔍 Precision: {metrics['precision']*100:.2f}%")
                print(f"   📊 Recall: {metrics['recall']*100:.2f}%")
                print()

        # JSON으로 결과 저장
        with open('comprehensive_results.json', 'w') as f:
            json.dump(self.results, f, indent=2, default=str)

        print("💾 Results saved to 'comprehensive_results.json'")
        print("🏆" + "=" * 58 + "🏆")

# 평가 실행
if __name__ == "__main__":
    evaluator = ComprehensiveNLPEvaluator('roberta-base')
    evaluator.run_all_evaluations()

    print("\n🎉 Comprehensive evaluation completed!")
    print("📋 All results have been saved and are ready for paper comparison!")

🚀 포괄적 NLP 태스크 성능 평가 시스템
Device: CUDA
GPU: Tesla T4


Downloading builder script: 0.00B [00:00, ?B/s]

🚀 Starting Comprehensive NLP Evaluation

🔥 Training sst2 Classification Model
--------------------------------------------------

📊 Loading sst2 dataset...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

✅ sst2: Train=67349, Eval=872


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.4174,0.300473,0.876147,0.875715,0.8798,0.876147
400,0.3471,0.35971,0.900229,0.900017,0.905537,0.900229
600,0.3098,0.259998,0.912844,0.912761,0.913643,0.912844
800,0.2906,0.251884,0.915138,0.9151,0.91541,0.915138
1000,0.292,0.218425,0.923165,0.923118,0.923619,0.923165
1200,0.2517,0.298762,0.909404,0.909337,0.911883,0.909404
1400,0.2487,0.35751,0.888761,0.887849,0.899077,0.888761
1600,0.2513,0.213809,0.928899,0.928887,0.928962,0.928899
1800,0.2462,0.221157,0.933486,0.933489,0.933501,0.933486
2000,0.2506,0.256024,0.925459,0.925464,0.925764,0.925459


✅ sst2 Results:
   Accuracy: 93.35%
   F1: 93.35%

🔥 Training tweet_eval_sentiment Classification Model
--------------------------------------------------

📊 Loading tweet_eval_sentiment dataset...


README.md: 0.00B [00:00, ?B/s]

sentiment/train-00000-of-00001.parquet:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

sentiment/test-00000-of-00001.parquet:   0%|          | 0.00/901k [00:00<?, ?B/s]

sentiment/validation-00000-of-00001.parq(…):   0%|          | 0.00/167k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

✅ tweet_eval_sentiment: Train=45615, Eval=2000


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.7695,0.702508,0.684,0.676357,0.706001,0.684
400,0.715,0.651009,0.724,0.725952,0.733434,0.724
600,0.6945,0.729215,0.659,0.639655,0.690519,0.659
800,0.6415,0.657599,0.713,0.706018,0.741754,0.713
1000,0.6306,0.634538,0.7115,0.711971,0.735472,0.7115


✅ tweet_eval_sentiment Results:
   Accuracy: 72.40%
   F1: 72.60%

🔥 Training tweet_eval_irony Classification Model
--------------------------------------------------

📊 Loading tweet_eval_irony dataset...


irony/train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

irony/test-00000-of-00001.parquet:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

irony/validation-00000-of-00001.parquet:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2862 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/955 [00:00<?, ? examples/s]

Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

✅ tweet_eval_irony: Train=2862, Eval=955


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.5559,0.579419,0.722513,0.72056,0.724611,0.722513
400,0.4372,0.544858,0.75288,0.752515,0.758962,0.75288


✅ tweet_eval_irony Results:
   Accuracy: 75.29%
   F1: 75.25%

🔥 Training conll2003_ner Token Classification Model
--------------------------------------------------

📊 Loading conll2003_ner dataset...


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

❌ Error in conll2003_ner: Dataset scripts are no longer supported, but found conll2003.py

                    COMPREHENSIVE RESULTS

📊 Detailed Results Comparison:


Unnamed: 0,Task,Dataset,Our_Accuracy,Our_F1,Our_Precision,Our_Recall,BERTweet_Accuracy,BERTweet_F1
0,Classification,sst2,93.35%,93.35%,93.35%,93.35%,,
1,Sentiment Analysis,tweet_eval_sentiment,72.40%,72.60%,73.34%,72.40%,73.2%,72.8%
2,Irony Detection,tweet_eval_irony,75.29%,75.25%,75.90%,75.29%,78.2%,74.6%



📈 Performance Summary:
📋 sst2:
   🎯 Accuracy: 93.35%
   ⚖️  F1: 93.35%
   🔍 Precision: 93.35%
   📊 Recall: 93.35%

📋 tweet_eval_sentiment:
   🎯 Accuracy: 72.40%
   ⚖️  F1: 72.60%
   🔍 Precision: 73.34%
   📊 Recall: 72.40%

📋 tweet_eval_irony:
   🎯 Accuracy: 75.29%
   ⚖️  F1: 75.25%
   🔍 Precision: 75.90%
   📊 Recall: 75.29%

💾 Results saved to 'comprehensive_results.json'

🎉 Comprehensive evaluation completed!
📋 All results have been saved and are ready for paper comparison!
