In [None]:
# ==========================================
# Airline Tweet Classifier - 1000 Samples
# DistilBERT + Focal Loss + 8 Epochs
# Auto ZIP Export for Streamlit
# ==========================================

# 1. Install dependencies
!pip install transformers datasets scikit-learn pandas torch evaluate -q

# 2. Imports
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
import evaluate
import joblib
import shutil, os
from google.colab import files

# 3. Load dataset
df = pd.read_csv("AirlineTweets.csv")
# Keep only rows carrying one of the three sentiment labels used for training.
df = df[df["airline_sentiment"].isin(["positive", "neutral", "negative"])]

# Balance to ~1000 samples (333/class)
samples_per_class = 333
# Sample each class explicitly instead of going through groupby().apply():
# apply() operating on the grouping column is deprecated in recent pandas, and
# once that column is excluded the "airline_sentiment" column needed by the
# label-encoding step would be missing. Iterating the groups always yields
# full rows and draws exactly the same rows (same per-group seed).
df_balanced = pd.concat(
    [
        group.sample(min(len(group), samples_per_class), random_state=42)
        for _, group in df.groupby("airline_sentiment")
    ]
).reset_index(drop=True)

# Column holding the raw tweet text, and column holding the sentiment string.
text_column = "text"
label_column = "airline_sentiment"

# 4. Encode labels
# Fit the encoder on the sentiment strings, then store the integer ids in a
# separate "label" column next to the original strings.
label_encoder = LabelEncoder().fit(df_balanced[label_column])
df_balanced["label"] = label_encoder.transform(df_balanced[label_column])
# Persist the fitted encoder; it is bundled into the export package later on.
joblib.dump(label_encoder, "label_encoder.pkl")

# 5. Train-test split
# 80/20 split, stratified on the encoded label, with a fixed seed.
train_df, test_df = train_test_split(
    df_balanced, test_size=0.2, stratify=df_balanced["label"], random_state=42
)

# 6. Convert to Hugging Face Dataset
# Both partitions go through the same conversion, train first and test second.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# 7. Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize a batch of rows taken from the text column.

    Args:
        example: Batch mapping passed by ``Dataset.map(batched=True)``; the
            entry under ``text_column`` holds the tweets to encode.

    Returns:
        The tokenizer output for the batch (merged into the dataset by ``map``).
    """
    # No padding here: padding="max_length" without an explicit max_length
    # pads every short tweet to the model's maximum length, which multiplies
    # training cost for no benefit. The Trainer below is given the tokenizer,
    # so its default collator pads each batch to its own longest sequence.
    return tokenizer(
        example[text_column],
        truncation=True,
    )


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 8. Remove unnecessary columns
# Drop the raw sentiment string, the raw text, and the pandas index column
# from both splits.
columns_to_drop = [label_column, text_column, "__index_level_0__"]
train_dataset = train_dataset.remove_columns(columns_to_drop)
test_dataset = test_dataset.remove_columns(columns_to_drop)

# 9. Compute class weights
# "balanced" weights computed from the training labels only, then converted to
# a float32 tensor for use as the per-class weight of the loss.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(train_df["label"]),
        y=train_df["label"],
    ),
    dtype=torch.float,
)

# 10. Load model
# The classification head gets one output per class known to the label encoder.
num_classes = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_classes
)

# 11. Define Focal Loss
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * (-log p_t)``.

    Args:
        alpha: Optional 1-D tensor of per-class weights (one entry per class),
            or ``None`` for an unweighted loss.
        gamma: Focusing exponent; ``0`` reduces the loss to (weighted)
            cross-entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            per-sample losses unreduced.
    """

    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Raw logits, one row per sample and one column per class.
            targets: Integer class index for each sample.

        Returns:
            A scalar tensor for ``'mean'``/``'sum'`` reduction, otherwise a
            tensor with one loss value per sample.
        """
        cross_entropy = torch.nn.functional.cross_entropy

        # p_t must be the model's actual probability for the true class, so it
        # is derived from the UNWEIGHTED cross-entropy. Deriving it from the
        # weighted value would give p_t ** alpha_t and distort the
        # (1 - p_t) ** gamma modulating factor.
        plain_ce = cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-plain_ce)

        if self.alpha is None:
            weighted_ce = plain_ce
        else:
            # The weights may have been created on another device (e.g. before
            # the model was moved to the GPU) or in another float dtype.
            class_weight = self.alpha.to(device=inputs.device, dtype=inputs.dtype)
            weighted_ce = cross_entropy(
                inputs, targets, weight=class_weight, reduction='none'
            )

        focal_loss = ((1 - pt) ** self.gamma) * weighted_ce
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Shared loss instance, weighted per class with the balanced class weights.
# NOTE(review): the weights are placed on model.device as it is at this point
# in the script; confirm this matches the device the model is trained on.
focal_loss_fn = FocalLoss(alpha=class_weights.to(model.device), gamma=2)


def compute_loss_with_focal(model, inputs, return_outputs=False):
    """Run the model without its labels and score the logits with focal loss.

    Args:
        model: Callable model; invoked with every entry of ``inputs`` except
            ``"labels"``.
        inputs: Mapping of batch tensors, including a ``"labels"`` entry.
        return_outputs: When true, return ``(loss, outputs)`` instead of only
            the loss.

    Returns:
        The focal loss, optionally paired with the raw model outputs.
    """
    labels = inputs.get("labels")
    # Labels are withheld from the forward pass; the loss is computed here.
    forward_kwargs = {name: value for name, value in inputs.items() if name != "labels"}
    outputs = model(**forward_kwargs)
    loss = focal_loss_fn(outputs.get("logits"), labels)
    if return_outputs:
        return loss, outputs
    return loss


# NOTE(review): this only sets an attribute on the model object. Verify that
# the training loop actually consults `model.compute_loss`; transformers'
# Trainer computes the loss through its own `Trainer.compute_loss` method.
model.compute_loss = compute_loss_with_focal

# 12. Metrics
accuracy_metric, f1_metric = evaluate.load("accuracy"), evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn evaluation logits and labels into accuracy and weighted F1.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    weighted_f1 = f1_metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )
    return {"accuracy": accuracy["accuracy"], "f1": weighted_f1["f1"]}

# 13. Training args
training_args = TrainingArguments(
    # Output locations
    output_dir="./results_1000",
    logging_dir="./logs_1000",
    report_to="none",
    # Optimisation schedule
    num_train_epochs=8,  # longer training
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate after every epoch; no checkpoints are written and no best-model
    # reload happens at the end.
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
)

# 14. Trainer
class FocalLossTrainer(Trainer):
    """Trainer that optimises the script's focal loss.

    Trainer computes the loss through its own ``compute_loss`` method and does
    not look at a ``compute_loss`` attribute set on the model, so the focal
    loss only takes effect when this method is overridden.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Delegate to ``compute_loss_with_focal``.

        ``**kwargs`` absorbs extra arguments (such as ``num_items_in_batch``)
        that newer transformers releases pass to this hook.
        """
        return compute_loss_with_focal(model, inputs, return_outputs=return_outputs)


# The class weights were placed on the model's device before the Trainer moved
# the model; align them with the device training will actually run on so the
# loss does not mix devices.
if focal_loss_fn.alpha is not None:
    focal_loss_fn.alpha = focal_loss_fn.alpha.to(training_args.device)

trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 15. Train
trainer.train()

# 16. Evaluate
# Final pass over the evaluation split; every reported metric is printed with
# four decimal places.
eval_results = trainer.evaluate()
print("\n📊 Evaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

# 17. Save model
# Model first, then tokenizer, both written into the same directory.
model_dir = "fine-tuned-airline-model-1000"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)

# 18. ZIP export for Streamlit
zip_filename = "streamlit_model_package.zip"
package_dir = "streamlit_model_package"

# Rebuild the staging folder from scratch so files from an earlier run are
# not carried into the archive.
shutil.rmtree(package_dir, ignore_errors=True)
os.makedirs(package_dir, exist_ok=True)

# Stage the saved model directory and the pickled label encoder side by side.
shutil.copytree(src=model_dir, dst=f"{package_dir}/{model_dir}")
shutil.copy(src="label_encoder.pkl", dst=f"{package_dir}/label_encoder.pkl")

# Archive the staging folder's contents; make_archive appends ".zip" itself.
shutil.make_archive(
    base_name="streamlit_model_package", format="zip", root_dir=package_dir
)

# Hand the archive to the browser through the Colab helper.
files.download(zip_filename)