In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler

# All artifacts (plots, metrics CSV) are written below this directory.
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

# Read the training table and derive an integer class id from the text label:
# the label string is split on "_" and its first piece is cast to int.
df = pd.read_parquet("/kaggle/input/climatetext/train.parquet")
df['label_int'] = (
    df['label']
    .str.split("_")
    .str[0]
    .astype('int')
)

# Map each integer class id back to its original label string.
label_dict = (
    df[['label_int', 'label']]
    .drop_duplicates()
    .set_index('label_int')['label']
    .to_dict()
)

# Pretrained DistilBERT tokenizer and sequence-classification model.
# NOTE(review): num_labels is hard-coded to 8; presumably this matches the
# number of distinct label_int values in the data -- confirm.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=8,
)

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample. Each entry may already be a tensor or may be
            plain Python data.
        labels: Indexable collection of class ids, one per sample.

    Each item is a dict with one tensor per encoding field plus a ``'labels'``
    tensor of dtype ``torch.long``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value, dtype=None):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning,
        so tensors are copied with ``detach().clone()`` and only non-tensor
        data goes through ``torch.tensor``.
        """
        if torch.is_tensor(value):
            copied = value.detach().clone()
            return copied if dtype is None else copied.to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and wrap the result with ``labels`` in a QuotesDataset.

    The tokenizer is called with truncation enabled, padding set to
    ``'max_length'``, the given ``max_length`` and ``return_tensors='pt'``.

    Args:
        tokenizer: Callable tokenizer accepting the keyword arguments above.
        texts: Texts to tokenize.
        labels: Labels handed to ``QuotesDataset`` unchanged.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        A ``QuotesDataset`` built from the tokenizer output and ``labels``.
    """
    tokenizer_kwargs = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return QuotesDataset(tokenizer(texts, **tokenizer_kwargs), labels)

# Hyperparameters
MAX_LENGTH = 365         # tokenizer max_length (sequences are truncated/padded to it)
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 64
LEARNING_RATE = 1e-5
STEP_SIZE = 2            # StepLR step_size
GAMMA = 0.1              # StepLR gamma
EPOCHS = 10

# Hold out 20% of the rows for validation, stratified on the integer label
# and seeded so the split is reproducible.
texts = df["quote"].to_list()
labels = df["label_int"].to_list()

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Tokenize both partitions up front.
train_dataset = encode_data(tokenizer, X_train, y_train, MAX_LENGTH)
val_dataset = encode_data(tokenizer, X_test, y_test, MAX_LENGTH)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,
)

# Use the GPU when one is available, then set up the optimizer and the
# step-wise learning-rate schedule on the model's parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)

# Plotting function for accuracy
def plot_accuracies(training_accuracies, validation_accuracies, save_dir=None):
    """Plot per-epoch training and validation accuracy and save it as a PNG.

    Args:
        training_accuracies: Sequence with one training accuracy per epoch.
        validation_accuracies: Sequence with one validation accuracy per epoch.
        save_dir: Directory for ``accuracy_plot.png``. When ``None`` (the
            default), the module-level ``output_dir`` is used, so existing
            two-argument calls behave as before.
    """
    target_dir = output_dir if save_dir is None else save_dir

    # Number epochs from 1 so the axis agrees with the 1-based epoch numbers
    # used in the metrics table and in the confusion-matrix file names.
    train_epochs = range(1, len(training_accuracies) + 1)
    val_epochs = range(1, len(validation_accuracies) + 1)

    plt.figure(figsize=(8, 6))
    try:
        plt.plot(train_epochs, training_accuracies, label='Training Accuracy')
        plt.plot(val_epochs, validation_accuracies, label='Validation Accuracy')
        plt.title('Training and Validation Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(target_dir, "accuracy_plot.png"))
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close()

def plot_confusion_matrix(cm, class_labels, epoch, output_dir):
    """Draw ``cm`` as an annotated heatmap and save it as a per-epoch PNG.

    Args:
        cm: Confusion matrix to render; cells are annotated with integer
            formatting (``fmt='d'``).
        class_labels: Tick labels used on both axes.
        epoch: Zero-based epoch index; the title and file name use ``epoch + 1``.
        output_dir: Directory receiving ``confusion_matrix_epoch_<n>.png``.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )

    epoch_number = epoch + 1
    plt.title(f'Confusion Matrix for Epoch {epoch_number}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    target_path = os.path.join(output_dir, f'confusion_matrix_epoch_{epoch_number}.png')
    plt.savefig(target_path)
    plt.close()

# Training and validation loop
#
# Each epoch makes one optimisation pass over train_loader, steps the LR
# scheduler once, evaluates on val_loader, saves a confusion-matrix plot and
# records one row of metrics. After the last epoch the metrics table is
# written to CSV and the accuracy curves are plotted.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
metric_rows = []  # one dict per epoch; turned into a DataFrame once at the end

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)

    scheduler.step()
    # Loss is the mean of the per-batch losses; accuracy is per sample.
    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(correct_train / total_train)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions, all_true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().tolist())
            all_true_labels.extend(batch['labels'].cpu().tolist())

            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(correct_val / total_val)

    # Fix the class order explicitly and pass it to every per-class
    # computation. Otherwise scikit-learn infers the classes from the labels
    # that happen to occur, and positional indexing into label_dict would
    # attach scores to the wrong class whenever one is absent or the ids are
    # not contiguous.
    class_ids = sorted(set(label_dict).union(all_true_labels, all_predictions))
    class_labels = [label_dict.get(i, f'Class {i}') for i in class_ids]

    # Compute confusion matrix
    cm = confusion_matrix(all_true_labels, all_predictions, labels=class_ids)
    plot_confusion_matrix(cm, class_labels, epoch, output_dir)

    # Calculate metrics (zero_division=0 everywhere, matching the
    # precision/recall calls, so undefined scores are reported as 0 without
    # warnings).
    balanced_acc = balanced_accuracy_score(all_true_labels, all_predictions)
    average_f1 = f1_score(all_true_labels, all_predictions, average='macro', zero_division=0)
    weighted_f1 = f1_score(all_true_labels, all_predictions, average='weighted', zero_division=0)
    f1_scores_per_class = f1_score(
        all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )
    precision_per_class = precision_score(
        all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )

    # Collect this epoch's metrics, with per-class columns named by label
    epoch_metrics = {
        "Epoch": epoch + 1,
        "Train Loss": train_losses[-1],
        "Validation Loss": val_losses[-1],
        "Train Accuracy": train_accuracies[-1],
        "Validation Accuracy": val_accuracies[-1],
        "Balanced Accuracy": balanced_acc,
        "Average F1": average_f1,
        "Weighted F1": weighted_f1
    }
    epoch_metrics.update({f"{name} F1": score for name, score in zip(class_labels, f1_scores_per_class)})
    epoch_metrics.update({f"{name} Precision": score for name, score in zip(class_labels, precision_per_class)})
    epoch_metrics.update({f"{name} Recall": score for name, score in zip(class_labels, recall_per_class)})

    metric_rows.append(epoch_metrics)

# Build the table once instead of re-concatenating a DataFrame every epoch.
metrics_df = pd.DataFrame(metric_rows)

# Save metrics to CSV and plot accuracies
metrics_df.to_csv(f"{output_dir}/training_metrics.csv", index=False)
plot_accuracies(train_accuracies, val_accuracies)

# Print overall model accuracy (validation accuracy of the final epoch).
# Compare against the labels gathered alongside the predictions so the two
# sequences are aligned by construction.
overall_accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Overall Model Accuracy: {overall_accuracy:.4f}")

In [None]:
import shutil
import zipfile

def zip_directory(folder_path, output_path):
    """Zip every file under ``folder_path`` into a new archive at ``output_path``.

    Entry names are relative to the parent of ``folder_path``, so the folder's
    own name is the top-level directory inside the archive. Files are added in
    sorted order, so the entry order does not depend on the filesystem's
    listing order. Directories that contain no files produce no entries.

    Args:
        folder_path: Directory whose files are archived, recursively.
        output_path: Path of the ZIP file to create; an existing file is
            overwritten. It may lie inside ``folder_path``: the archive never
            adds itself.
    """
    # Loop-invariant: the directory that archive names are made relative to.
    base_dir = os.path.dirname(os.path.abspath(folder_path))
    archive_real = os.path.realpath(output_path)

    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # The archive already exists on disk while we walk; skip it so
                # a half-written copy of itself is not embedded.
                if os.path.realpath(file_path) == archive_real:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, base_dir))

# Folder whose contents go into the archive
input_dir = '/kaggle/working/output'

# Where the resulting archive is written
zip_file_path = '/kaggle/working/output.zip'

# Build the archive, then report its location
zip_directory(folder_path=input_dir, output_path=zip_file_path)
print(f"Created zip file at: {zip_file_path}")