In [None]:
# Added the following training strategies:

| **Strategy**                     | **Purpose**                                                                                      | **Impact on Model**                                                                                     |
|----------------------------------|--------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|
| **Gradient Clipping**            | Stabilizes training by capping gradients.                                                       | Prevents exploding gradients, ensuring smooth convergence.                                              |
| **Learning Rate Scheduler**      | Gradually reduces learning rate during training.                                                 | Enables fine-tuning and prevents overshooting.                                                         |
| **Early Stopping**               | Stops training when validation loss stops improving.                                             | Prevents overfitting and saves computational resources.                                                 |


In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
import time
import subprocess

# Seed PyTorch's RNG before the model is built so that anything drawn from it
# afterwards (newly initialised weights, DataLoader shuffling, dropout) is
# repeatable across Restart & Run All, as far as the backend allows.
SEED = 42
torch.manual_seed(SEED)

# Ensure output directory exists
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

# Load and prepare data
df = pd.read_parquet("/kaggle/input/climatetext/train.parquet")
# The text before the first underscore of `label` is parsed as the integer class id.
df['label_int'] = df['label'].str.split("_").str[0].astype('int')

# Create dictionary for label names (integer class id -> full label string)
label_dict = df[['label_int', 'label']].drop_duplicates().set_index('label_int')['label'].to_dict()

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values
            (a tensor or a nested list) with one row per example.
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its class id under ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            # Rows that are already tensors must not go through torch.tensor():
            # that emits a copy-construct UserWarning for every item. Copy them
            # with clone().detach() instead; plain lists are still converted.
            item[key] = row.clone().detach() if torch.is_tensor(row) else torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and wrap the result together with ``labels`` in a QuotesDataset.

    The tokenizer is called with truncation enabled, padding to ``max_length``
    and PyTorch tensors requested as the return type.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    return QuotesDataset(tokenizer(texts, **tokenizer_kwargs), labels)

# Hyperparameters
MAX_LENGTH = 365        # max_length handed to the tokenizer
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 64
LEARNING_RATE = 1e-5
STEP_SIZE = 2           # StepLR step_size
GAMMA = 0.1             # StepLR gamma
EPOCHS = 10
GRAD_CLIP = 1.0         # Gradient clipping value
PATIENCE = 3            # early stopping: Number of epochs to wait for improvement
MIN_DELTA = 0.01        # early stopping: Minimum loss improvement required

# Early-stopping state: best validation loss so far and the number of
# consecutive epochs without a sufficient improvement.
best_val_loss = float('inf')
no_improvement_count = 0

# Stratified 80/20 train/validation split of the quotes
texts = df["quote"].tolist()
labels = df["label_int"].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Tokenized datasets
train_dataset = encode_data(tokenizer, X_train, y_train, MAX_LENGTH)
val_dataset = encode_data(tokenizer, X_val, y_val, MAX_LENGTH)

# Batch loaders: only the training set is shuffled
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)

# Device placement, optimizer and learning-rate schedule
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# Plotting functions
def plot_accuracies(training_accuracies, validation_accuracies):
    """Save a line chart of training and validation accuracy.

    The figure is written to ``accuracy_plot.png`` inside the module-level
    ``output_dir`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    curves = (
        (training_accuracies, 'Training Accuracy'),
        (validation_accuracies, 'Validation Accuracy'),
    )
    for values, legend_name in curves:
        ax.plot(values, label=legend_name)
    ax.set_title('Training and Validation Accuracy')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    fig.savefig(f"{output_dir}/accuracy_plot.png")
    plt.close(fig)

def plot_confusion_matrix(cm, class_labels, epoch, output_dir):
    """Render ``cm`` as an annotated heatmap and save it as a PNG.

    Args:
        cm: Confusion matrix; cells are annotated with the integer ``'d'`` format.
        class_labels: Tick labels used on both axes.
        epoch: Zero-based epoch index; the title and file name use ``epoch + 1``.
        output_dir: Directory that receives ``confusion_matrix_epoch_<n>.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for Epoch {epoch + 1}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    target_file = os.path.join(output_dir, f'confusion_matrix_epoch_{epoch + 1}.png')
    fig.savefig(target_file)
    plt.close(fig)

# Function to track GPU power (requires NVIDIA GPU and nvidia-smi)
def get_gpu_power():
    """Return the mean ``power.draw`` reading (watts) reported by ``nvidia-smi``.

    One reading is expected per non-empty output line. Any failure (command
    error, unparsable value, no readings) is printed and reported as 0.
    """
    query = "nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits"
    try:
        raw_output = subprocess.check_output(query, shell=True).decode('utf-8')
        readings = []
        for line in raw_output.split('\n'):
            stripped = line.strip()
            if stripped:
                readings.append(float(stripped))
        # An empty list raises ZeroDivisionError, which is handled below.
        return sum(readings) / len(readings)  # Average power in watts
    except Exception as e:
        print(f"Error fetching GPU power usage: {e}")
        return 0

# Training and validation loop
#
# Each epoch: one pass over train_loader (with gradient clipping), one LR
# scheduler step, one timed pass over val_loader, then metrics, a
# confusion-matrix plot and finally the early-stopping check.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
epoch_records = []  # one metrics dict per epoch; becomes metrics_df after the loop

# Pin the class order once so the confusion matrix and the per-class scores
# always line up with the names in label_dict, even when a class is missing
# from an epoch's predictions.
class_ids = sorted(label_dict)
class_labels = [label_dict[class_id] for class_id in class_ids]

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        optimizer.step()
        train_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)

    scheduler.step()
    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(correct_train / total_train)

    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions, all_true_labels = [], []
    total_gpu_power = 0
    batch_count = 0

    # Time the validation pass so the energy estimate uses a measured duration.
    # The measurement includes the per-batch nvidia-smi polling.
    val_start = time.perf_counter()
    with torch.no_grad():
        for batch in val_loader:
            batch_gpu_power = get_gpu_power()  # Track GPU power for each batch
            total_gpu_power += batch_gpu_power
            batch_count += 1

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())

            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)
    duration_seconds = time.perf_counter() - val_start

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(correct_val / total_val)

    # Compute confusion matrix with a fixed class order
    cm = confusion_matrix(all_true_labels, all_predictions, labels=class_ids)
    plot_confusion_matrix(cm, class_labels, epoch, output_dir)

    # Energy used during validation: mean sampled power (W) x elapsed time (h) / 1000
    avg_gpu_power = total_gpu_power / batch_count if batch_count > 0 else 0
    duration_hours = duration_seconds / 3600.0
    energy_kwh = (avg_gpu_power * duration_hours) / 1000.0

    print(f"Validation Energy Consumption: {energy_kwh:.4f} kWh")
    print(f"Validation Average GPU Power: {avg_gpu_power:.2f} watts")

    # Calculate metrics
    balanced_acc = balanced_accuracy_score(all_true_labels, all_predictions)
    average_f1 = f1_score(all_true_labels, all_predictions, average='macro')
    weighted_f1 = f1_score(all_true_labels, all_predictions, average='weighted')
    # Per-class scores are requested in class_ids order so position i always
    # belongs to class_ids[i].
    f1_scores_per_class = f1_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)
    precision_per_class = precision_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)
    recall_per_class = recall_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)

    # Collect this epoch's metrics, keyed by class label for the per-class scores
    epoch_metrics = {
        "Epoch": epoch + 1,
        "Train Loss": train_losses[-1],
        "Validation Loss": val_losses[-1],
        "Train Accuracy": train_accuracies[-1],
        "Validation Accuracy": val_accuracies[-1],
        "Balanced Accuracy": balanced_acc,
        "Average F1": average_f1,
        "Weighted F1": weighted_f1,
        "Energy Consumption (kWh)": energy_kwh,
        "Average GPU Power (Watts)": avg_gpu_power
    }
    epoch_metrics.update({f"{label_dict[c]} F1": f1_scores_per_class[i] for i, c in enumerate(class_ids)})
    epoch_metrics.update({f"{label_dict[c]} Precision": precision_per_class[i] for i, c in enumerate(class_ids)})
    epoch_metrics.update({f"{label_dict[c]} Recall": recall_per_class[i] for i, c in enumerate(class_ids)})
    epoch_records.append(epoch_metrics)

    # Early stopping check. It runs after the metrics are recorded so the
    # stopping epoch still appears in metrics_df, and it compares the mean
    # validation loss (the value that is logged) so MIN_DELTA does not scale
    # with the number of validation batches.
    if (best_val_loss - val_losses[-1]) > MIN_DELTA:  # Significant improvement
        best_val_loss = val_losses[-1]
        no_improvement_count = 0
    else:  # No significant improvement
        no_improvement_count += 1
        if no_improvement_count >= PATIENCE:
            print(f"Early stopping at epoch {epoch + 1}")
            break

# Build the metrics table once instead of concatenating inside the loop
metrics_df = pd.DataFrame(epoch_records)

# Save metrics to CSV and plot accuracies
metrics_df.to_csv(f"{output_dir}/training_metrics.csv", index=False)
plot_accuracies(train_accuracies, val_accuracies)

# Calculate final model accuracy (validation predictions of the last epoch run)
final_accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Final Model Accuracy: {final_accuracy:.4f}")

# Save the trained model
model_path = f"{output_dir}/trained_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


In [None]:
import shutil
import zipfile

def zip_directory(folder_path, output_path):
    """Write every file under ``folder_path`` into a deflate-compressed ZIP at ``output_path``.

    Member names are taken relative to the parent of ``folder_path``, so each
    one keeps the folder's own name as its leading path component.
    """
    archive_root = os.path.join(folder_path, '..')
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                source_file = os.path.join(current_dir, filename)
                member_name = os.path.relpath(source_file, archive_root)
                archive.write(source_file, member_name)

# Folder to archive and the archive's destination
input_dir = '/kaggle/working/output'
zip_file_path = '/kaggle/working/output.zip'

# Build the archive, then report where it was written
zip_directory(input_dir, zip_file_path)
print(f"Created zip file at: {zip_file_path}")

## Run with the best hyperparameters

Trial 21 finished with value: 0.8999179655455292 and parameters: {'learning_rate': 0.0001932713713098696, 'num_trainable_layers': 1, 'dropout_rate': 0.4517023694187439, 'batch_size': 32, 'step_size': 8, 'gamma': 0.7710877288358753, 'epochs': 5}. Best is trial 21 with value: 0.8999179655455292.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
import time
import subprocess

# Training params
MAX_LENGTH = 365
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
LEARNING_RATE = 0.0001932713713098696
STEP_SIZE = 8
GAMMA = 0.7710877288358753
EPOCHS = 5
#GRAD_CLIP = 1.0  # Gradient clipping value
#PATIENCE = 3  # early stopping: Number of epochs to wait for improvement
#MIN_DELTA = 0.01 # early stopping: Minimum loss improvement required
DROPOUT_RATE = 0.4517023694187439
NUM_TRAINABLE_LAYERS = 1
SEED = 42

# Seed PyTorch's RNG before the model is built so that anything drawn from it
# afterwards (newly initialised weights, DataLoader shuffling, dropout) is
# repeatable across Restart & Run All, as far as the backend allows.
torch.manual_seed(SEED)

# Ensure output directory exists
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

# Load and prepare data
df = pd.read_parquet("/kaggle/input/climatetext/train.parquet")
# The text before the first underscore of `label` is parsed as the integer class id.
df['label_int'] = df['label'].str.split("_").str[0].astype('int')

# Create dictionary for label names (integer class id -> full label string)
label_dict = df[['label_int', 'label']].drop_duplicates().set_index('label_int')['label'].to_dict()

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Initialize model configuration with custom dropout rate
config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                          num_labels=8,
                                          dropout=DROPOUT_RATE,
                                          attention_dropout=DROPOUT_RATE)

# Load the model with the updated configuration
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# Freeze the whole DistilBERT encoder first ...
for param in model.distilbert.parameters():
    param.requires_grad = False

# ... then unfreeze its last NUM_TRAINABLE_LAYERS transformer blocks. The block
# count is read from the model instead of being hard-coded, and the start index
# is clamped at 0 so an oversized NUM_TRAINABLE_LAYERS unfreezes every block
# rather than wrapping around to negative indices.
transformer_layers = model.distilbert.transformer.layer
first_trainable = max(len(transformer_layers) - NUM_TRAINABLE_LAYERS, 0)
for layer in transformer_layers[first_trainable:]:
    for param in layer.parameters():
        param.requires_grad = True

# Ensure classification head is trainable
for param in model.classifier.parameters():
    param.requires_grad = True


#model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values
            (a tensor or a nested list) with one row per example.
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its class id under ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            # Rows that are already tensors must not go through torch.tensor():
            # that emits a copy-construct UserWarning for every item. Copy them
            # with clone().detach() instead; plain lists are still converted.
            item[key] = row.clone().detach() if torch.is_tensor(row) else torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and wrap the result together with ``labels`` in a QuotesDataset.

    The tokenizer is called with truncation enabled, padding to ``max_length``
    and PyTorch tensors requested as the return type.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    return QuotesDataset(tokenizer(texts, **tokenizer_kwargs), labels)




# Initialize early stopping parameters
#best_val_loss = float('inf')  # Track the best validation loss
#no_improvement_count = 0  # Counter for epochs with no improvement

# Stratified 80/20 train/validation split of the quotes
texts = df["quote"].tolist()
labels = df["label_int"].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Tokenized datasets
train_dataset = encode_data(tokenizer, X_train, y_train, MAX_LENGTH)
val_dataset = encode_data(tokenizer, X_val, y_val, MAX_LENGTH)

# Batch loaders: only the training set is shuffled
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)

# Device placement, optimizer and learning-rate schedule
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# Plotting functions
def plot_accuracies(training_accuracies, validation_accuracies):
    """Save a line chart of training and validation accuracy.

    The figure is written to ``accuracy_plot.png`` inside the module-level
    ``output_dir`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    curves = (
        (training_accuracies, 'Training Accuracy'),
        (validation_accuracies, 'Validation Accuracy'),
    )
    for values, legend_name in curves:
        ax.plot(values, label=legend_name)
    ax.set_title('Training and Validation Accuracy')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    fig.savefig(f"{output_dir}/accuracy_plot.png")
    plt.close(fig)

def plot_confusion_matrix(cm, class_labels, epoch, output_dir):
    """Render ``cm`` as an annotated heatmap and save it as a PNG.

    Args:
        cm: Confusion matrix; cells are annotated with the integer ``'d'`` format.
        class_labels: Tick labels used on both axes.
        epoch: Zero-based epoch index; the title and file name use ``epoch + 1``.
        output_dir: Directory that receives ``confusion_matrix_epoch_<n>.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for Epoch {epoch + 1}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    target_file = os.path.join(output_dir, f'confusion_matrix_epoch_{epoch + 1}.png')
    fig.savefig(target_file)
    plt.close(fig)

# Function to track GPU power (requires NVIDIA GPU and nvidia-smi)
#def get_gpu_power():
#    try:
#        result = subprocess.check_output(
#           "nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits",
#            shell=True
#        )
#        power_values = [float(p.strip()) for p in result.decode('utf-8').split('\n') if p.strip()]
#        return sum(power_values) / len(power_values)  # Average power in watts
#    except Exception as e:
#        print(f"Error fetching GPU power usage: {e}")
#       return 0

# Training and validation loop
#
# Each epoch: one pass over train_loader, one LR scheduler step, one pass over
# val_loader, then metrics and a confusion-matrix plot. Gradient clipping,
# early stopping and GPU power tracking are not used in this run.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
epoch_records = []  # one metrics dict per epoch; becomes metrics_df after the loop

# Pin the class order once so the confusion matrix and the per-class scores
# always line up with the names in label_dict, even when a class is missing
# from an epoch's predictions.
class_ids = sorted(label_dict)
class_labels = [label_dict[class_id] for class_id in class_ids]

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        train_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)

    scheduler.step()
    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(correct_train / total_train)

    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions, all_true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())

            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(correct_val / total_val)

    # Compute confusion matrix with a fixed class order
    cm = confusion_matrix(all_true_labels, all_predictions, labels=class_ids)
    plot_confusion_matrix(cm, class_labels, epoch, output_dir)

    # Calculate metrics
    balanced_acc = balanced_accuracy_score(all_true_labels, all_predictions)
    average_f1 = f1_score(all_true_labels, all_predictions, average='macro')
    weighted_f1 = f1_score(all_true_labels, all_predictions, average='weighted')
    # Per-class scores are requested in class_ids order so position i always
    # belongs to class_ids[i].
    f1_scores_per_class = f1_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)
    precision_per_class = precision_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)
    recall_per_class = recall_score(all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0)

    # Collect this epoch's metrics, keyed by class label for the per-class scores
    epoch_metrics = {
        "Epoch": epoch + 1,
        "Train Loss": train_losses[-1],
        "Validation Loss": val_losses[-1],
        "Train Accuracy": train_accuracies[-1],
        "Validation Accuracy": val_accuracies[-1],
        "Balanced Accuracy": balanced_acc,
        "Average F1": average_f1,
        "Weighted F1": weighted_f1,
    }
    epoch_metrics.update({f"{label_dict[c]} F1": f1_scores_per_class[i] for i, c in enumerate(class_ids)})
    epoch_metrics.update({f"{label_dict[c]} Precision": precision_per_class[i] for i, c in enumerate(class_ids)})
    epoch_metrics.update({f"{label_dict[c]} Recall": recall_per_class[i] for i, c in enumerate(class_ids)})
    epoch_records.append(epoch_metrics)

# Build the metrics table once instead of concatenating inside the loop
metrics_df = pd.DataFrame(epoch_records)

# Save metrics to CSV and plot accuracies
metrics_df.to_csv(f"{output_dir}/training_metrics.csv", index=False)
plot_accuracies(train_accuracies, val_accuracies)

# Calculate final model accuracy (validation predictions of the last epoch)
final_accuracy = accuracy_score(all_true_labels, all_predictions)
print(f"Final Model Accuracy: {final_accuracy:.4f}")

# Save the trained model
model_path = f"{output_dir}/trained_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")
