In [17]:
!pip install codecarbon



In [18]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
import shutil
import zipfile
import copy
from torch.nn.utils import prune
import io
from codecarbon import EmissionsTracker

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [19]:
model_path = '/kaggle/input/baseline/pytorch/default/1/distilbert_trained.pth'

# Instantiate the DistilBERT classifier architecture with an 8-label head from the
# config; the fine-tuned weights themselves come from the checkpoint loaded below.
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
model = DistilBertForSequenceClassification(config)

# map_location='cpu': the model is still on the CPU at this point, and this also lets a
# checkpoint saved from a GPU load on a CPU-only kernel.
# weights_only=True: the file is consumed as a state dict, so restrict unpickling to
# tensors/containers instead of arbitrary Python objects.
model.load_state_dict(torch.load(model_path, map_location='cpu', weights_only=True))

# Move the loaded model to the selected device (the cell displays the model repr).
model.to(device)

  model.load_state_dict(torch.load(model_path))


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [20]:
def apply_pruning(model, pruning_params):
    """Apply L1 unstructured pruning, in place, to selected sub-modules of ``model``.

    Args:
        model: module whose ``named_modules()`` are searched for the names to prune.
        pruning_params: mapping from a qualified sub-module name (as yielded by
            ``model.named_modules()``) to the ``amount`` forwarded to
            ``prune.l1_unstructured`` for that module's ``weight``.

    Returns:
        None. ``model`` is modified in place.

    Warns:
        UserWarning: if an entry of ``pruning_params`` names no sub-module of
            ``model`` or a sub-module without a ``weight`` attribute. Such entries
            are skipped; without the warning a typo in a name would silently leave
            that layer unpruned.
    """
    import warnings

    pruned_names = set()
    for name, module in model.named_modules():
        if name in pruning_params and hasattr(module, 'weight'):
            prune.l1_unstructured(module, name='weight', amount=pruning_params[name])
            pruned_names.add(name)

    # Report requested entries that were not applied instead of ignoring them.
    skipped = [name for name in pruning_params if name not in pruned_names]
    if skipped:
        warnings.warn(
            "apply_pruning: no prunable module found for: " + ", ".join(map(str, skipped)),
            UserWarning,
            stacklevel=2,
        )

In [21]:
# Pruning fractions for the two FFN linears (lin1, lin2) of transformer layers 0-5,
# one line per layer.
# NOTE(review): the same literals are repeated in the `pruning_params` dict in the next
# cell, and these scalar names are not referenced by any other cell visible here.
lin1_layer0_pruning, lin2_layer0_pruning = 0.2017953262091607, 0.15729525440967862
lin1_layer1_pruning, lin2_layer1_pruning = 0.12969848850621088, 0.454859759610882
lin1_layer2_pruning, lin2_layer2_pruning = 0.2310579033256247, 0.1607487810536109
lin1_layer3_pruning, lin2_layer3_pruning = 0.22990726915583418, 0.29273042218157586
lin1_layer4_pruning, lin2_layer4_pruning = 0.44861219131635766, 0.23703770072386673
lin1_layer5_pruning, lin2_layer5_pruning = 0.45099619043007, 0.4751512722238028

In [22]:
# Maps the qualified name of each FFN linear (lin1, lin2) in transformer layers 0-5 to
# the pruning amount for that module. Each tuple below is one layer: (lin1, lin2).
pruning_params = {
    f'distilbert.transformer.layer.{layer_idx}.ffn.{sublayer}': amount
    for layer_idx, layer_amounts in enumerate([
        (0.2017953262091607, 0.15729525440967862),
        (0.12969848850621088, 0.454859759610882),
        (0.2310579033256247, 0.1607487810536109),
        (0.22990726915583418, 0.29273042218157586),
        (0.44861219131635766, 0.23703770072386673),
        (0.45099619043007, 0.4751512722238028),
    ])
    for sublayer, amount in zip(('lin1', 'lin2'), layer_amounts)
}

In [23]:
# apply_pruning mutates `model` in place, and pruning an already-pruned module prunes a
# further fraction of the weights that remain, so running this cell twice would make the
# model sparser than `pruning_params` specifies. Only prune when no pruning is attached
# yet; to start over, re-run the cell that builds and loads `model` first.
if not prune.is_pruned(model):
    apply_pruning(model, pruning_params)

In [24]:
def validate_model(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` while a CodeCarbon tracker is running.

    Args:
        model: model invoked as ``model(**batch)``; its output must expose ``.loss``
            and ``.logits``.
        val_loader: iterable of dict batches of tensors; every batch must contain a
            ``'labels'`` entry.
        device: device each batch tensor is moved to before the forward pass.

    Returns:
        A 5-tuple ``(average_val_loss, accuracy, all_predictions, all_true_labels,
        total_energy_used)``: the mean of the per-batch losses, the fraction of
        correctly classified samples, the per-sample predicted and true labels, and
        the ``energy_consumed`` value from the tracker's final emissions data
        (kWh according to CodeCarbon's documentation; 0 when unavailable).

    Raises:
        ZeroDivisionError: if ``val_loader`` yields no batches or no samples.
    """
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_true_labels = []

    # allow_multiple_runs=True lets this tracker start even when another CodeCarbon
    # run is already active. It is started right before the `try` so that the
    # `finally` below always stops it.
    tracker = EmissionsTracker(allow_multiple_runs=True)
    tracker.start()
    try:
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().numpy())
                all_true_labels.extend(batch['labels'].cpu().numpy())
                correct_val += (predictions == batch['labels']).sum().item()
                total_val += batch['labels'].size(0)
        average_val_loss = val_loss / len(val_loader)
        accuracy = correct_val / total_val

    finally:
        # stop() returns the emissions as a plain float (or None), which has no
        # `energy_consumed` attribute; the energy figure is on the emissions-data
        # object the tracker stores when it stops.
        tracker.stop()
        emissions_data = getattr(tracker, 'final_emissions_data', None)
        total_energy_used = getattr(emissions_data, 'energy_consumed', 0) or 0

    return average_val_loss, accuracy, all_predictions, all_true_labels, total_energy_used

In [25]:
# Read the parquet file and derive an integer class id from `label`: the text before the
# first "_" is cast to int and stored in a new `label_int` column.
df = (
    pd.read_parquet("/kaggle/input/test-parquet/test-00000-of-00001.parquet")
    .assign(label_int=lambda frame: frame['label'].str.split("_").str[0].astype('int'))
)

# Plain Python lists of the quote texts and their integer labels, in row order.
texts, labels = df["quote"].to_list(), df["label_int"].to_list()

In [26]:
# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# Passed to encode_data as `max_length`: every text is truncated/padded to this many tokens.
# NOTE(review): the origin of 365 is not shown in this notebook — presumably the length
# used when the checkpoint was trained; confirm.
MAX_LENGTH = 365

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to an indexable collection with one row
            per sample (tensor-backed or list-backed).
        labels: indexable collection of integer class ids, one per sample.

    Indexing returns a dict with one tensor per encoding field plus a ``'labels'``
    entry of dtype ``torch.long``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a new tensor that does not share storage with the source."""
        # torch.tensor(<Tensor>) emits a copy-construct UserWarning for every item when
        # the encodings are already tensors; clone().detach() is the recommended way to
        # copy an existing tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

def encode_data(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and wrap the result with ``labels`` in a ``QuotesDataset``.

    Args:
        tokenizer: callable invoked as ``tokenizer(texts, truncation=True,
            padding='max_length', max_length=max_length, return_tensors='pt')``.
        texts: texts to encode; a ``pd.Series`` is converted to a list first.
        labels: labels aligned with ``texts``; a ``pd.Series`` is converted to a list.
        max_length: length every sequence is truncated/padded to.

    Returns:
        The ``QuotesDataset``, or ``None`` if any exception was raised (the error is
        printed rather than propagated).
    """
    try:
        # Series inputs are handed on as plain lists.
        texts = texts.tolist() if isinstance(texts, pd.Series) else texts
        labels = labels.tolist() if isinstance(labels, pd.Series) else labels

        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        return QuotesDataset(tokenizer(texts, **tokenizer_options), labels)

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

In [27]:
val_dataset = encode_data(tokenizer, texts, labels, MAX_LENGTH)
# encode_data reports a tokenization failure by printing the error and returning None.
# Stop here in that case instead of handing None to the DataLoader, where it would only
# surface later as an unrelated-looking error.
if val_dataset is None:
    raise RuntimeError("encode_data returned None; see the tokenization error printed above")
# shuffle=False keeps batches in dataset order.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [28]:
# Evaluate `model`, which has had apply_pruning applied in an earlier cell, on val_loader.
# NOTE: the fifth value returned by validate_model is its `total_energy_used`; it is
# bound to the name `emissions` here.
val_loss, val_accuracy, all_predictions, all_true_labels, emissions = validate_model(model, val_loader, device)
print(val_accuracy)

[codecarbon INFO @ 17:27:52] [setup] RAM Tracking...
[codecarbon INFO @ 17:27:52] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 17:27:53] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 17:27:53] [setup] GPU Tracking...
[codecarbon INFO @ 17:27:53] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:27:53] >>> Tracker's metadata:
[codecarbon INFO @ 17:27:53]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 17:27:53]   Python version: 3.10.12
[codecarbon INFO @ 17:27:53]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 17:27:53]   Available RAM : 31.351 GB
[codecarbon INFO @ 17:27:53]   CPU count: 4
[codecarbon INFO @ 17:27:53]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 17:27:53]   GPU count: 2
[codecarbon INFO @ 17:27:53]   GPU model: 2 x Tesla T4
[codecarbon INFO @ 17:27:56] Saving emissions data to file

0.977850697292863


In [29]:
model_path = '/kaggle/input/baseline/pytorch/default/1/distilbert_trained.pth'

# Second, independent instance built from the same checkpoint; apply_pruning is never
# called on `model1` in the visible cells, so it serves as the unpruned reference.
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
model1 = DistilBertForSequenceClassification(config)

# map_location='cpu': the model is still on the CPU at this point, and this also lets a
# checkpoint saved from a GPU load on a CPU-only kernel.
# weights_only=True: the file is consumed as a state dict, so restrict unpickling to
# tensors/containers instead of arbitrary Python objects.
model1.load_state_dict(torch.load(model_path, map_location='cpu', weights_only=True))

# Move the loaded model to the selected device (the cell displays the model repr).
model1.to(device)

  model1.load_state_dict(torch.load(model_path))


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [30]:
# Evaluate `model1` (loaded from the same checkpoint, with no pruning applied in the
# visible cells) on the same val_loader for comparison.
# NOTE: this rebinds val_loss, val_accuracy, all_predictions, all_true_labels and
# emissions, replacing the values produced by the earlier evaluation of `model`.
val_loss, val_accuracy, all_predictions, all_true_labels, emissions = validate_model(model1, val_loader, device)
print(val_accuracy)

[codecarbon INFO @ 17:29:04] [setup] RAM Tracking...
[codecarbon INFO @ 17:29:04] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 17:29:05] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 17:29:05] [setup] GPU Tracking...
[codecarbon INFO @ 17:29:05] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:29:05] >>> Tracker's metadata:
[codecarbon INFO @ 17:29:05]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 17:29:05]   Python version: 3.10.12
[codecarbon INFO @ 17:29:05]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 17:29:05]   Available RAM : 31.351 GB
[codecarbon INFO @ 17:29:05]   CPU count: 4
[codecarbon INFO @ 17:29:05]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 17:29:05]   GPU count: 2
[codecarbon INFO @ 17:29:05]   GPU model: 2 x Tesla T4
[codecarbon INFO @ 17:29:08] Saving emissions data to file

0.9794913863822805
