In [1]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw

In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
def extract_number(label):
    match = re.match(r'(\d+)_', label)
    if match:
        return int(match.group(1))  
    return None

- source of data: https://huggingface.co/datasets/QuotaClimat/frugalaichallenge-text-train

In [5]:
df = pd.read_parquet('/kaggle/input/train.parquet')
df['numeric_label'] = df['label'].apply(extract_number)
# print(df.head())

In [22]:
# print(df['label'].unique())

In [23]:
# print(df.head())

In [24]:
# filtered_df = df[df['numeric_label'] == 0]
# pd.set_option('display.max_colwidth', None)

# # Display the 'quote' column
# print(filtered_df['quote'])

In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
num_rows = len(train)
first_quarter = int(num_rows * 0.25)

# Get the first 25% of the rows
train1 = train.iloc[:first_quarter]

In [27]:
# train_texts, test_texts, train_labels, test_labels = train_test_split(df['quote'], df['numeric_label'], test_size=0.2, random_state=42)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


- Distilbert should be less energy consuming, it has less params 
- Lower case so less params 

**Augmentation**

In [9]:
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
class_counts = train1['numeric_label'].value_counts()
# class_counts

In [11]:
max_count = class_counts.max()
augment_counts = max_count - class_counts
# augment_counts

below will take a while to run

In [12]:
augmented_rows = []

# Perform augmentation for underrepresented classes
for label, deficit in augment_counts.items():
    if deficit > 0:
        # Sample from the existing rows of the class
        sample_rows = train1[train1['numeric_label'] == label].sample(n=deficit, replace=True)
        for _, row in sample_rows.iterrows():
            augmented_text = aug.augment(row['quote'])
            # Create a new row with the augmented text and same label
            augmented_rows.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Create a DataFrame from the augmented rows
augmented_df = pd.DataFrame(augmented_rows, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])
augmented_df.head() 

Unnamed: 0,quote,numeric_label,source,url,language,subsource,id
0,"[the clean air agenda, by series of decisions ...",4,Desmog,https://www.desmog.com/oren-cass/,en,,
1,[america is much more regulated than the uk an...,4,Desmog,https://www.desmog.com/jim-ratcliffe/,en,,
2,[one to his central building blocks of the ’ w...,4,Desmog,https://www.desmog.com/american-energy-allianc...,en,,
3,[whilst my business case for government addres...,4,Desmog,https://www.desmog.com/andrea-leadsom/,en,,
4,[the counter - intuitive truth is that to succ...,4,Desmog,https://www.desmog.com/tim-worstall/,en,,


Above take long to run, create a copy incase mess up and need rerun 

In [13]:
augmented_df_copy = augmented_df.copy()

In [14]:
augmented_df_copy['quote'] = augmented_df_copy['quote'].astype(str)
augmented_df_copy['quote'] = augmented_df_copy['quote'].str.replace(r"^\['|'\]$", '', regex=True)

augmented_df_copy

Unnamed: 0,quote,numeric_label,source,url,language,subsource,id
0,"the clean air agenda, by series of decisions m...",4,Desmog,https://www.desmog.com/oren-cass/,en,,
1,america is much more regulated than the uk and...,4,Desmog,https://www.desmog.com/jim-ratcliffe/,en,,
2,one to his central building blocks of the ’ wi...,4,Desmog,https://www.desmog.com/american-energy-allianc...,en,,
3,whilst my business case for government address...,4,Desmog,https://www.desmog.com/andrea-leadsom/,en,,
4,the counter - intuitive truth is that to succe...,4,Desmog,https://www.desmog.com/tim-worstall/,en,,
...,...,...,...,...,...,...,...
1121,energy is the lifeblood our society. it doesn ...,7,Desmog,https://www.desmog.com/derrick-hollie/,en,,
1122,"then if many get ’ better set policy, your sta...",7,Desmog,https://www.desmog.com/bruce-everett/,en,,
1123,it is no necessity to export domestic water an...,7,Desmog,https://www.desmog.com/peter-lilley/,en,,
1124,so the supply should be increased by hook or c...,7,Desmog,https://www.desmog.com/vincent-devito/,en,,


In [15]:
df_balanced1 = pd.concat([train1, augmented_df_copy], ignore_index=True)
df_balanced1['numeric_label'] = df_balanced1['numeric_label'].astype(int)

# df_balanced1.head() 

Check if matches so does not cause trouble tokenize 

In [16]:
# train1.dtypes

In [17]:
# df_balanced1.dtypes

check for class balance 

In [18]:
print(df_balanced1['numeric_label'].value_counts()) 

numeric_label
0    293
6    293
3    293
2    293
1    293
4    293
5    293
7    293
Name: count, dtype: int64


**split data**

In [19]:
train1_texts = train1['quote']
train1_labels = train1['numeric_label']
df_balanced1_texts = df_balanced1['quote']
df_balanced1_labels = df_balanced1['numeric_label']
test_texts = test['quote']
test_labels = test['numeric_label']

**Tokenize** 

In [20]:
# Initialize the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Function to tokenize data
def tokenize_data(texts, labels):
    try:
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.tolist()

        encodings = tokenizer(
            texts, 
            padding=True, 
            truncation=True, 
            max_length=367, 
            return_tensors="pt"
        )

        dataset = CustomTextDataset(encodings, labels)
        return dataset

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None
# Custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [21]:
train1_dataset = tokenize_data(train1_texts, train1_labels)
train1_balanced_dataset = tokenize_data(df_balanced1_texts, df_balanced1_labels)
test_dataset = tokenize_data(test_texts, test_labels)

In [None]:
# # Prepare datasets
# train_dataset = tokenize_data(train_texts, train_labels)
# test_dataset = tokenize_data(test_texts, test_labels)

In [22]:
train1_loader = DataLoader(train1_dataset, batch_size=32, shuffle=True)
train1_balanced_loader = DataLoader(train1_balanced_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [23]:
model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
model1_balanced = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)

model1.to(device)
model1_balanced.to(device)# Move model to GPU if available
optimizer1 = AdamW(model1.parameters(), lr=5e-5)
optimizer1_balanced = AdamW(model1_balanced.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
print(device)

cuda


In [95]:
# Training loop
model1.train()

for epoch in range(4):  # Train for 4 epochs
    for batch in train1_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model1(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer1.step()
        optimizer1.zero_grad()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 2.0737757682800293
Epoch 2, Loss: 1.3507673740386963
Epoch 3, Loss: 0.3720700740814209
Epoch 4, Loss: 0.2362169325351715


In [96]:
# # Training loop
# model1_balanced.train()

# for epoch in range(4):  # Train for 4 epochs
#     for batch in train1_balanced_loader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model1_balanced(**batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer1_balanced.step()
#         optimizer1_balanced.zero_grad()
#     print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 1.054932713508606
Epoch 2, Loss: 0.6145038604736328
Epoch 3, Loss: 0.17743824422359467
Epoch 4, Loss: 0.11474393308162689


In [24]:
model1_balanced.train()  # Set the model to training mode

for epoch in range(4):  # Train for 4 epochs
    total_loss = 0
    total_correct = 0
    total_examples = 0

    for batch in train1_balanced_loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to device
        outputs = model1_balanced(**batch)  # Forward pass
        loss = outputs.loss
        loss.backward()  # Backpropagation
        optimizer1_balanced.step()  # Update parameters
        optimizer1_balanced.zero_grad()  # Clear gradients

        # Calculate the loss
        total_loss += loss.item()

        # Calculate accuracy
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_examples += predictions.size(0)

    # Calculate average loss and accuracy for the epoch
    avg_loss = total_loss / len(train1_balanced_loader)
    avg_accuracy = 100 * total_correct / total_examples

    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.2f}, Accuracy: {avg_accuracy:.2f}%")

Epoch 1, Loss: 1.55, Accuracy: 44.58%
Epoch 2, Loss: 0.70, Accuracy: 78.16%
Epoch 3, Loss: 0.27, Accuracy: 93.39%
Epoch 4, Loss: 0.12, Accuracy: 97.14%


In [98]:
model1.eval()
total1_eval_accuracy = 0
total1_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total1_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
    total1_eval_accuracy += accuracy

avg1_test_accuracy = total1_eval_accuracy / len(test_loader)
avg1_test_loss = total1_eval_loss / len(test_loader)

print(f"For unballanced: Test Loss: {avg1_test_loss}, Test Accuracy: {avg1_test_accuracy}")

For unballanced: Test Loss: 1.2131333030187166, Test Accuracy: 61.51175213675214


In [25]:
model1_balanced.eval()
total1_balanced_eval_accuracy = 0
total1_balanced_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1_balanced(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total1_balanced_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
    total1_balanced_eval_accuracy += accuracy

avg1_balanced_test_accuracy = total1_balanced_eval_accuracy / len(test_loader)
avg1_balanced_test_loss = total1_balanced_eval_loss / len(test_loader)

print(f"For ballanced: Test Loss: {avg1_balanced_test_loss}, Test Accuracy: {avg1_balanced_test_accuracy}")

For ballanced: Test Loss: 1.3438816529053907, Test Accuracy: 62.28632478632478


In [100]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions1, true_labels1 = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions1.extend(pred_labels.cpu().numpy())
    true_labels1.extend(batch['labels'].cpu().numpy())

f1 = f1_score(true_labels1, predictions1, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"For unbalanced: F1 Score: {f1}")

For unbalanced: F1 Score: 0.6235038470799567


In [26]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions1_balanced, true_labels1_balanced = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1_balanced(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions1_balanced.extend(pred_labels.cpu().numpy())
    true_labels1_balanced.extend(batch['labels'].cpu().numpy())

f1 = f1_score(true_labels1_balanced, predictions1_balanced, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"For balanced: F1 Score: {f1}")

For balanced: F1 Score: 0.6222021720416662
