In [38]:
# pip install nlpaug

In [21]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
def extract_number(label):
    match = re.match(r'(\d+)_', label)
    if match:
        return int(match.group(1))  
    return None

- source of data: https://huggingface.co/datasets/QuotaClimat/frugalaichallenge-text-train

In [4]:
df = pd.read_parquet('../input/train-parquet')
df['numeric_label'] = df['label'].apply(extract_number)
# print(df.head())

In [22]:
# print(df['label'].unique())

In [23]:
# print(df.head())

In [24]:
# filtered_df = df[df['numeric_label'] == 0]
# pd.set_option('display.max_colwidth', None)

# # Display the 'quote' column
# print(filtered_df['quote'])

In [26]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [39]:
num_rows = len(train)
first_quarter = int(num_rows * 0.25)

# Get the first 25% of the rows
train1 = train.iloc[:first_quarter]

In [27]:
# train_texts, test_texts, train_labels, test_labels = train_test_split(df['quote'], df['numeric_label'], test_size=0.2, random_state=42)

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


- Distilbert should be less energy consuming, it has less params 
- Lower case so less params 

**Augmentation**

In [41]:
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

In [42]:
class_counts = train1['numeric_label'].value_counts()
class_counts

numeric_label
0    293
4    168
5    168
6    160
1    159
2    136
3     77
7     57
Name: count, dtype: int64

In [43]:
max_count = class_counts.max()
max_count

293

In [44]:
augment_counts = max_count - class_counts
augment_counts

numeric_label
0      0
4    125
5    125
6    133
1    134
2    157
3    216
7    236
Name: count, dtype: int64

below will take a while to run

In [45]:
augmented_rows = []

# Perform augmentation for underrepresented classes
for label, deficit in augment_counts.items():
    if deficit > 0:
        # Sample from the existing rows of the class
        sample_rows = train1[train1['numeric_label'] == label].sample(n=deficit, replace=True)
        for _, row in sample_rows.iterrows():
            augmented_text = aug.augment(row['quote'])
            # Create a new row with the augmented text and same label
            augmented_rows.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Create a DataFrame from the augmented rows
augmented_df = pd.DataFrame(augmented_rows, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])
augmented_df.head() 

Unnamed: 0,quote,numeric_label,source,url,language,subsource,id
0,[suppose you assume the worst then there is absolutely no danger risking making any money trying to prevent whether things changed.],4,Desmog,https://www.desmog.com/john-redwood/,en,,
1,"[another size - zero government is already achieving profits in the coal, natural gas, and coke plants that americans rely around providing reliable, affordable ‘ from heat ’ electricity.]",4,Desmog,https://www.desmog.com/jd-vance/,en,,
2,"[pursuing this 2c target is very costly and thus guaranteed to be feasible. much easier, if, to target a maximum for, say, 3c rise, which will cost about $ 40 trillion but avoid most damages if we insist on 2c, we will pay an extra $ 60, 000 billion, but only prevent a stream of $ 100 billion losses that begins in 70 to 80 years. moreover, all of these estimates assume cost - effective budget policies, yet in real ones they must often become many times more expensive.]",4,Desmog,https://www.desmog.com/bjorn-lomborg/,en,,
3,"[one of the central building blocks of the ’ s power plant rule is increased use of wind and solar for electricity generation. but wind und solar are uncompetitive without massive government subsidies and mandated renewable portfolio standards. for most, this takes the form of the production tax credit. [ … congress should reject any attempt by [ senate democratic leader harry ] reid to revive the wind production tax credit in the lame - duck session. it ’ s clearly a bad line for government, enriching out - of - town billionaires at increased expense of working men.]",4,Desmog,https://www.desmog.com/american-energy-alliance-aea/,en,,
4,[they have a moral responsibility to prevent the immense poverty that is characteristic of the paris climate treaty. any measure could cost whole world over a thousand dollars a day and its effects of the climate are imperceptible.],4,Desmog,https://www.desmog.com/myron-ebell/,en,,


In [47]:
df_balanced1 = pd.concat([train1, augmented_df], ignore_index=True)
df_balanced1.tail() 

Unnamed: 0,quote,label,source,url,language,subsource,id,numeric_label
2339,"[instead of agreeing to allow entry for oil and tear gas on what it considers environmentally sensitive federal lands including coastal cities, congress should open them up. shutting away areas that hold a large part of the estimated 112 billion barrels of synthetic oil and 656 trillion cubic feet of coal gas in the united states is a luxury we can no longer afford, not when federal policy constrains energy production but contributes to poor energy dependence for the american consumer.]",,Desmog,https://www.desmog.com/michael-fox/,en,,,7
2340,"[shale gas from fracking could also ensure that we have a reliable mix at our electricity supply but panels don ’ t generate water when the sun comes ’ t shining, and wind farms don ’ t generate electricity when there ’ s no wind. … ] “ i really believe doing this creates an opportunity we could ’ t afford to miss – but safety will always be my most important thing and we aren ’ indeed taking any chances, ” leadsom wrote at the yorkshire post.]",,Desmog,https://www.desmog.com/andrea-leadsom/,en,,,7
2341,"[the only realistic point that is must recognize the [ sic ] industrial - origin oil and gas sources remain a major part of canada ’ on energy supply and that an energy sector is an integral part of the climate change planning process, not anymore of the search for an effective transition to a more developed economy,]",,Desmog,https://www.desmog.com/macdonald-laurier-institute/,en,,,7
2342,"[as alex epstein has explained : ‘ reducing fossil fuel costs by 95 % can starve the modern mechanized agricultural industry of the energy source to continue its work producing fresh, abundant food — the worst of which would remain massive human malnourishment and starvation. ’ “ alternatives to some oil and gas products are available, but only in very modest quantities, and in many cases only by oil and gas make market possible in the first place.]",,Desmog,https://www.desmog.com/secondstreet-org/,en,,,7
2343,"[it sure was easy not to concede the coal hasn ’ t been put on earth and other fossil fuels for much of his collective sacrifice,]",,Desmog,https://www.desmog.com/fred-palmer/,en,,,7


In [48]:
class_counts_b = df_balanced1['numeric_label'].value_counts()
class_counts_b

numeric_label
0    293
6    293
3    293
2    293
1    293
4    293
5    293
7    293
Name: count, dtype: int64

**Tokenize** 

In [None]:
# Initialize the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Function to tokenize data
def tokenize_data(texts, labels):
    encodings = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=367, return_tensors="pt")
    dataset = CustomTextDataset(encodings, labels.tolist())
    return dataset

# Custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Prepare datasets
train_dataset = tokenize_data(train_texts, train_labels)
test_dataset = tokenize_data(test_texts, test_labels)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
model.to(device)  # Move model to GPU if available
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
print(device)

In [None]:
# Training loop
model.train()

for epoch in range(4):  # Train for 4 epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

In [None]:
model.eval()
total_eval_accuracy = 0
total_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
    total_eval_accuracy += accuracy

avg_test_accuracy = total_eval_accuracy / len(test_loader)
avg_test_loss = total_eval_loss / len(test_loader)

print(f"Test Loss: {avg_test_loss}, Test Accuracy: {avg_test_accuracy}")

In [None]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions, true_labels = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions.extend(pred_labels.cpu().numpy())
    true_labels.extend(batch['labels'].cpu().numpy())

# Now predictions and true_labels are complete lists of all test data


In [None]:
f1 = f1_score(true_labels, predictions, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score: {f1}")

In [None]:
model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
for name, param in model1.named_parameters():
    if 'classifier' not in name:  # Freeze layers that are not part of the classifier
        param.requires_grad = False

model1.to(device)  # Move model to GPU if available
optimizer = AdamW(model1.parameters(), lr=5e-5)

In [None]:
model1.train()

for epoch in range(4):  # Train for 4 epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model1(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch for model1 {epoch + 1}, Loss for model1: {loss.item()}")

In [None]:
model1.eval()
total_eval_accuracy = 0
total_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total_eval_loss += loss.item()

    predictions1 = torch.argmax(logits, dim=-1)
    accuracy = (predictions1 == batch['labels']).cpu().numpy().mean() * 100
    total_eval_accuracy += accuracy

avg_test_accuracy = total_eval_accuracy / len(test_loader)
avg_test_loss = total_eval_loss / len(test_loader)

print(f"Test Loss for model1: {avg_test_loss}, Test Accuracy for model1: {avg_test_accuracy}")

In [None]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions1, true_labels = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions1.extend(pred_labels.cpu().numpy())
    true_labels.extend(batch['labels'].cpu().numpy())

# Now predictions and true_labels are complete lists of all test data


In [None]:
f1 = f1_score(true_labels, predictions1, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score for model1: {f1}")