In [2]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw

In [4]:
import os
# NOTE(review): presumably a debugging aid so CUDA errors are reported at the call that
# caused them — confirm it is still wanted. It is set after `import torch` in the
# previous cell; verify it is picked up before the first CUDA call.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [5]:
def extract_number(label):
    """Parse the numeric prefix of a label string.

    Returns the leading run of digits as an ``int`` when it is immediately
    followed by an underscore at the start of ``label``; returns ``None``
    when ``label`` does not start with that pattern.
    """
    leading = re.match(r'(\d+)_', label)
    return int(leading.group(1)) if leading else None

In [8]:
# Load the training table and append `numeric_label`, the integer prefix parsed from `label`.
df = pd.read_parquet('/kaggle/input/train-parquet/train.parquet').assign(
    numeric_label=lambda frame: frame['label'].apply(extract_number)
)

In [9]:
# Hold out 20% of the rows as `test`; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified on numeric_label — confirm that is intended.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Cut `train` into four consecutive, non-overlapping slices at the 25% / 50% / 75% row positions.
num_rows = len(train)
first_quarter, second_quarter, third_quarter = (
    int(num_rows * fraction) for fraction in (0.25, 0.50, 0.75)
)

# train1 .. train4 are the first, second, third and last 25% of the rows, in order.
train1, train2, train3, train4 = (
    train.iloc[start:stop]
    for start, stop in zip(
        (None, first_quarter, second_quarter, third_quarter),
        (first_quarter, second_quarter, third_quarter, None),
    )
)

In [11]:
# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [12]:
# Word-substitution augmenter backed by bert-base-uncased; placed on the GPU when one is available.
aug = naw.ContextualWordEmbsAug(
    action='substitute',
    model_path='bert-base-uncased',
    device='cpu' if not torch.cuda.is_available() else 'cuda',
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
# Rows per class in split 1, the size of the largest class, and each class's shortfall against it.
class_counts1 = train1['numeric_label'].value_counts()
max_count1 = class_counts1.max()
augment_counts1 = class_counts1.rsub(max_count1)

In [15]:
augmented_rows1 = []

# Oversample every class of split 1 that is smaller than the largest class.
for label, deficit in augment_counts1.items():
    if deficit <= 0:
        continue
    # Draw `deficit` source rows from this class (with replacement); the fixed
    # random_state makes the draw repeatable across runs.
    sample_rows = train1[train1['numeric_label'] == label].sample(
        n=deficit, replace=True, random_state=42
    )
    for _, row in sample_rows.iterrows():
        augmented = aug.augment(row['quote'])
        # augment() can hand back a list of strings rather than a string; store the
        # text itself so the 'quote' column never holds a Python list.
        if isinstance(augmented, (list, tuple)):
            augmented_text = ' '.join(map(str, augmented))
        else:
            augmented_text = augmented
        # New row: augmented text, same label, metadata copied from the source row.
        augmented_rows1.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Collect the synthetic rows into a DataFrame.
augmented_df1 = pd.DataFrame(augmented_rows1, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])

In [22]:
# Normalise the augmented quotes to plain text, append them to split 1 and save the result.
augmented_df_copy1 = augmented_df1.copy()
# The stored value may be a list of strings (augmenter output) or already a string.
# Unwrap it directly instead of stringifying the list and stripping "['" / "']":
# repr() switches to double quotes when the text contains an apostrophe, so the old
# regex left the wrapper in place, and repr() also escapes backslashes/newlines.
augmented_df_copy1['quote'] = augmented_df_copy1['quote'].map(
    lambda quote: ' '.join(map(str, quote)) if isinstance(quote, (list, tuple)) else str(quote)
)
df_balanced1 = pd.concat([train1, augmented_df_copy1], ignore_index=True)
df_balanced1['numeric_label'] = df_balanced1['numeric_label'].astype(int)
df_balanced1.to_csv('train1.csv', index=False)

In [18]:
# Rows per class in split 2, the size of the largest class, and each class's shortfall against it.
class_counts2 = train2['numeric_label'].value_counts()
max_count2 = class_counts2.max()
augment_counts2 = class_counts2.rsub(max_count2)

In [20]:
augmented_rows2 = []

# Oversample every class of split 2 that is smaller than the largest class.
for label, deficit in augment_counts2.items():
    if deficit <= 0:
        continue
    # Draw `deficit` source rows from this class (with replacement); the fixed
    # random_state makes the draw repeatable across runs.
    sample_rows = train2[train2['numeric_label'] == label].sample(
        n=deficit, replace=True, random_state=42
    )
    for _, row in sample_rows.iterrows():
        augmented = aug.augment(row['quote'])
        # augment() can hand back a list of strings rather than a string; store the
        # text itself so the 'quote' column never holds a Python list.
        if isinstance(augmented, (list, tuple)):
            augmented_text = ' '.join(map(str, augmented))
        else:
            augmented_text = augmented
        # New row: augmented text, same label, metadata copied from the source row.
        augmented_rows2.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Collect the synthetic rows into a DataFrame.
augmented_df2 = pd.DataFrame(augmented_rows2, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])

In [23]:
# Normalise the augmented quotes to plain text, append them to split 2 and save the result.
augmented_df_copy2 = augmented_df2.copy()
# The stored value may be a list of strings (augmenter output) or already a string.
# Unwrap it directly instead of stringifying the list and stripping "['" / "']":
# repr() switches to double quotes when the text contains an apostrophe, so the old
# regex left the wrapper in place, and repr() also escapes backslashes/newlines.
augmented_df_copy2['quote'] = augmented_df_copy2['quote'].map(
    lambda quote: ' '.join(map(str, quote)) if isinstance(quote, (list, tuple)) else str(quote)
)
df_balanced2 = pd.concat([train2, augmented_df_copy2], ignore_index=True)
df_balanced2['numeric_label'] = df_balanced2['numeric_label'].astype(int)
df_balanced2.to_csv('train2.csv', index=False)

In [24]:
# Rows per class in split 3, the size of the largest class, and each class's shortfall against it.
class_counts3 = train3['numeric_label'].value_counts()
max_count3 = class_counts3.max()
augment_counts3 = class_counts3.rsub(max_count3)

In [25]:
augmented_rows3 = []

# Oversample every class of split 3 that is smaller than the largest class.
for label, deficit in augment_counts3.items():
    if deficit <= 0:
        continue
    # Draw `deficit` source rows from this class (with replacement); the fixed
    # random_state makes the draw repeatable across runs.
    sample_rows = train3[train3['numeric_label'] == label].sample(
        n=deficit, replace=True, random_state=42
    )
    for _, row in sample_rows.iterrows():
        augmented = aug.augment(row['quote'])
        # augment() can hand back a list of strings rather than a string; store the
        # text itself so the 'quote' column never holds a Python list.
        if isinstance(augmented, (list, tuple)):
            augmented_text = ' '.join(map(str, augmented))
        else:
            augmented_text = augmented
        # New row: augmented text, same label, metadata copied from the source row.
        augmented_rows3.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Collect the synthetic rows into a DataFrame.
augmented_df3 = pd.DataFrame(augmented_rows3, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])

In [26]:
# Normalise the augmented quotes to plain text, append them to split 3 and save the result.
augmented_df_copy3 = augmented_df3.copy()
# The stored value may be a list of strings (augmenter output) or already a string.
# Unwrap it directly instead of stringifying the list and stripping "['" / "']":
# repr() switches to double quotes when the text contains an apostrophe, so the old
# regex left the wrapper in place, and repr() also escapes backslashes/newlines.
augmented_df_copy3['quote'] = augmented_df_copy3['quote'].map(
    lambda quote: ' '.join(map(str, quote)) if isinstance(quote, (list, tuple)) else str(quote)
)
# Base rows must be train3: the augmented rows were generated from train3, and
# concatenating train2 here (as before) wrote split 2's rows into train3.csv.
df_balanced3 = pd.concat([train3, augmented_df_copy3], ignore_index=True)
df_balanced3['numeric_label'] = df_balanced3['numeric_label'].astype(int)
df_balanced3.to_csv('train3.csv', index=False)

In [27]:
# Rows per class in split 4, the size of the largest class, and each class's shortfall against it.
class_counts4 = train4['numeric_label'].value_counts()
max_count4 = class_counts4.max()
augment_counts4 = class_counts4.rsub(max_count4)

In [28]:
augmented_rows4 = []

# Oversample every class of split 4 that is smaller than the largest class.
for label, deficit in augment_counts4.items():
    if deficit <= 0:
        continue
    # Draw `deficit` source rows from this class (with replacement); the fixed
    # random_state makes the draw repeatable across runs.
    sample_rows = train4[train4['numeric_label'] == label].sample(
        n=deficit, replace=True, random_state=42
    )
    for _, row in sample_rows.iterrows():
        augmented = aug.augment(row['quote'])
        # augment() can hand back a list of strings rather than a string; store the
        # text itself so the 'quote' column never holds a Python list.
        if isinstance(augmented, (list, tuple)):
            augmented_text = ' '.join(map(str, augmented))
        else:
            augmented_text = augmented
        # New row: augmented text, same label, metadata copied from the source row.
        augmented_rows4.append([augmented_text, label, row['source'], row['url'], row['language'], row['subsource'], row['id']])

# Collect the synthetic rows into a DataFrame.
augmented_df4 = pd.DataFrame(augmented_rows4, columns=['quote', 'numeric_label', 'source', 'url', 'language', 'subsource', 'id'])

In [29]:
# Normalise the augmented quotes to plain text, append them to split 4 and save the result.
augmented_df_copy4 = augmented_df4.copy()
# The stored value may be a list of strings (augmenter output) or already a string.
# Unwrap it directly instead of stringifying the list and stripping "['" / "']":
# repr() switches to double quotes when the text contains an apostrophe, so the old
# regex left the wrapper in place, and repr() also escapes backslashes/newlines.
augmented_df_copy4['quote'] = augmented_df_copy4['quote'].map(
    lambda quote: ' '.join(map(str, quote)) if isinstance(quote, (list, tuple)) else str(quote)
)
df_balanced4 = pd.concat([train4, augmented_df_copy4], ignore_index=True)
df_balanced4['numeric_label'] = df_balanced4['numeric_label'].astype(int)
df_balanced4.to_csv('train4.csv', index=False)

In [30]:
# Save the held-out split (not augmented in the cells above) as test.csv, without the index column.
test.to_csv('test.csv', index=False)