In [1]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw
from sklearn.model_selection import KFold
import os

In [3]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously,
# so a device-side error is reported at the call that caused it.
# NOTE(review): this slows GPU work and presumably only takes effect if set before
# CUDA is initialised in this process — confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [5]:
def extract_number(label):
    """
    Parse the integer prefix of a label shaped like "<digits>_<rest>".

    Args:
    - label (str): Raw label text.

    Returns:
    - int | None: The leading digits as an int, or None when the label does
      not start with one or more digits followed by an underscore.
    """
    leading_digits = re.match(r'(\d+)_', label)
    return int(leading_digits.group(1)) if leading_digits else None

In [6]:
# Load the raw training table, then derive an integer class id from the textual
# 'label' column (labels that extract_number cannot parse come back as None).
df = pd.read_parquet('/kaggle/input/train-parquet/train.parquet')
df = df.assign(numeric_label=df['label'].apply(extract_number))

In [8]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# Only the second ("test") index array of each split is kept: the aim is simply to
# cut df into n_splits chunks, not to hold anything out as actual testing data.
subsets = [df.iloc[chunk_index] for _, chunk_index in kf.split(df)]

In [11]:
# Give each of the five chunks its own name for the per-chunk cells below.
df1, df2, df3, df4, df5 = (subsets[position] for position in range(5))

In [24]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [18]:
# Contextual word-substitution augmenter backed by bert-base-uncased; it is placed
# on the GPU when torch can see one.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
    device=aug_device,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [25]:
def _unwrap_augmented_text(result, fallback):
    """Turn whatever augmenter.augment() returned into one plain string."""
    # Some augmenters return a list of variants rather than a bare string.
    # Take the first variant; if the list is empty keep the original text.
    if isinstance(result, (list, tuple)):
        result = result[0] if len(result) > 0 else fallback
    return str(result)


def balance_dataset(train_df, label_column, text_column, augmenter, random_state=None):
    """
    Augments underrepresented classes in a dataset to balance class distribution.

    Every class is topped up to the size of the largest class: rows of the class
    are sampled with replacement, and each sampled row is copied with its text
    replaced by an augmented variant. All other columns are carried over as-is.
    The input DataFrame is not modified.

    Args:
    - train_df (pd.DataFrame): DataFrame containing the data.
    - label_column (str): Name of the column containing class labels.
    - text_column (str): Name of the column containing text to augment.
    - augmenter: Text augmentation object with an 'augment' method. The method
      may return either a string or a list/tuple of strings (first one is used).
    - random_state (optional): Forwarded to DataFrame.sample so the choice of
      rows to augment can be reproduced. It does not seed the augmenter itself.

    Returns:
    - pd.DataFrame: The original rows followed by the augmented rows, with a
      fresh 0..n-1 index and label_column cast to int.

    Raises:
    - ValueError: If label_column contains missing values. Such rows cannot be
      cast to int at the end, so fail before any (slow) augmentation is done.
    """
    if train_df[label_column].isna().any():
        raise ValueError(
            f"Column '{label_column}' contains missing labels; "
            "drop or fix those rows before balancing."
        )

    # Count each class's occurrences
    class_counts = train_df[label_column].value_counts()
    max_count = class_counts.max()

    augmented_frames = []

    # Perform augmentation for underrepresented classes
    for label, count in class_counts.items():
        deficit = int(max_count - count)
        if deficit <= 0:
            continue

        # Sample from the existing rows of the class
        sample_rows = train_df[train_df[label_column] == label].sample(
            n=deficit, replace=True, random_state=random_state
        )

        # Copy the sampled rows wholesale (keeps every other column and its dtype)
        # and overwrite only the text with its augmented version.
        augmented = sample_rows.copy()
        augmented[text_column] = [
            _unwrap_augmented_text(augmenter.augment(text), text)
            for text in sample_rows[text_column]
        ]
        augmented_frames.append(augmented)

    # Concat with the original DataFrame; with nothing to add this just resets the index.
    df_balanced = pd.concat([train_df, *augmented_frames], ignore_index=True)
    df_balanced[label_column] = df_balanced[label_column].astype(int)

    return df_balanced

In [23]:
# Balance chunk 1, then save both the untouched chunk and its balanced version.
df_balanced1 = balance_dataset(
    train_df=df1,
    label_column='numeric_label',
    text_column='quote',
    augmenter=aug,
)
for frame, csv_name in ((df1, 'df1.csv'), (df_balanced1, 'df_balanced1.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Balance chunk 2, then save both the untouched chunk and its balanced version.
df_balanced2 = balance_dataset(
    train_df=df2,
    label_column='numeric_label',
    text_column='quote',
    augmenter=aug,
)
for frame, csv_name in ((df2, 'df2.csv'), (df_balanced2, 'df_balanced2.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Balance chunk 3, then save both the untouched chunk and its balanced version.
df_balanced3 = balance_dataset(
    train_df=df3,
    label_column='numeric_label',
    text_column='quote',
    augmenter=aug,
)
for frame, csv_name in ((df3, 'df3.csv'), (df_balanced3, 'df_balanced3.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Balance chunk 4, then save both the untouched chunk and its balanced version.
df_balanced4 = balance_dataset(
    train_df=df4,
    label_column='numeric_label',
    text_column='quote',
    augmenter=aug,
)
for frame, csv_name in ((df4, 'df4.csv'), (df_balanced4, 'df_balanced4.csv')):
    frame.to_csv(csv_name, index=False)

In [None]:
# Balance chunk 5, then save both the untouched chunk and its balanced version.
df_balanced5 = balance_dataset(
    train_df=df5,
    label_column='numeric_label',
    text_column='quote',
    augmenter=aug,
)
for frame, csv_name in ((df5, 'df5.csv'), (df_balanced5, 'df_balanced5.csv')):
    frame.to_csv(csv_name, index=False)