An already-executed version of this notebook can be viewed on Kaggle [here](https://www.kaggle.com/code/hazrulakmal/augmentation).

In [1]:
!pip install numpy requests nlpaug

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import nlpaug.augmenter.word as naw
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Presumably read by the HuggingFace tokenizers library to switch off its
# internal parallelism -- TODO confirm this is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
class config:
    """Shared settings and hyper-parameters for the notebook.

    Only ``RANDOM_STATE`` is read by the code visible in this notebook (inside
    ``split_data``). The remaining values are presumably consumed by the
    accompanying training code -- TODO confirm.
    """

    MAX_LEN = 80
    LOWER_CASE = True
    RANDOM_STATE = 42  # seed passed to train_test_split for reproducible splits
    TEST_SIZE = 0.2
    VALIDATION_SIZE = 0.2
    NUM_LABELS = 3
    BATCH_SIZE = 100
    LEARNING_RATE = 5e-5
    EPOCHS = 10
    # The original, misspelled name is kept so existing references keep working.
    WEIGTH_DECAY = 0.01
    # Correctly spelled alias; prefer this name in new code.
    WEIGHT_DECAY = WEIGTH_DECAY
    EPSILON = 1e-8
    DEVICE = "cuda"

In [4]:
def find_max_length(dataset):
    """Return the largest whitespace-separated word count among the texts.

    Args:
        dataset: Iterable of strings.

    Returns:
        The number of words (as produced by ``str.split()``) in the longest
        text, or 0 when ``dataset`` is empty.
    """
    # Each text is split exactly once; ``default`` avoids the ValueError that
    # ``max`` raises on an empty iterable.
    return max((len(text.split()) for text in dataset), default=0)

def split_data(dataframe, ylabel, test_size=0.2):
    """Split ``dataframe`` into shuffled, stratified train and test parts.

    Args:
        dataframe: DataFrame to split.
        ylabel: Name of the column whose values are used for stratification.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A ``(training_df, test_df)`` tuple.
    """
    # The seed comes from config so repeated runs produce the same split.
    split_options = {
        "test_size": test_size,
        "random_state": config.RANDOM_STATE,
        "shuffle": True,
        "stratify": dataframe[ylabel],
    }
    train_part, test_part = train_test_split(dataframe, **split_options)
    return train_part, test_part

In [5]:
# Load the phrase bank CSV; it is expected to contain a "labels" column.
financial_news = pd.read_csv("/kaggle/input/financial-phrasebank/phrasebank.csv")

# Replace the "labels" column with integer codes.
le = LabelEncoder()
financial_news["labels"] = le.fit_transform(financial_news["labels"])

# Map every original class to its integer code. All classes are transformed
# in a single call and converted with .tolist(), which yields plain Python
# ints without calling int() on a one-element array (deprecated since
# NumPy 1.25).
class_to_label = dict(zip(le.classes_, le.transform(le.classes_).tolist()))

# Bare expression so the notebook displays the mapping.
class_to_label

In [6]:
# Stratify on the encoded "labels" column; test_size is left at split_data's default of 0.2.
train, test = split_data(financial_news, "labels")

In [7]:
# Display the (rows, columns) shape of the training partition.
train.shape

In [8]:
# Display the (rows, columns) shape of the full dataset before splitting.
financial_news.shape

In [37]:
# Sample sentence reused by the demo cells that follow.
text = "The net sales of the Power Plants business were EUR 710.3 million in 2005"

# Back-translation augmenter built from a pair of facebook/wmt19 models
# (en->de and de->en, going by the model names). No device is passed here.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en')

# Bare expression so the notebook displays the augmented output.
back_translation_aug.augment(text)

In [42]:
# Contextual-embedding augmenter configured with the "substitute" action,
# backed by the distilbert-base-uncased model.
sub_aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="substitute")

# Request 4 augmented variants of the sample sentence using 8 threads; the
# result is displayed by the notebook.
sub_aug.augment(text, n=4, num_thread=8)

In [43]:
# Contextual-embedding augmenter configured with the "insert" action,
# backed by the distilbert-base-uncased model.
in_aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="insert")

# Request 4 augmented variants of the sample sentence using 8 threads; the
# result is displayed by the notebook.
in_aug.augment(text, n=4, num_thread=8)

In [None]:
# Re-create the three augmenters for the full augmentation run, this time
# with an explicit device. The device is read from config.DEVICE ("cuda")
# rather than repeated as a literal, so it is configured in a single place.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en',
    device=config.DEVICE,
)

sub_aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action="substitute",
    device=config.DEVICE,
)

in_aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action="insert",
    device=config.DEVICE,
)

def augmenting_word(word):
    """Return augmented variants of ``word`` produced by all three augmenters.

    Args:
        word: Text to augment.

    Returns:
        A flat list holding the back-translation output first, then the
        substitution outputs (``n=4`` requested), then the insertion outputs
        (``n=4`` requested).
    """

    def as_list(result):
        # Depending on the nlpaug release, ``augment`` may return a bare
        # string for a single result instead of a list. Wrapping it keeps the
        # concatenation below list + list.
        return [result] if isinstance(result, str) else list(result)

    translation = as_list(back_translation_aug.augment(word))
    substitution = as_list(sub_aug.augment(word, n=4))
    insertion = as_list(in_aug.augment(word, n=4))
    return translation + substitution + insertion

# Augment every training row, pairing each generated sentence with the label
# of the row it was derived from.
new_sentences = []
labels = []

# Columns are addressed by position: the first is passed to augmenting_word,
# the second is copied as the label.
for i, (sentence, label) in enumerate(
    zip(train.iloc[:, 0].to_numpy(), train.iloc[:, 1].to_numpy())
):
    augmented_sentences = augmenting_word(sentence)
    new_sentences.extend(augmented_sentences)
    labels.extend([label] * len(augmented_sentences))
    # Progress report every 200 rows.
    if i % 200 == 0:
        print(f"number of titles completed {i}")

new_sentences_df = pd.DataFrame({"titles": new_sentences, "labels": labels})

In [None]:
# Print the row count before and after dropping rows whose "titles" text
# repeats an earlier row (the first occurrence is kept).
print(len(new_sentences_df))
new_sentences_df = new_sentences_df.drop_duplicates(subset=["titles"], keep="first")
print(len(new_sentences_df))

# Write the result to the working directory; with the default settings the
# DataFrame index is written as the first, unnamed CSV column.
new_sentences_df.to_csv("augmentation.csv")