In [1]:
# Importing necessary libraries
!pip install nlpaug
!pip install ktrain
!pip install tensorflow
!pip install transformers

import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
import nlpaug.augmenter.word as naw

# Mount Google Drive (only applicable in Google Colab)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# Load the dataset
# The data directory is derived from the mount point so that reading does not
# depend on the notebook's current working directory.
path = DRIVE_MOUNT_POINT + '/My Drive/Colab Notebooks/public-data/text_data'
df = pd.read_csv(path + '/public_data.csv')

# Fail early with a clear message if the columns used below are absent.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"public_data.csv is missing required column(s): {sorted(missing_columns)}")

# Clean text function
def clean_text(text):
    """Return a normalised copy of ``text``.

    The string is lowercased and then three kinds of content are deleted, in
    this order: spans enclosed in square brackets, every character listed in
    ``string.punctuation``, and any word that contains a digit.

    Deleted content is replaced by the empty string, so surrounding whitespace
    is left as-is (runs of spaces are not collapsed).
    """
    removal_patterns = (
        r'\[.*?\]',                                # [bracketed] spans
        r'[%s]' % re.escape(string.punctuation),   # punctuation characters
        r'\w*\d\w*',                               # words containing a digit
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Data preprocessing
# Label values that are numeric but must be discarded. The non-numeric
# placeholder '|' needs no entry here: it is coerced to NaN by to_numeric below
# and removed together with the other unparsable labels.
INVALID_LABELS = [11, 22, 32, 21]

# Drop rows without text *before* the string cast: astype(str) would otherwise
# turn a missing value into the literal string 'nan' and keep it as a sample.
df = df.dropna(subset=['text'])

# Each step reassigns df instead of mutating in place, so no step writes into
# a frame that may be a slice of an earlier one.
df = df.assign(
    text=df['text'].astype(str),
    label=pd.to_numeric(df['label'], errors='coerce'),
)

# Filter on the numeric value so the invalid labels are removed whether the
# CSV column was parsed as strings, integers, or a mixture of both.
df = df[df['label'].notna() & ~df['label'].isin(INVALID_LABELS)]
df = df.assign(label=df['label'].astype('int32'))

df = df.drop_duplicates(subset='text', keep='first')
df = df.assign(text=df['text'].apply(clean_text))

# Augment text function
def augment_text(df, augmenter, repetitions=1):
    """Build a DataFrame of augmented copies of the rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'text' column and a 'label' column.
    augmenter : object
        Anything exposing ``augment(text)``. The result may be a single string
        or a list/tuple of strings; in the latter case the first element is
        used, and an empty result falls back to the unmodified input text.
    repetitions : int, default 1
        How many augmented copies to create per input row. Values <= 0 yield
        an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' and 'label', with ``len(df) * repetitions`` rows. Each
        'text' cell is the augmented value itself, never a list wrapping it.
    """
    records = []
    # zip over the two columns avoids building a Series per row as iterrows does.
    for text, label in zip(df['text'], df['label']):
        for _ in range(repetitions):
            augmented = augmenter.augment(text)
            # Unwrap list/tuple results; storing the container would put a
            # list in the 'text' column and serialise as "['...']" in CSV.
            if isinstance(augmented, (list, tuple)):
                augmented = augmented[0] if augmented else text
            records.append((augmented, label))
    return pd.DataFrame(records, columns=['text', 'label'])

# Synonym Augmenter
augmenter = naw.SynonymAug(aug_src='wordnet')

# Upsampling using augmentation
# Every class is brought up to exactly the size of the largest class.
# A whole-number repetition count (max_size // class_size - 1) would leave any
# class larger than half of max_size untouched, so the exact shortfall is
# sampled and augmented instead.
if df.empty:
    raise ValueError("No rows left after preprocessing; nothing to balance.")

max_size = df['label'].value_counts().max()
balanced_parts = []
for label in df['label'].unique():
    df_class = df[df['label'] == label]
    balanced_parts.append(df_class)

    deficit = max_size - len(df_class)
    if deficit > 0:
        # Sample source rows for augmentation; reuse rows (replace=True) only
        # when the class has fewer rows than the shortfall.
        source_rows = df_class.sample(
            n=deficit,
            replace=deficit > len(df_class),
            random_state=123,
        )
        balanced_parts.append(augment_text(source_rows, augmenter, repetitions=1))

# Concatenate once instead of growing a DataFrame inside the loop.
df_balanced = pd.concat(balanced_parts)

# Shuffle dataset
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

# Splitting the data
# 20% is held out, then halved: 80% train / 10% validation / 10% test.
# NOTE(review): df_balanced already contains augmented copies that were created
# before this split, so an original row and its augmented variants can land in
# different splits. Validation/test scores may be optimistic as a result -
# consider splitting first and augmenting only the training portion.
X_train, X_temp, y_train, y_temp = train_test_split(
    df_balanced['text'], df_balanced['label'], test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Creating DataFrame for train, validation, and test
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
val_df = pd.DataFrame({'text': X_val, 'label': y_val})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# The three splits must partition df_balanced exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df_balanced)

# Saving to CSV
# Files are written next to the input data; `path` is the dataset directory
# defined where the CSV is loaded, so the location is configured in one place.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(f'{path}/{split_name}_data.csv', index=False)


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.5/410.5 kB 3.8 MB/s eta 0:00:00
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Collecting ktrain
  Downloading ktrain-0.41.3.tar.gz (25.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25.3/25.3 MB 35.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 13.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... done
Collecting

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
