<a href="https://colab.research.google.com/github/joycerlz/bigfive-text-classification/blob/main/clean_ocean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Convert to multiclass

In [None]:
import re
import pandas as pd

In [None]:
# Load the raw OCEAN dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='OCEAN-synthetic.csv')

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Text,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,text,personality_trait,labels
0,I love exploring new cultures through cuisine ...,4.7,3.1,3.5,3.9,2.1,i love exploring new cultures through cuisine ...,Openness,2
1,My workspace is always organized; I can't focu...,2.9,4.8,2.1,3.2,2.4,my workspace is always organized i cant focus ...,Conscientiousness,3
2,Large social gatherings make me feel energized...,3.1,2.9,4.6,3.5,1.7,large social gatherings make me feel energized...,Extraversion,1
3,I often worry about things not going as planned.,3.0,3.9,2.0,3.4,4.5,i often worry about things not going as planned,Neuroticism,4
4,Having a daily routine is comforting and helps...,2.3,4.6,1.7,3.7,2.2,having a daily routine is comforting and helps...,Conscientiousness,3


In [None]:
# Spot-check one raw sample before cleaning. This reads the original 'Text'
# column: the cleaned 'text' column is only assigned by a later cell, so
# indexing df['text'] here raises KeyError on a fresh Restart & Run All
# unless the CSV already happens to contain that column.
print(df.iloc[798]['Text'])

why do they call it beauty sleep when you wake up looking like a troll


In [None]:
def clean_text(text):
    """Normalize a text sample to lowercase ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, accented or non-Latin characters) is deleted outright, so
    "can't" becomes "cant". Runs of whitespace are then collapsed to a single
    space and leading/trailing whitespace is stripped.

    Args:
        text: The raw text. Missing values (None or a float NaN, which is
            what pandas yields for an empty CSV cell) are treated as empty
            text; any other non-string value is converted with str() first.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Drop everything except ASCII letters and whitespace, then lowercase.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', letters_only).strip()

In [None]:
# Derive the cleaned 'text' column from the raw 'Text' column, one sample at a time.
df['text'] = df['Text'].map(clean_text)

In [None]:
# Tag each row with the name of the trait column holding its highest score.
df['personality_trait'] = df[
    ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
].idxmax(axis='columns')

In [None]:
# Integer class id assigned to each dominant-trait name.
label_map = dict(
    Agreeableness=0,
    Extraversion=1,
    Openness=2,
    Conscientiousness=3,
    Neuroticism=4,
)

In [None]:
# Translate each dominant-trait name into its integer class id via label_map.
df['labels'] = df.loc[:, 'personality_trait'].map(label_map)

In [None]:
# Show how many samples fall into each class.
df.loc[:, 'labels'].value_counts()

labels
2    505
3    195
1    155
0    153
4    152
Name: count, dtype: int64

In [None]:
# Keep only the cleaned text and its class id, then preview the result.
new_df = df.loc[:, ['text', 'labels']]
new_df.head(n=5)

Unnamed: 0,text,labels
0,i love exploring new cultures through cuisine ...,2
1,my workspace is always organized i cant focus ...,3
2,large social gatherings make me feel energized...,1
3,i often worry about things not going as planned,4
4,having a daily routine is comforting and helps...,3


In [None]:
# Write the text/label pairs to disk without the index column.
new_df.to_csv(path_or_buf='ocean_multiclass.csv', index=False)

## Data augmentation

In [None]:
!pip install numpy requests nlpaug

In [None]:
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm

In [None]:
# Reload the text/label pairs from disk; note this rebinds `df` to that file's contents.
df = pd.read_csv(filepath_or_buffer='ocean_multiclass.csv')

In [None]:
def augment_sentence(sentence, aug, num_threads):
    """Return one augmented variant of ``sentence`` produced by ``aug``.

    Args:
        sentence: The text to augment.
        aug: An object exposing ``augment(data, num_thread=...)``, e.g. an
            nlpaug augmenter.
        num_threads: Forwarded to ``aug.augment`` as ``num_thread``.

    Returns:
        The first augmented candidate. If the augmenter yields nothing
        (None or an empty result), the original ``sentence`` is returned
        unchanged instead of raising IndexError. If the augmenter returns a
        plain string rather than a list, that string is returned whole rather
        than just its first character.
    """
    result = aug.augment(sentence, num_thread=num_threads)
    if not result:
        # Nothing was generated; fall back to the untouched input.
        return sentence
    if isinstance(result, str):
        # NOTE(review): some nlpaug releases appear to return a bare string
        # for string input -- confirm against the installed version.
        return result
    return result[0]

In [None]:
def augment_text(df, aug, num_threads, num_times, aug_label):
    """
    Appends augmented copies of one class's rows to a DataFrame and shuffles it.

    Every row whose 'labels' value equals aug_label has its 'text' augmented
    num_times times; each augmented text becomes a new row with that label.

    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'text' containing strings of text to augment.
                                - 'labels' target variable containing 0-4.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer representing the number of times to augment text.
        - aug_label:     The 'labels' value selecting which rows are augmented;
                         it is also the label given to every new row.
    Output:
        - A new DataFrame (df itself is not modified) holding the original
          rows followed by the augmented rows, re-indexed from 0 and then
          shuffled with a fixed seed (random_state=42).
    Raises:
        - ValueError:    If df is not a pandas DataFrame.
    """

    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input df must be a pandas DataFrame.")

    # Get the texts of the rows to augment
    texts_to_augment = df.loc[df['labels'] == aug_label, 'text']

    # Nothing to add: return the shuffled, re-indexed rows directly. Building
    # and concatenating an empty object-dtype frame may change the dtype of
    # the 'labels' column.
    if num_times <= 0 or texts_to_augment.empty:
        return df.reset_index(drop=True).sample(frac=1, random_state=42)

    # Collect augmented texts, one full pass over the class per round
    augmented_texts = []
    for _ in tqdm(range(num_times)):
        augmented_texts.extend(
            augment_sentence(text, aug, num_threads) for text in texts_to_augment
        )

    # Build DataFrame containing augmented data
    aug_df = pd.DataFrame({
        'text': augmented_texts,
        # Plain repetition keeps the label's own value; an int8 buffer cannot
        # represent labels outside [-128, 127].
        'labels': [aug_label] * len(augmented_texts),
    })

    return pd.concat([df, aug_df], ignore_index=True).sample(frac=1, random_state=42)

In [None]:
# Define nlpaug augmentation object
# Contextual word-substitution augmenter backed by DistilBERT.
# NOTE(review): per nlpaug's docs aug_p is the fraction of words to change and
# aug_min/aug_max bound that count -- confirm against the installed version.
aug10p = nlpaw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action="substitute",
    aug_min=1,
    aug_max=3,
    aug_p=0.1,
)

# (label, rounds) pairs in processing order: label 2 gets a single round,
# every other label gets six.
label_to_aug = [0, 1, 3, 4]
augmentation_plan = [(2, 1)] + [(label, 6) for label in label_to_aug]

balanced_df = df.copy()
for label, rounds in augmentation_plan:
    balanced_df = augment_text(
        balanced_df,
        aug10p,
        num_threads=8,
        num_times=rounds,
        aug_label=label,
    )

100%|██████████| 1/1 [01:09<00:00, 69.19s/it]
100%|██████████| 6/6 [01:33<00:00, 15.60s/it]
100%|██████████| 6/6 [01:53<00:00, 18.98s/it]
100%|██████████| 6/6 [01:55<00:00, 19.17s/it]
100%|██████████| 6/6 [02:06<00:00, 21.12s/it]


In [None]:
# Class distribution of the data as loaded, before augmentation.
df.loc[:, 'labels'].value_counts()

labels
2    505
3    195
1    155
0    153
4    152
Name: count, dtype: int64

In [None]:
# Class distribution after the augmented rows were appended.
balanced_df.loc[:, 'labels'].value_counts()

labels
3    1365
1    1085
0    1071
4    1064
2    1010
Name: count, dtype: int64

In [None]:
# Preview the first five rows of the shuffled, augmented frame.
balanced_df.head(n=5)

Unnamed: 0,text,labels
3856,an excellent article that merit recognition,3
5965,so this article was time well —,0
6007,being an outlaw is just whispering what the de...,4
7572,when plans change unexpectedly i often feel wo...,4
1488,compassion meets your boundaries,0


In [None]:
# Write the augmented dataset to disk without the index column.
balanced_df.to_csv(path_or_buf='ocean_longer.csv', index=False)