<a href="https://colab.research.google.com/github/joycerlz/distilbert-text-classification/blob/main/clean_bigfive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Combine the two data files

Combine big_five_scores.csv and trait_scoring_keys.csv

Discard the unnecessary columns and encode the labels as integers 0-4



*   0 : agreeable
*   1 : extraversion
*   2 : openness
*   3 : conscientiousness
*   4 : neuroticism

Resulting file: big_five_combined.csv



In [None]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

In [None]:
# Load the two input CSVs; the first row of each file is treated as the header.
df1, df2 = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ('big_five_scores.csv', 'trait_scoring_keys.csv')
)

In [None]:
# Columns holding the five trait scores. For each row, record the name of the
# column with the highest score; on ties idxmax returns the first column in
# the order listed here.
SCORE_COLUMNS = [
    'agreeable_score',
    'extraversion_score',
    'openness_score',
    'conscientiousness_score',
    'neuroticism_score',
]
df1['max_personality'] = df1[SCORE_COLUMNS].idxmax(axis=1)

In [None]:
# Integer code for each score-column name; any other value is coded as 4.
_TRAIT_CODES = {
    'agreeable_score': 0,
    'extraversion_score': 1,
    'openness_score': 2,
    'conscientiousness_score': 3,
}


def quantify_personality(row):
    """Map ``row['max_personality']`` to its integer label.

    Returns 0-3 for the agreeable / extraversion / openness /
    conscientiousness score columns and 4 for anything else.
    """
    return _TRAIT_CODES.get(row['max_personality'], 4)

In [None]:
# Encode each row's top-scoring trait as an integer label, and expose case_id
# under the name 'id' so it can be used as the join key.
df1['personality'] = df1.apply(quantify_personality, axis='columns')
df1['id'] = df1['case_id']

# NOTE(review): this inner-joins df1's case_id against df2's id. The sample
# output later in this notebook labels "Am always prepared." as 4, so confirm
# that case_id and the scoring-key id really refer to the same thing.
df_total = df1.merge(df2, on='id')

# Keep only the two columns that get written out.
df = df_total[['item', 'personality']].copy()

In [None]:
# Persist the (item, personality) table without the index column.
df.to_csv(path_or_buf='big_five_combined.csv', index=False)

## Manual oversampling & data augmentation

This section oversamples the Big Five dataset by duplicating rows, then creates additional rows through data augmentation: randomly swapping words or replacing words with their synonyms.

Resulting file: augmented_df.csv

In [None]:
# Reload the combined table written by the previous section.
df = pd.read_csv(filepath_or_buffer='big_five_combined.csv', header=0)

Duplicating rows

In [None]:
# Hand-picked duplication factor per label: each listed label ends up with
# `factor` copies of its rows. Labels not listed (here: 2) are left as they are.
DUPLICATION_FACTORS = {0: 2, 1: 3, 3: 2, 4: 4}


def duplicate_label_rows(frame, label, factor, label_col='personality'):
    """Return a new frame in which the rows of ``label`` appear ``factor`` times.

    The repeated rows for ``label`` come first, followed by all remaining rows
    in their existing order; the index is renumbered from 0.
    """
    is_label = frame[label_col] == label
    return pd.concat(
        [frame[is_label]] * factor + [frame[~is_label]],
        ignore_index=True,
    )


# Apply the factors one label at a time, in the order listed above.
df_over = df
for label, factor in DUPLICATION_FACTORS.items():
    df_over = duplicate_label_rows(df_over, label, factor)

In [None]:
# Switch to the column names used from here on ('text' / 'labels').
# rename() returns a new frame and ignores labels that are not present, so
# this cell can be re-run safely.
df_over = df_over.rename(columns={'item': 'text', 'personality': 'labels'})
df_over.head()

Unnamed: 0,text,labels
0,Am easily intimidated.,4
1,Get irritated easily.,4
2,Am always prepared.,4
3,Feel sympathy for those who are worse off than...,4
4,Get upset easily.,4


Data augmentation

In [None]:
# Fetch the WordNet corpus that nltk.corpus.wordnet reads; the synonym
# replacement further down looks words up in it.
nltk.download('wordnet')

In [None]:
# Function to perform synonym replacement
def synonym_replacement(sentence):
    """Replace each word with the first lemma of its first WordNet synset.

    The sentence is split on whitespace. A word is kept unchanged when WordNet
    has no synset for it or when the first lemma is the word itself. The result
    is the words joined with single spaces.

    Args:
        sentence: Text to augment.

    Returns:
        The augmented sentence as a string.
    """
    replaced = []
    for word in sentence.split():
        synsets = wordnet.synsets(word)
        if synsets:
            lemma = synsets[0].lemmas()[0].name()
            if lemma != word:
                # WordNet writes multi-word lemmas with underscores
                # (e.g. "ice_cream"); turn them back into normal text.
                word = lemma.replace('_', ' ')
        replaced.append(word)
    return ' '.join(replaced)

# Function to perform random swap
def random_swap(sentence, n=1):
    """Swap two randomly chosen word positions in ``sentence``, ``n`` times.

    The sentence is split on whitespace and re-joined with single spaces.
    Sentences with fewer than two words have nothing to swap and are returned
    (whitespace-normalised) without touching the random generator.

    Args:
        sentence: Text to augment.
        n: Number of swaps to perform.

    Returns:
        The sentence with its words reordered.
    """
    words = sentence.split()
    if len(words) < 2:
        # random.sample(range(len(words)), 2) would raise ValueError here.
        return ' '.join(words)
    for _ in range(n):
        first, second = random.sample(range(len(words)), 2)
        words[first], words[second] = words[second], words[first]
    return ' '.join(words)

# Oversample both text and labels
def oversample_data(df, augmentation_factor=1):
    """Append augmented copies of every row of ``df``.

    For each row, ``augmentation_factor`` new rows are produced. Each new row
    keeps the row's label, and its text comes from one augmentation picked at
    random (synonym replacement or random word swap).

    Args:
        df: Frame with a 'text' and a 'labels' column.
        augmentation_factor: Number of augmented rows to create per input row.

    Returns:
        A new frame containing the original rows followed by the augmented
        rows, with the index renumbered from 0.
    """
    augmented_rows = []
    for text, label in zip(df['text'], df['labels']):
        for _ in range(augmentation_factor):
            # Pick the augmentation first so only the chosen one is computed.
            augment = random.choice([synonym_replacement, random_swap])
            augmented_rows.append({'text': augment(text), 'labels': label})

    if not augmented_rows:
        # Empty input or a factor of 0: nothing to append.
        return df.reset_index(drop=True)

    # Build the augmented frame once, after the loop.
    return pd.concat([df, pd.DataFrame(augmented_rows)], ignore_index=True)

In [None]:
# Example usage
sentence = "dislike new foods"
# Synonym replacement makes no random choices.
augmented_sentence = synonym_replacement(sentence)
print("Synonym Replacement:", augmented_sentence)

# random_swap draws from the `random` module, which is not seeded at this
# point, so the printed word order can differ between runs.
augmented_sentence = random_swap(sentence, n=1)
print("Random Swap:", augmented_sentence)

Synonym Replacement: disfavor new food
Random Swap: foods new dislike


In [None]:
# apply to df
# Seed the stdlib RNG first: the augmentation picks and word swaps all draw
# from `random`, so a fixed seed makes the augmented rows (and the CSV written
# from them) the same on every run.
random.seed(42)
augmented_df = oversample_data(df_over, augmentation_factor=2)
augmented_df['labels'].value_counts()

1    396
3    378
0    330
4    300
2    261
Name: labels, dtype: int64

In [None]:
# Persist the oversampled + augmented table without the index column.
augmented_df.to_csv(path_or_buf='augmented_df.csv', index=False)