<a href="https://colab.research.google.com/github/joycerlz/distilbert-text-classification/blob/main/clean_essays.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspellchecker

In [2]:
import random

import nltk
import pandas as pd
from nltk.corpus import wordnet
from spellchecker import SpellChecker

In [None]:
# Fetch the 'wordnet' resource; synonym_replacement() looks words up via wordnet.synsets().
nltk.download('wordnet')

In [3]:
# Load the raw essays table; the file is decoded as latin1 instead of the default UTF-8.
df_pre = pd.read_csv("essays.csv", encoding='latin1')

In [4]:
# Drop the author id and shorten the trait column names in a single pass.
# errors='ignore' keeps this cell re-runnable once '#AUTHID' has already been
# removed (rename() already skips keys that are no longer present).
df_pre = df_pre.drop(columns=['#AUTHID'], errors='ignore').rename(columns={
  'TEXT': 'text',
  'cEXT': 'ext',
  'cNEU': 'neu',
  'cAGR': 'agr',
  'cCON': 'con',
  'cOPN': 'opn',
})

In [5]:
# Preview the first rows to confirm the renamed columns.
df_pre.head()

Unnamed: 0,text,ext,neu,agr,con,opn
0,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,I can't believe it! It's really happening! M...,y,n,y,y,n
4,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [None]:
def convert_to_labels(row):
  """Return the class ids of every trait that ``row`` marks with 'y'.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) indexed by the trait
      columns 'agr', 'ext', 'opn', 'con' and 'neu'.

  Returns:
    A list of ints in ascending class-id order; empty when no trait is 'y'.
  """
  # (column, class id) pairs, listed in class-id order.
  trait_ids = (
    ('agr', 0),  # Agreeableness
    ('ext', 1),  # Extraversion
    ('opn', 2),  # Openness
    ('con', 3),  # Conscientiousness
    ('neu', 4),  # Neuroticism
  )
  return [class_id for column, class_id in trait_ids if row[column] == 'y']

def synonym_replacement(sentence, num_replacements=1):
  """Swap up to ``num_replacements`` randomly chosen words for a WordNet synonym.

  Word positions are drawn with replacement, and a word is only changed when
  WordNet has a synset for it whose first lemma differs from the word, so
  fewer than ``num_replacements`` words may actually change.

  Args:
    sentence: Text to augment; it is split on whitespace.
    num_replacements: Number of random positions to try.

  Returns:
    The words re-joined with single spaces. Blank input yields ''.
  """
  words = sentence.split()
  if not words:
    # random.randint(0, -1) would raise ValueError; blank text has nothing to replace.
    return ''
  new_words = list(words)
  for _ in range(num_replacements):
    word_index = random.randint(0, len(words) - 1)
    syns = wordnet.synsets(words[word_index])
    if syns:
      # First lemma of the first synset. Multi-word lemma names are joined with
      # underscores (e.g. "ice_cream"), so turn them back into spaces.
      synonym = syns[0].lemmas()[0].name().replace('_', ' ')
      if synonym != words[word_index]:
        new_words[word_index] = synonym
  return ' '.join(new_words)

## Oversample: for each entry with one or more personality types, emit one row per type (the original text for the first type, a synonym-augmented copy for each additional type)

In [None]:
# Seed the RNG so synonym_replacement() picks the same words on every run and
# the exported CSV is reproducible.
random.seed(42)

spell = SpellChecker()
new_rows = []
for _, row in df_pre.iterrows():
  # One output row per trait the essay is tagged with; essays with no trait
  # produce no rows.
  for i, label in enumerate(convert_to_labels(row)):
    new_row = row.copy()
    if i > 0:
      # The first label keeps the original text; every further label gets an
      # augmented copy with up to 10 words replaced.
      augmented = synonym_replacement(row['text'], 10)
      # NOTE(review): SpellChecker.correction() is a single-word API. Given a whole
      # essay it appears to return either the input unchanged or None depending on
      # the pyspellchecker version - confirm. Fall back to the augmented text so
      # an essay is never replaced by None.
      corrected = spell.correction(augmented)
      new_row['text'] = augmented if corrected is None else corrected
    new_row['labels'] = label
    new_rows.append(new_row)

# Create a new DataFrame from the list of new rows
df = pd.DataFrame(new_rows)

In [None]:
# The per-trait y/n columns are now encoded in 'labels'; drop them in one call.
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['ext', 'neu', 'agr', 'con', 'opn'], errors='ignore')

In [None]:
# Preview the oversampled frame: one row per (essay, label) pair.
df.head()

Unnamed: 0,text,labels
0,"Well, right now I just woke up from a mid-day ...",0
0,"Well, right now I just woke up from a mid-day ...",2
0,"Well, right now I just woke up from a mid-day ...",4
1,"Well, here we go with the stream of consciousn...",0
2,An open keyboard and buttons to push. The thin...,2


In [None]:
# Write the oversampled single-label dataset; index=False leaves the row index out of the file.
df.to_csv('essays_cleaned.csv', index=False)

In [None]:
# Count rows per label id to check class balance after oversampling.
df['labels'].value_counts()

0    1310
1    1276
2    1271
3    1253
4    1233
Name: labels, dtype: int64

## Keep only one occurrence of each text and randomly assign one of its personality types

In [None]:
# Seed the RNG so the randomly chosen label per essay (and therefore the
# exported CSV) is the same on every run.
random.seed(42)

new_rows = []
for _, row in df_pre.iterrows():
  labels = convert_to_labels(row)
  if not labels:
    continue  # essays with no 'y' trait are left out of this dataset
  new_row = row.copy()
  # Keep a single copy of the essay and tag it with one of its traits at random.
  new_row['labels'] = random.choice(labels)
  new_rows.append(new_row)

# Create a new DataFrame from the list of new rows
df = pd.DataFrame(new_rows)

In [None]:
# The per-trait y/n columns are now encoded in 'labels'; drop them in one call.
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['ext', 'neu', 'agr', 'con', 'opn'], errors='ignore')

In [None]:
# Preview the single-label frame: one row per essay with one randomly chosen label.
df.head()

Unnamed: 0,text,labels
0,"Well, right now I just woke up from a mid-day ...",0
1,"Well, here we go with the stream of consciousn...",0
2,An open keyboard and buttons to push. The thin...,4
3,I can't believe it! It's really happening! M...,0
4,"Well, here I go with the good old stream of co...",2


In [None]:
# Write the one-row-per-essay dataset; index=False leaves the row index out of the file.
df.to_csv('essays_cleaned_v2.csv', index=False)

In [None]:
# Count rows per label id to see how the random assignment distributed the classes.
df['labels'].value_counts()

4    539
0    488
2    476
3    453
1    451
Name: labels, dtype: int64

## Multilabels

In [6]:
def convert_multilabels(row):
  """Encode the five trait flags of ``row`` as a 0/1 indicator list.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) indexed by the trait
      columns 'agr', 'ext', 'opn', 'con' and 'neu'.

  Returns:
    A list of five ints; position k is 1 when the k-th trait below is 'y',
    otherwise 0.
  """
  # Position in the output list follows this order:
  # Agreeableness, Extraversion, Openness, Conscientiousness, Neuroticism.
  trait_columns = ('agr', 'ext', 'opn', 'con', 'neu')
  return [1 if row[column] == 'y' else 0 for column in trait_columns]

In [15]:
# Attach the 5-element indicator list to every essay. Building the column on a
# single copy of df_pre avoids copying each row and re-assembling the frame
# from a list of Series, and leaves df_pre itself untouched.
df = df_pre.copy()
df['labels'] = pd.Series(
  [convert_multilabels(row) for _, row in df_pre.iterrows()],
  index=df_pre.index,
  dtype=object,  # each cell holds a Python list
)

In [17]:
# The per-trait y/n columns are now encoded in 'labels'; drop them in one call.
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['ext', 'neu', 'agr', 'con', 'opn'], errors='ignore')

In [20]:
# Preview the multilabel frame.
# NOTE(review): the saved output under this cell lists 10 rows with shuffled index
# values, which does not look like the result of head() - it appears stale; re-run to refresh.
df.head()

Unnamed: 0,text,labels
1211,Today has been one of the longest days of my l...,"[1, 0, 0, 0, 1]"
1112,I get sick to my stomach during your class and...,"[0, 0, 0, 0, 1]"
1342,I'm supposed to write for twenty minutes. So I...,"[0, 1, 1, 1, 1]"
411,I got very bored doing my chemistry homework s...,"[1, 1, 0, 1, 0]"
37,"Ok, I'm doing this stream of consciousness thi...","[1, 1, 1, 0, 0]"
1566,Somtimes I don't understand why people are the...,"[1, 0, 0, 1, 1]"
270,This is the first time I have ever had an assi...,"[0, 0, 0, 1, 1]"
661,I am glad that I am finally getting to this as...,"[1, 1, 1, 1, 0]"
1280,. It's amazing how much my mood differs by my ...,"[0, 1, 0, 1, 0]"
607,But I guess I'm supposed to be trying new thin...,"[1, 1, 0, 1, 0]"


In [19]:
# Write the multilabel dataset; index=False leaves the row index out of the file.
df.to_csv('essays_multilabel.csv', index=False)