In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/Notebooks/Repository/go-emotions/notebooks
except: pass

In [9]:
%run library_utils.ipynb

import os 
import re
import pandas as pd
import preprocessor
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Register tqdm's `progress_apply` on pandas objects (used later when the
# text column is cleaned).
tqdm.pandas()

# NOTE(review): `load_args` and `init_seed` are not defined in this notebook;
# presumably they are provided by the `%run library_utils.ipynb` above — confirm.
# NOTE(review): the saved output of this cell is an AssertionError and its
# execution count is out of sequence with the following cells; re-check with
# Restart Kernel -> Run All.
module = 'go-emotion-pool'
args = load_args(module)
init_seed(args.seed)

AssertionError: 

## Cleaning dataset

In [3]:
# Input files, all resolved relative to `args.input_dir`:
# the source dataset, the spelling-correction list and the contraction table.
dataset_source_path = os.path.join(args.input_dir, args.dataset_source_path)
spelling_map_path = os.path.join(args.input_dir,  'spelling', 'birkbeck.txt')
contraction_map_path = os.path.join(args.input_dir, 'contractions', 'contractions.csv')

In [4]:
# Load the source dataset from `dataset_source_path` and preview the first
# three rows.
D = pd.read_csv(dataset_source_path)
D.head(3)

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Spelling-correction table. Each input line has the form
# "correction: misspell_1 misspell_2 ..."; it is expanded to one row per
# misspelling, keeping only the first row seen for each misspelling.
S = (
    pd.read_csv(spelling_map_path, sep=":", names=["correction", "misspell"])
    .assign(misspell=lambda frame: frame['misspell'].str.strip().str.split(' '))
    .explode('misspell')
    .reset_index(drop=True)
    .drop_duplicates('misspell')
)
S.head(3)

Unnamed: 0,correction,misspell
0,Albert,Ab
1,America,Ameraca
2,America,Amercia


In [6]:
# Contraction lookup table, e.g. "shouldn't" => "should not".
# The two CSV columns are renamed and then reordered to (meaning, contraction).
C = pd.read_csv(contraction_map_path)
C.columns = ['contraction', 'meaning']
C = C.loc[:, ['meaning', 'contraction']]
C.head(3)

Unnamed: 0,meaning,contraction
0,alright,'aight
1,is not,ain't
2,am not,amn't


In [7]:
# Look-alike characters normalized to their ASCII equivalent
# (typographic apostrophe -> plain apostrophe).
mapChar = {
    '’': "'",
}
# contraction -> expanded meaning
mapC = dict(zip(C['contraction'], C['meaning']))
# misspelling -> correction
mapS = dict(zip(S['misspell'], S['correction']))
# Raw string: the backslash is a literal member of the punctuation set, and
# the non-raw form relied on the invalid escape sequence "\,".
punctuations = r'''()-[]{};:'"\,<>./@#$%^&_~'''
# Every punctuation character maps to the empty string (i.e. is deleted).
mapP = dict.fromkeys(punctuations, "")

# All patterns are raw strings so that regex escapes such as \s, \[ and \(
# are not interpreted (and warned about) as Python string escapes.
# The pattern text itself is unchanged.

# One or more consecutive digits.
re_number = re.compile(r'[0-9]+')
# http:// or https:// followed by URL characters / percent-escapes.
re_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
# Upper-case placeholder tags such as [NAME].
re_tag = re.compile(r'\[[A-Z]+\]')
# Runs of characters outside: alphanumerics, whitespace and ? ! . , : ' " /
re_char = re.compile(r'''[^0-9a-zA-Z\s?!.,:'"//]+''')
# Single characters outside: alphanumerics, whitespace and ? ! . , [ ]
re_char_clean = re.compile(r'[^0-9a-zA-Z\s?!.,\[\]]')
# A single punctuation mark out of ? ! , . ' "
re_punc = re.compile(r'''[?!,.'"]''')

In [8]:
def _word_mapper(text, mapper):
    for word in text.split(' '):
        if word in mapper:
            text = text.replace(word, mapper[word])
    return text
    

def _char_mapper(text, mapper):
    for k, v  in mapper.items():
        text = text.replace(k, v)
    return text


def clean_text(text):
    """Normalize one raw text sample into a lower-cased, cleaned string.

    Steps, in order:
      1. map look-alike characters (`mapChar`) to their ASCII form;
      2. drop characters not allowed by `re_char`;
      3. replace known misspellings token by token (`mapS`);
      4. replace URLs with the ``[url]`` tag;
      5. run `preprocessor.clean` on the text;
      6. keep only the characters allowed by `re_char_clean`;
      7. lower-case and collapse whitespace.

    Contraction expansion (`mapC`), number tagging (`re_number`) and
    punctuation spacing (`re_punc`) are intentionally not applied.

    Args:
        text: Raw text. Non-string values (e.g. a missing value) are treated
            as empty.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # Must run before the unknown-character filter: `re_char` deletes the
    # typographic apostrophe, which would make this mapping a no-op.
    text = _char_mapper(text, mapChar)  # Similar characters mapping
    text = re_char.sub("", text)  # Remove unknown characters
    # text = _word_mapper(text, mapC)  # Expand contractions (disabled)
    text = _word_mapper(text, mapS)  # Fix spelling mistakes

    # text = re_number.sub(' [number] ', text)  # Replace numbers with a tag (disabled)
    text = re_url.sub(' [url] ', text)  # Replace URLs with a tag

    # text = re_punc.sub(lambda a: f" {a.group(0)} ", text)  # Space out punctuation (disabled)
    text = preprocessor.clean(text)  # Tweet-style cleaning

    text = re_char_clean.sub("", text)  # Only alphanumerics and kept punctuation
    text = text.lower()

    # Collapse whitespace runs and trim the ends. The previous filter compared
    # tokens of split(' ') against " ", which can never match.
    return " ".join(text.split())


## Preparing dataset 

In [None]:
def prepare_split_data(
    dataset_source_path,
    train_dataset_path,
    test_dataset_path,
    emotions,
    drop_insignifiant=False,
    test_split=0.2,
    seed=0,
):
    """Clean the source dataset and write a shuffled train/test split as CSV.

    Args:
        dataset_source_path: CSV file with a `text` column and one column per
            entry of `emotions`.
        train_dataset_path: Destination CSV for the training split.
        test_dataset_path: Destination CSV for the test split.
        emotions: Iterable of label column names to keep.
        drop_insignifiant: When true, drop rows whose label columns sum to 0.
        test_split: `test_size` passed to `train_test_split`.
        seed: `random_state` passed to `train_test_split`.

    Returns:
        None. The two splits are written to disk without the index.
    """
    label_columns = list(emotions)

    frame = pd.read_csv(dataset_source_path)
    frame['text'] = frame['text'].progress_apply(clean_text)

    # Keep only rows that still have text, and only the text + label columns.
    has_text = frame['text'] != ''
    frame = frame.loc[has_text, ['text'] + label_columns]

    if drop_insignifiant:
        has_label = frame[label_columns].sum(1) > 0
        frame = frame[has_label].reset_index(drop=True)

    train_part, test_part = train_test_split(
        frame,
        test_size=test_split,
        shuffle=True,
        random_state=seed,
    )

    train_part.to_csv(train_dataset_path, index=None)
    test_part.to_csv(test_dataset_path, index=None)

In [9]:
# NOTE(review): this cell reads the train split from disk, but the cell that
# calls `prepare_split_data` to write that file comes later in the notebook.
# On a fresh run the file must already exist from an earlier session —
# confirm whether this preview cell is still needed.
train_dataset_path = os.path.join(args.output_dir, args.train_dataset_path)
test_dataset_path = os.path.join(args.output_dir, args.test_dataset_path)

# Preview the first three rows of the stored training split.
Dtr = pd.read_csv(train_dataset_path)
Dtr.head(3)

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
0,"oh yah, do worries. a appropriate you times i ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"name juice i a runs. dont worry, name i lacing...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,"its a donuts dont bought ya back, aint it?",0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Output locations of the two splits, relative to `args.output_dir`.
train_dataset_path = os.path.join(args.output_dir, args.train_dataset_path)
test_dataset_path = os.path.join(args.output_dir, args.test_dataset_path)

# Clean the source dataset and write the train/test CSV files.
prepare_split_data(
    dataset_source_path=dataset_source_path,
    train_dataset_path=train_dataset_path,
    test_dataset_path=test_dataset_path,
    emotions=args.emotions,
    drop_insignifiant=args.drop_insignifiant,
    test_split=args.test_split,
    seed=args.seed,
)

# Reload the freshly written training split and preview it.
Dtr = pd.read_csv(train_dataset_path)
Dtr.head(3)

100%|██████████| 211225/211225 [00:25<00:00, 8186.49it/s]


Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
0,"oh yah, do worries. a appropriate you times i ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"name juice i a runs. dont worry, name i lacing...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,"its a donuts dont bought ya back, aint it?",0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
