# Data Pre-processing and dataloaders

In [1]:
# Data source download: https://drive.google.com/file/d/1y7yjshepNRPhnh-Qz5MTRbnopGn7KzUm/view?usp=sharing
# Originally from: https://github.com/sarnthil/unify-emotion-datasets

In [2]:
from data.unified_emotion.unified_emotion import unified_emotion

# Load the unified emotion corpus from its .jsonl file and run the default
# preprocessing. A custom text tokenizer can be supplied instead by calling
# `unified.prep(text_tokenizer=...)`.
dataset_path = "./data/datasets/unified-dataset.jsonl"
unified = unified_emotion(dataset_path)
unified.prep()

In [3]:
# Per-dataset sizes keyed by dataset name
# (presumably the number of examples in each source dataset — confirm in `unified_emotion`).
unified.lens

{'grounded_emotions': 2585,
 'crowdflower': 40000,
 'dailydialog': 102979,
 'tales-emotion': 14771,
 'tec': 21051,
 'emoint': 7102}

In [4]:
# Preview one training batch of text from every dataset in the collection.
# The loop variable is named `dataset_name` so it is not confused with the
# `k=1` keyword argument passed to `get_dataloader`.
for dataset_name in unified.lens:
    trainloader, testloader = unified.get_dataloader(dataset_name, k=1)
    _, text = next(trainloader)
    # Dataset name, then the batch, then a blank separator line.
    print(dataset_name, text, "", sep="\n")

grounded_emotions
["@PennyDepp It's clearly the Irish missionaries going to Nigeria that brought their proverbs", 'RT @PattyArquette: @realDonaldTrump You are a heartless Billionaire who has 3 residences. We pay more 2 protect UR wife in NY than Arts &amp; Mâ\x80¦']

crowdflower
['@mitchelmusso', 'UGG WANT TO GO TO KAYLEN HOUSE BUT I CANT FINNA BE BORED THIS WEEKEND!!! UUURGG WANNA SPEND DA NITE  AND GO SEE UP AND GO SHOPPING', "I don't know what my Mom gave me to clean my Macbook with but it is SO white now. It was tinted grayish blue from my black desk.", "Have to sell my car. It's costing me too much. Can now afford one rollerskate. Bye bye petey", "Alexia has clogged up twitter...... again.... so I thought I'd post a tweet. Um... Just had a shower!!  Now I will have a drink! Teehee...", '@MotherBlanker I love brewing ideas! Great things always come from them.  keeping my fingers crossed for you!', 'unemployment office sucks', 'Cleaning the House! Im so boring..']

dailydialog
['Wha

In [5]:
from transformers import AutoTokenizer
# Build the pretrained BERT tokenizer and pass it to the loader, so that the
# batch comes back as (labels, token-id tensor) instead of raw strings.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

trainloader, testloader = unified.get_dataloader(
    'grounded_emotions',
    tokenizer=tokenizer,
    shuffle=True,
)
next(trainloader)

(tensor([1, 1, 1, 1, 0, 0, 0, 0]),
 tensor([[  101, 19387,  1030,  3021, 18098, 18632,  1024,  2644,  2157,  2085,
           1998,  2228,  2055,  2023,  1012,  2256,  2406,  2003,  2085,  2641,
          25135,  2000,  3942,  2011,  3010,  2611, 10158,  1012,  2003,  2023,
           2637,  1029,  1037, 29649,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0],
         [  101,  1030,  2771, 19658,  9628, 10732,  4806,  2003,  1996,  2069,
           2126,  2000,  2147,  1016, 18150,  5886,  1012,  4807,  3084,  3071,
           2468,  2149,  1012,  2057,  2024,  2149,  1006,  1996,  2088,  1007,
           1004, 23713,  1025,  1045,  1005,  1049,  2025,  2358,  9397,  3070,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0],
         [  101, 19387,  1030,  6289,  2906,  5092,  1024,  8398,  1005, 

In [6]:
from data.utils.sampling import dataset_sampler

# Sample the name of a source dataset using the 'sqrt' sampling method
# (presumably weights datasets by the square root of their size — confirm in `dataset_sampler`).
source_name = dataset_sampler(
    unified,
    sampling_method='sqrt',
)
source_name

'crowdflower'

In [7]:
# Drain the test loader to show how exhaustion is signalled: the loader raises
# StopIteration when there is not enough data left to generate an N x K shot.
# The exception is the point of this cell, so it is caught and its message is
# displayed rather than left uncaught, which would abort "Restart & Run All".
try:
    while True:
        next(testloader)
except StopIteration as exhausted:
    print(f"StopIteration: {exhausted}")

StopIteration: Some classes ran out of data.

# Custom Tokenizer

In [8]:
from data.utils.tokenizer import manual_tokenizer

The raw data set: a shuffled batch of untokenized text from the `tec` dataset.

In [9]:
# Draw one shuffled batch from the 'tec' dataset. No tokenizer is passed to the
# loader here, and `text` is displayed as the cell result.
loaders = unified.get_dataloader('tec', shuffle=True)
trainloader, testloader = loaders
labels, text = next(trainloader)
text

['I opened my door and saw my niece and nephew',
 "Just found out I'm going to Virginia on Christmas Day. #HolidayPlans",
 'i come home to find no one home',
 'i love having perfect days on a monday.',
 "@Zohair_Z for sure! and they've taken a nosedive since then kind of like EP after Irtiqa",
 '@Maffica I ain tink he face fat he maybe tryin to get it fat',
 'i need a vacation, junk food ,retail therapy, a biiiig hug , a bottle and a blunttt.',
 "If Hogs were playin ANYONE but MS Valley State I'd let it slide for Mike Anderson Here's to 3.5 months of no Modern Family on time.",
 "Oh how I love working on homework til 11 o'clock at night",
 'â\x80\x8e525,600 minutes...how do you measure a year?   #sorrow #thankful #blessed #LOVE',
 'Break! In a mountain meadow you see wild horses running free. Connect with them &amp; feel the freedom. #LJB',
 "So instead of shopping like 92% of the population, tomorrow I'm doing nothing but applying for jobs. Oh and working..",
 'Bloom is now a fucking 

The same sample, now manually tokenized

In [10]:
# Apply the manual tokenizer to each sentence of the batch held in `text`.
[manual_tokenizer(sentence) for sentence in text]

['i opened my door and saw my niece and nephew',
 'just found out i am going to virginia on christmas day . # holidayplans',
 'i come home to find no one home',
 'i love having perfect days on a monday .',
 '@USER for sure ! and they have taken a nosedive since then kind of like ep after irtiqa',
 '@USER i ain tink he face fat he maybe tryin to get it fat',
 'i need a vacation , junk food , retail therapy , a biiiig hug , a bottle and a blunttt .',
 'if hogs were playin anyone but ms valley state i d let it slide for mike anderson here s to 3 . 5 months of no modern family on time .',
 "oh how i love working on homework til 11 o ' clock at night",
 '\u200e525 , 600 minutes ... how do you measure a year ? # sorrow # thankful # blessed # love',
 'break ! in a mountain meadow you see wild horses running free . connect with them feel the freedom . # ljb',
 'so instead of shopping like 92 % of the population , tomorrow i am doing nothing but applying for jobs . oh and working ..',
 'bloom i

The custom tokenizer can be easily slotted into the data loading process by passing it to `prep`

In [11]:
# Re-run preprocessing on the same `unified` object, this time passing the
# manual tokenizer as `text_tokenizer`. Loaders created after this call yield
# the tokenized text (compare the batch preview that follows).
# NOTE(review): this replaces the state produced by the earlier `unified.prep()`
# call, so earlier cells must be re-run to see raw text again — confirm.
unified.prep(text_tokenizer=manual_tokenizer)

Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.
Removed sentence for bad encoding.


In [12]:
# Preview one training batch per dataset again, now that `prep` has been re-run
# with the manual tokenizer.
for name in list(unified.lens):
    loaders = unified.get_dataloader(name, k=1)
    trainloader, testloader = loaders
    _, text = next(trainloader)
    # Name line, batch line, then an empty line between datasets.
    print(f"{name}\n{text}\n")

grounded_emotions
['rt @USER : trump wants $ 54b more for bombs while cutting meals on wheels . this is america , 2017 .', '@USER @USER @USER he s getting a kick back from the corporate insurance companies .. he s as dirty as trump']

crowdflower
['@USER i think 30 bucks and i dunno', 'got to clean the bathroom today .', 'up early this morning , first to portmeirion then home', 'last night s paper writing session = still not done . i need to prioritize better .', 'gah ! do not realize my meeting was from 9 - 12', '@USER but if you got hit by a car , who would make such lovely music ?', 'i am really mad at the world today . today is just a sucky day .', '@USER not if you have to be to work at six ...']

dailydialog
['a well - learned person never does that .', 'three years . they have been dating since freshman year . they are even talking about marriage .', 'you are welcome , bye !', 'i know . but i just can not stand her butting in when i was trying to emphasize a point .', 'really ? 