# Data Pre-processing and dataloaders

In [1]:
# Download the unified dataset: https://drive.google.com/file/d/1y7yjshepNRPhnh-Qz5MTRbnopGn7KzUm/view?usp=sharing
# (the loading cell below reads it from ./data/datasets/unified-dataset.jsonl)
# Originally from: https://github.com/sarnthil/unify-emotion-datasets

In [2]:
from data.unified_emotion.unified_emotion import unified_emotion

# Location of the unified emotion corpus (JSON Lines file), relative to the
# directory the notebook is run from.
DATASET_PATH = "./data/datasets/unified-dataset.jsonl"

# Wrap the corpus file, then run the wrapper's preparation step.
# NOTE(review): what prep() does internally is not visible from this notebook;
# presumably it parses/splits the file per source dataset — confirm in
# data/unified_emotion/unified_emotion.py.
unified = unified_emotion(DATASET_PATH)
unified.prep()

In [3]:
# Show the per-source sizes: a dict keyed by source dataset name with one
# integer per source (presumably the number of examples — TODO confirm).
unified.lens

{'grounded_emotions': 2585,
 'ssec': 4868,
 'crowdflower': 40000,
 'dailydialog': 102979,
 'tales-emotion': 14771,
 'tec': 21051,
 'emoint': 7102}

In [4]:
# Ask for the train/test loader pair of the 'emoint' source, using the default
# options (no tokenizer, no explicit shuffle argument).
emoint_loaders = unified.get_dataloader('emoint')
trainloader, testloader = emoint_loaders

# Pull one batch from the train loader. Without a tokenizer it is displayed as
# a pair of plain Python lists: integer labels and the raw text strings.
next(trainloader)

([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
 ['@rtrn94 Mine is that the party did decide but the party has been slowly transformed into a vengeful hell-cult of white male resentment',
  '@coltonflurry @StrangeFacesLA I cancelled by CBS all access live feeds before JC even said Vic won AFP. Paul.should have won IMO #bitter',
  '@fluffysoftlouis no no. I insist that you give me your best insult first',
  '@Apple thanks for ios10 update, even the best app @telegram freezing and crashing on SE.',
  "#ahs6 every 5 minutes I've been saying 'nope nope I'd be gone by now.' 'MOVE' or 'GTFO' so thank you for the",
  "i met with an employment specialist this morning &amp; she basically told me its unlikely i'll get a job because i clearly have too much anxiety",
  '@Gen_Ironicus Musicians have a superstitious dread of crossing highway 69, true story',
  "It's 5:55am. I'm hungry but there is no food. #panic",
  "@viquintis OH yeah; I never showed you that part of the session did I? I'll plo

In [5]:
# Rebind the loaders to the 'grounded_emotions' source, this time passing
# shuffle=True. Note that this overwrites the trainloader/testloader names
# from the previous cell.
trainloader, testloader = unified.get_dataloader(
    'grounded_emotions',
    shuffle=True,
)

# Display one training batch (labels list, raw texts list).
next(trainloader)

([1, 1, 1, 1, 0, 0, 0, 0],
 [".@THEVinceRusso How long till you hit @johnmellencamp's home in Bloomington, IN to duet #CrumblinDown forâ\x80¦ https://t.co/FUlWhmyyoE",
  'RT @4everNeverTrump: @davidfrum Spicer lies. Spicer lies. Spicer lies.',
  '@realDonaldTrump because your ego is so damaged that you NEED it. Campaign is over. #SAD',
  "@SamanthaAugeri. @SamanthaAugeri Hey Sam Augeri Happy international women's Day From Ralph ð\x9f\x98\x80ð\x9f\x98\x80â\x9d¤â\x9d¤â\x9d¤â\x9d¤â\x9d¤",
  "@StopTrump2020 like Trump, #Trumpcare is a #disaster If the @GOP passes it they're will be #riots #wewoke #theyaintfoolinus",
  'Stay positive  #saturdaymorning',
  "Alice's face thinking about the weather on Thursday. alicethatcj #smile #happy #fuji #fujifilmâ\x80¦ https://t.co/k3Fu88D7OS",
  'Boom https://t.co/LHjH6Io6aE'])

In [6]:
from transformers import AutoTokenizer

# Pretrained cased BERT tokenizer from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Same source as before, but with a tokenizer supplied in addition to
# shuffling. Keyword order matches the direct call form.
loader_options = {'tokenizer': tokenizer, 'shuffle': True}
trainloader, testloader = unified.get_dataloader('grounded_emotions', **loader_options)

# With a tokenizer the batch is displayed as tensors: a label tensor and a 2-D
# tensor of token ids in which shorter rows are filled with trailing zeros.
next(trainloader)

(tensor([1, 1, 1, 1, 0, 0, 0, 0]),
 tensor([[  101,   155,  1942,   137,   149, 15554,  2895, 16382, 22433,   131,
            137, 10624,  2064,  9962,   137,   180, 13148,  8752,  3810,  1580,
           1571,  1195,  1274,   112,   189,  1221,  1191,  1119,  1125,  1251,
            107,  1842,  1948,   107,  2612,   108,  8499,  1281,   112,   189,
           1836,  1117,  7538,   106,  1109,  1299,  1110,   170,   100,   102],
         [  101,   137,  1842,  2137, 24059,  1181,  1942, 27321,   108, 13411,
           2137,   119, 19293, 12507,  2162, 15531, 17056,  2137,   119,   108,
           9468,  2737,  2138,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  101,  2421,   112,   188,  1208,  5663,  1240, 12616,   119,  1135,
            112,   188, 16972,  2346,  1136, 16972,  1105,  1128,  1444,   170,
   

In [7]:
from data.utils.sampling import dataset_sampler

# Pick the name of one source dataset from `unified`.
# NOTE(review): 'sqrt' presumably weights sources by the square root of their
# size — confirm in data/utils/sampling.py. No random seed is set in the
# visible cells, so the sampled name may differ between runs.
source_name = dataset_sampler(unified, sampling_method='sqrt')
source_name

'tec'

In [8]:
# Drain the test loader on purpose: next() raises StopIteration once there is
# not enough data left to generate another N x K shot.
# `testloader` is whichever loader was bound most recently (the tokenized
# 'grounded_emotions' one when the cells are run in order).
# The exception is the point of this demonstration, so it is caught and
# displayed instead of being left uncaught, which would abort
# "Restart Kernel -> Run All" at this cell.
try:
    while True:
        next(testloader)
except StopIteration as exhausted:
    # Show the loader's own message in the same form as an uncaught error.
    print(f"StopIteration: {exhausted}")

StopIteration: Some classes ran out of data.