In [1]:
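# Uncomment on a fresh environment to install the torchtext version this notebook was written against:
# %pip install torchtext==0.8.1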

In [2]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.data as ttd

In [3]:
# Let's make some fake data!
# Each pair is (label, sentence); the pairs are unzipped into a
# column-oriented dict with the "label" column first and "data" second.
labelled_sentences = [
    (0, "I like eggs and ham."),
    (1, "Eggs I like!"),
    (1, "Ham and eggs or just ham?"),
]
labels, sentences = zip(*labelled_sentences)
data = {"label": list(labels), "data": list(sentences)}

In [4]:
# Turn the column-oriented dict into a DataFrame (one column per dict key).
df = pd.DataFrame.from_dict(data)

In [5]:
# Preview the frame: one row per sentence together with its label.
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [6]:
# Persist the frame to disk without the index column, so the file contains
# only the `label` and `data` columns that are read back later.
df.to_csv(path_or_buf='thedata.csv', index=False)

In [7]:
# Peek at the raw file to confirm the header row and column order.
!head thedata.csv

label,data
0,I like eggs and ham.
1,Eggs I like!
1,Ham and eggs or just ham?


## Create field objects and load the dataset

In [8]:
# --- Field definitions -------------------------------------------------------
# TEXT describes how the raw sentence column is turned into a batch of tokens.
TEXT = ttd.Field(
    sequential=True,   # each sample is a sequence of words
    lower=True,        # lower-case the words
    batch_first=True,  # batches are laid out as N x T
    pad_first=True,    # pre-padding
    # tokenize='spacy',  # left disabled: without it the text is split with string.split()
)

# LABEL describes the target column.
#   * use_vocab=False: if omitted, torchtext complains later, when you iterate
#     over the dataset, that the attribute `vocab` doesn't exist.
#   * is_target=True: if omitted, the label is assumed to be part of the input,
#     so iterating over the dataset would look like
#         for (inputs, targets), _ in iterator:
#     where the 2nd element (_) should have been the target.
LABEL = ttd.Field(is_target=True, use_vocab=False, sequential=False)

# --- Dataset -----------------------------------------------------------------
# Fields must be listed in the order the columns appear in the .csv file.
column_fields = [('label', LABEL), ('data', TEXT)]

dataset = ttd.TabularDataset(
    path='thedata.csv',
    format='csv',
    fields=column_fields,
    skip_header=True,  # the first row of the file is the column header
)



In [9]:
# Grab the first parsed example to inspect what the fields produced.
ex = dataset.examples[0]

In [10]:
# Each parsed row is wrapped in a torchtext Example object.
type(ex)

torchtext.data.example.Example

In [11]:
# Tokenised text: lower-cased and split on whitespace, so punctuation stays
# attached to the neighbouring word.
ex.data

['i', 'like', 'eggs', 'and', 'ham.']

In [12]:
# The label is still the raw string read from the CSV at this stage, not an int.
ex.label

'0'

In [13]:
# Hold out roughly a third of the examples for testing (the default split
# ratio is 0.7). The split shuffles the examples, so the RNG state is pinned:
# a dedicated `random.Random` instance supplies the state without touching the
# global RNG. This keeps the train/test membership, and therefore the
# vocabulary built from the training split, identical on every
# Restart & Run All.
SPLIT_SEED = 1234
train_dataset, test_dataset = dataset.split(
    0.66,
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [14]:
# Print the token list of every example that ended up in the training split.
# A dedicated loop variable is used so that the notebook-level `ex` (the first
# dataset example inspected earlier) is not silently rebound to the last
# training example, which would change what those inspection cells show when
# they are re-run.
for train_ex in train_dataset.examples:
    print(train_ex.data)

['ham', 'and', 'eggs', 'or', 'just', 'ham?']
['i', 'like', 'eggs', 'and', 'ham.']


In [15]:
# Build the vocabulary from the training split only.
TEXT.build_vocab(train_dataset)

In [16]:
# Keep a handle on the vocabulary that build_vocab attached to the TEXT field
# and show its type.
vocab = TEXT.vocab
type(vocab)

torchtext.vocab.Vocab

In [17]:
# String-to-index mapping: token -> integer id.
vocab.stoi

# If you use spaCy, punctuation is considered a token of its own ('?', '.').

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fc8f0cf48e0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'and': 2,
             'eggs': 3,
             'ham': 4,
             'ham.': 5,
             'ham?': 6,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [25]:
# Look up the integer id of a single token.
vocab.stoi['ham']

4

In [19]:
# Index-to-string list: position i holds the token whose id is i.
vocab.itos

['<unk>',
 '<pad>',
 'and',
 'eggs',
 'ham',
 'ham.',
 'ham?',
 'i',
 'just',
 'like',
 'or']

In [26]:
# Reverse lookup: map an integer id back to its token.
vocab.itos[4]

'ham'

In [20]:
# Prefer the first CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [21]:
# Build one iterator per split.
#   * batch_sizes is a tuple: (batch size for train, batch size for test).
#   * device is where the iterator places each batch (e.g. the GPU).
#   * sort_key measures an example by its number of tokens.
# NOTE(review): the intent of sort_key is to batch sentences of roughly equal
# length together. Presumably a plain `Iterator` only sorts when sorting is
# enabled, and length-bucketed training batches need `BucketIterator`; confirm
# which behaviour is wanted for the training iterator.
split_pair = (train_dataset, test_dataset)
train_iter, test_iter = ttd.Iterator.splits(
    split_pair,
    batch_sizes=(2, 2),
    device=device,
    sort_key=lambda example: len(example.data),
)



In [22]:
# Peek at the first training batch only: print each tensor with its shape,
# then stop iterating.
for inputs, targets in train_iter:
    for tag, tensor in (("inputs:", inputs), ("targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)
    break

inputs: tensor([[ 4,  2,  3, 10,  8,  6],
        [ 1,  7,  9,  3,  2,  5]]) shape: torch.Size([2, 6])
targets: tensor([1, 0]) shape: torch.Size([2])




In [23]:
# Peek at the first test batch only: print each tensor with its shape,
# then stop iterating.
for inputs, targets in test_iter:
    for tag, tensor in (("inputs:", inputs), ("targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)
    break

inputs: tensor([[3, 7, 0]]) shape: torch.Size([1, 3])
targets: tensor([1]) shape: torch.Size([1])


In [24]:
# Exercise: Figure out which sequence of integers goes with which sentence.