In [1]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.legacy.data as ttd

In [2]:
# Toy corpus for the demo: three short sentences, each with a binary label.
# The keys end up as the column names ("label" first, then "data").
data = dict(
    label=[0, 1, 1],
    data=[
        "I like eggs and ham",
        "Eggs I like!",
        "Ham and eggs or just ham?",
    ],
)

In [3]:
# Each dict key becomes a column; rows get the default integer index.
df = pd.DataFrame.from_dict(data)

In [4]:
# Preview the frame; it should show the two columns "label" and "data".
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [5]:
# index=False keeps the row index out of the file, so the CSV has exactly the
# two columns (label, data) that the TabularDataset `fields` list expects.
df.to_csv("thedata.csv", index=False)

In [6]:
# Sanity-check the file on disk (shell escape; needs a Unix-like `head`).
!head thedata.csv

label,data
0,I like eggs and ham
1,Eggs I like!
1,Ham and eggs or just ham?


In [7]:
# Field describing how the raw sentence column is turned into token ids.
TEXT = ttd.Field(
    tokenize="spacy",   # split sentences with the spaCy tokenizer
    lower=True,         # lowercase the tokens
    sequential=True,    # the column holds a token sequence, not a single value
    batch_first=True,   # batches come out as (batch, seq_len)
    pad_first=True,     # shorter sequences are padded on the left
)

# Field for the class-label column.
LABEL = ttd.Field(
    use_vocab=False,   # labels in the CSV are already numeric (0/1), so no vocab
    sequential=False,  # one value per example, nothing to tokenize
    # is_target=True puts this field on the target side of each batch, so the
    # iterators can be unpacked as `for inputs, targets in iterator:`.
    # With is_target=False it would be treated as part of the input instead.
    is_target=True,
)

# Parse thedata.csv into torchtext examples.
# The (name, field) pairs are listed in the same order as the CSV columns,
# and the header row is skipped so it is not parsed as an example.
dataset = ttd.TabularDataset(
    path="thedata.csv",
    format="csv",
    fields=[
        ("label", LABEL),
        ("data", TEXT),
    ],
    skip_header=True,
)



In [8]:
# Pull out the first parsed example to see what the fields did to it.
ex = dataset.examples[0]

In [9]:
# Check what kind of object the dataset stores for each row.
type(ex)

torchtext.legacy.data.example.Example

In [10]:
# The sentence after TEXT preprocessing: tokenized and lowercased.
ex.data

['i', 'like', 'eggs', 'and', 'ham']

In [11]:
# The label is still a string at this stage (e.g. '0'); it only shows up as an
# integer tensor once the iterators batch it.
ex.label

'0'

In [12]:
# Split roughly 66% / 34% into train and test. Dataset.split shuffles the
# examples before cutting, so pass a fixed random_state: without it the split
# (and the vocabulary built from the training part) changes on every run.
# A private random.Random instance is used so the global `random` state is
# left untouched.
SPLIT_SEED = 1234
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [13]:
# Build the vocabulary from the training split only; tokens that are not in it
# fall back to the <unk> index at lookup time.
TEXT.build_vocab(train_dataset)

In [14]:
# Keep a handle on the vocabulary object that build_vocab attached to TEXT.
vocab = TEXT.vocab
type(vocab)

torchtext.vocab.Vocab

In [15]:
# stoi: token string -> integer index.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f243dec2d60>>,
            {'<unk>': 0,
             '<pad>': 1,
             'eggs': 2,
             'ham': 3,
             '!': 4,
             '?': 5,
             'and': 6,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [16]:
# itos: integer index -> token string (the inverse of stoi).
vocab.itos

['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']

In [17]:
# Prefer the first GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [18]:
# One iterator per split, both with batch size 2, placing tensors on `device`.
# sort_key ranks examples by the token count of their `data` field.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(2, 2),  # (train_batch_size, test_batch_size)
    sort_key=lambda example: len(example.data),
    device=device,
)

In [19]:
# Peek at the first training batch only, then stop iterating.
# Shorter sentences are left-padded with the <pad> index (pad_first=True).
for inputs, targets in train_iter:
    print(f"inputs: {inputs}, shape: {inputs.shape}")
    print(f"targets: {targets}, shape: {targets.shape}")
    break

inputs: tensor([[ 1,  1,  1,  2,  7,  9,  4],
        [ 3,  6,  2, 10,  8,  3,  5]]), shape: torch.Size([2, 7])
targets: tensor([1, 1]), shape: torch.Size([2])


In [20]:
# Same peek for the test iterator: show its first batch and stop.
for inputs, targets in test_iter:
    print(f"inputs: {inputs}, shape: {inputs.shape}")
    print(f"targets: {targets}, shape: {targets.shape}")
    break

inputs: tensor([[7, 9, 2, 6, 3]]), shape: torch.Size([1, 5])
targets: tensor([0]), shape: torch.Size([1])
