In [1]:
# Code originally adapted from: https://docs.microsoft.com/en-us/learn/paths/pytorch-fundamentals/
# Unlike the original code though, this notebook was run in Google Colab

In [1]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.5.1-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 47.3 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 47.7 MB/s 
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.5.1 torchdata-0.4.1 urllib3-1.25.11


In [2]:
import torch
import torchtext
import os
import collections
os.makedirs('data',exist_ok=True)
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='data')

In [3]:
print(train_dataset) # These are iterators

ShardingFilterIterDataPipe


In [4]:
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

In [5]:
for i,x in zip(range(5),train_dataset):
  print(f"**{classes[x[0]]}** -> {x[1]}\n")

**Sci/Tech** -> Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

**Sci/Tech** -> Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.

**Sci/Tech** -> Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.

**Sci/Tech** -> Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.

**Sci/Tech** -> Oil prices soa

In [6]:
# Because these datasets are built as iterators, we have to convert it to list for our purpose
train_dataset = list(train_dataset)
test_dataset = list(test_dataset)

In [7]:
train_dataset[0:10]

[(3,
  "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."),
 (3,
  'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'),
 (3,
  "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."),
 (3,
  'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'),
 (3,
  'Oil prices soar to all-time record, 

# **Tokenization and Vectorization to representation as tensors**

In [8]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [9]:
first_sentence = train_dataset[0][1]
second_sentence = train_dataset[1][1]

print(first_sentence)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


In [10]:
f_tokens = tokenizer(first_sentence)
s_tokens = tokenizer(second_sentence)

print(f'\nfirst token list:\n{f_tokens}')
print(f'\nsecond token list:\n{s_tokens}')


first token list:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']

second token list:
['carlyle', 'looks', 'toward', 'commercial', 'aerospace', '(', 'reuters', ')', 'reuters', '-', 'private', 'investment', 'firm', 'carlyle', 'group', ',', '\\which', 'has', 'a', 'reputation', 'for', 'making', 'well-timed', 'and', 'occasionally\\controversial', 'plays', 'in', 'the', 'defense', 'industry', ',', 'has', 'quietly', 'placed\\its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market', '.']


In [11]:
# To convert text to numbers, we need to build a vocabulary of all tokens
# We use dictionary via Counter and then create a Vocab object

counter = collections.Counter()
for (label, line) in train_dataset:
  counter.update(tokenizer(line))
print(counter)



In [12]:
# vocab = torchtext.vocab.Vocab(counter, min_freq=1) # min_freq gave an error for some reason
vocab = torchtext.vocab.Vocab(counter)
print(vocab)

Vocab()


In [13]:
word_lookup = [list((vocab[w], w)) for w in f_tokens]
print(f'\nIndex lookup in 1st sentence:\n{word_lookup}')


Index lookup in 1st sentence:
[[1395, 'wall'], [1409, 'st'], [225971, '.'], [399, 'bears'], [17, 'claw'], [4123, 'back'], [6637, 'into'], [203843, 'the'], [761, 'black'], [41106, '('], [19310, 'reuters'], [40787, ')'], [19310, 'reuters'], [39206, '-'], [2, 'short-sellers'], [165685, ','], [1395, 'wall'], [1581, 'street'], [32235, "'"], [61724, 's'], [1, 'dwindling\\band'], [97909, 'of'], [2, 'ultra-cynics'], [165685, ','], [9723, 'are'], [135, 'seeing'], [828, 'green'], [1758, 'again'], [225971, '.']]


In [14]:
word_lookup = [list((vocab[w], w)) for w in s_tokens]
print(f'\nIndex lookup in 2nd sentence:\n{word_lookup}')


Index lookup in 2nd sentence:
[[15, 'carlyle'], [600, 'looks'], [758, 'toward'], [490, 'commercial'], [124, 'aerospace'], [41106, '('], [19310, 'reuters'], [40787, ')'], [19310, 'reuters'], [39206, '-'], [696, 'private'], [809, 'investment'], [1776, 'firm'], [15, 'carlyle'], [4676, 'group'], [165685, ','], [5, '\\which'], [18945, 'has'], [110153, 'a'], [117, 'reputation'], [50186, 'for'], [1114, 'making'], [2, 'well-timed'], [68872, 'and'], [1, 'occasionally\\controversial'], [296, 'plays'], [95488, 'in'], [203843, 'the'], [1192, 'defense'], [2264, 'industry'], [165685, ','], [18945, 'has'], [140, 'quietly'], [1, 'placed\\its'], [66, 'bets'], [56529, 'on'], [2508, 'another'], [1636, 'part'], [97909, 'of'], [203843, 'the'], [3637, 'market'], [225971, '.']]


In [15]:
vocab_size = len(vocab)
print(f"Vocab size of {vocab_size}")

Vocab size of 95810


In [16]:
from torchtext.data.utils import ngrams_iterator

bi_counter = collections.Counter()
for (label, line) in train_dataset:
  bi_counter.update(ngrams_iterator(tokenizer(line),ngrams=2))
bi_vocab = torchtext.vocab.Vocab(bi_counter)

In [17]:
print(len(bi_vocab))

1308842
