<a href="https://colab.research.google.com/github/jinwooahnKHU/Pytorch_tutorial_practice/blob/main/Representing_text_as_tensors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Representing text

In [12]:
# token : atomic piece of text. Could be letters, words or parts of words
# tokenization : converting text into a sequence of tokens
# vectorization : assign each token to a number 

#huggingface, gensim, nltk, opencv, torch 관련 패키지들 다운로드

!pip install -r https://raw.githubusercontent.com/MicrosoftDocs/pytorchfundamentals/main/nlp-pytorch/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Text classification Task

In [13]:
# text classification task based on AG_NEWS dataset
# classify news headlines into 4 categories(world, sports, business, sci/tech)
# dataset is built from torchtext module

import torch
import torchtext
import os
import collections

os.makedirs('./data', exist_ok=True)
#train_dataset, test_dataset contain iterators that return pairs of label(num of classes) and text
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(
    root='./data'
)
classes = ['World','Sports','Business','Sci/Tech']


In [14]:
next(train_dataset)


(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [15]:
for i, x in zip(range(5), train_dataset): # iterator 이므로 zip으로 호출
  print(f"**{classes[x[0]]}** -> {x[1]}\n")

**Sci/Tech** -> Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.

**Sci/Tech** -> Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.

**Sci/Tech** -> Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.

**Sci/Tech** -> Oil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace

In [16]:
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
train_dataset = list(train_dataset)
test_dataset = list(test_dataset)

### Tokenization and Vectorization

In [17]:
#convert text into numbers
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [18]:
first_sentence = train_dataset[0][1]
second_sentence = train_dataset[1][1]

f_tokens = tokenizer(first_sentence)
s_tokens = tokenizer(second_sentence)

print(f'\nfirst token list:\n{f_tokens}')
print(f'\nsecond token list:\n{s_tokens}')


first token list:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']

second token list:
['carlyle', 'looks', 'toward', 'commercial', 'aerospace', '(', 'reuters', ')', 'reuters', '-', 'private', 'investment', 'firm', 'carlyle', 'group', ',', '\\which', 'has', 'a', 'reputation', 'for', 'making', 'well-timed', 'and', 'occasionally\\controversial', 'plays', 'in', 'the', 'defense', 'industry', ',', 'has', 'quietly', 'placed\\its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market', '.']


In [19]:
#build a vocabulary of tokens with "Counter" object, then create a "Vocab" object 
# counter은 빈도수세주는 클래스(사전 사용)
counter = collections.Counter()
for (label, line) in train_dataset:
  counter.update(tokenizer(line))

#dictionary counter 을 torchtext.vocab사용하면 사전을 build할수있다
vocab = torchtext.vocab.Vocab(counter, min_freq=1)

In [20]:
vocab

<torchtext.vocab.Vocab at 0x7f3894b1e490>

In [21]:
word_lookup = [list((vocab[w], w)) for w in f_tokens]
print(f'\nIndex lockup in 1st sentence:\n{word_lookup}')

word_lookup = [list((vocab[w], w)) for w in s_tokens]
print(f'\nIndex lockup in 2nd sentence:\n{word_lookup}')


Index lockup in 1st sentence:
[[432, 'wall'], [426, 'st'], [2, '.'], [1606, 'bears'], [14839, 'claw'], [114, 'back'], [67, 'into'], [3, 'the'], [849, 'black'], [14, '('], [28, 'reuters'], [15, ')'], [28, 'reuters'], [16, '-'], [50726, 'short-sellers'], [4, ','], [432, 'wall'], [375, 'street'], [17, "'"], [10, 's'], [67508, 'dwindling\\band'], [7, 'of'], [52259, 'ultra-cynics'], [4, ','], [43, 'are'], [4010, 'seeing'], [784, 'green'], [326, 'again'], [2, '.']]

Index lockup in 2nd sentence:
[[15875, 'carlyle'], [1073, 'looks'], [855, 'toward'], [1311, 'commercial'], [4251, 'aerospace'], [14, '('], [28, 'reuters'], [15, ')'], [28, 'reuters'], [16, '-'], [930, 'private'], [798, 'investment'], [321, 'firm'], [15875, 'carlyle'], [99, 'group'], [4, ','], [27658, '\\which'], [29, 'has'], [6, 'a'], [4460, 'reputation'], [12, 'for'], [565, 'making'], [52791, 'well-timed'], [9, 'and'], [80618, 'occasionally\\controversial'], [2126, 'plays'], [8, 'in'], [3, 'the'], [526, 'defense'], [242, 'indus

In [23]:
#encode tokenized string into a set of a numbers
vocab_size = len(vocab)
print(f"vocab size of {vocab_size}")

#torch.vocab.vocab 으로 만들어진 자료구조에서 .stoi 를 사용하면 string 을 integer로 바꿔줌
def encode(x):
  return [vocab.stoi[s] for s in tokenizer(x)]

vec = encode(first_sentence)
print(vec)

vocab size of 95812
[432, 426, 2, 1606, 14839, 114, 67, 3, 849, 14, 28, 15, 28, 16, 50726, 4, 432, 375, 17, 10, 67508, 7, 52259, 4, 43, 4010, 784, 326, 2]


In [25]:
# decode
def decode(x):
  return [vocab.itos[i] for i in x]

#위에서 encode한 것을 다시 decode
decode(vec)

['wall',
 'st',
 '.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(',
 'reuters',
 ')',
 'reuters',
 '-',
 'short-sellers',
 ',',
 'wall',
 'street',
 "'",
 's',
 'dwindling\\band',
 'of',
 'ultra-cynics',
 ',',
 'are',
 'seeing',
 'green',
 'again',
 '.']

### Bigrams, Trigrams, and N-Grams

In [26]:
# some words are part of multi word expressions (ex. "Hot dog" is different with hot and dog, so have to make independent vector)
# therefore when bi-word, tri-word is useful feature for training classifier, "N-gram representaions" are used in document classification

# in "bi-gram representation", will add all "word pairs" to the vocabulary, in addition to original words (기존의 단어장에 2개씩 묶어서 더 넣는다)
# to get n-gram representation, we can use ngrams_iterator function

from torchtext.data.utils import ngrams_iterator

# 마찬가지로 횟수를 세기에 counter 클래스를 사용해준다
bi_counter = collections.Counter()
for (label, line) in train_dataset:
  bi_counter.update(ngrams_iterator(tokenizer(line), ngrams=2))
bi_vocab = torchtext.vocab.Vocab(bi_counter, min_freq=2)

print(f"Bigram vocab size = {len(bi_vocab)}")


Bigram vocab size = 481971


In [28]:
def encode(x):
  return [bi_vocab.stoi[s] for s in tokenizer(x)]

vec = encode(first_sentence)

print(vec)

[572, 564, 2, 2326, 49106, 150, 88, 3, 1143, 14, 32, 15, 32, 16, 443749, 4, 572, 499, 17, 10, 0, 7, 468770, 4, 52, 7019, 1050, 442, 2]


In [29]:
def decode(x):
  return [bi_vocab.itos[s] for s in x]

decode(vec)

['wall',
 'st',
 '.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(',
 'reuters',
 ')',
 'reuters',
 '-',
 'short-sellers',
 ',',
 'wall',
 'street',
 "'",
 's',
 '<unk>',
 'of',
 'ultra-cynics',
 ',',
 'are',
 'seeing',
 'green',
 'again',
 '.']