<a href="https://colab.research.google.com/github/jhlbxx/-/blob/master/Predicting_the_sentiment_of_IMDB_movie_records.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3>=1.25
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.7.0 torchdata-0.5.1 urllib3-1.26.14


In [1]:
import torch
from torchtext.datasets import IMDB
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

In [2]:
## Step 1: create the datasets
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset),[20000,5000])

In [18]:
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

def tokenizer(text):
  text = re.sub('<[^>]*>', '',text)
  emoticons = re.findall('(?::\;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-','')
  tokenized = text.split()
  return tokenized

In [19]:
token_counts = Counter()
for label, line in train_dataset:
  tokens = tokenizer(line)
  token_counts.update(tokens)
print('Vocab-size:', len(token_counts))

Vocab-size: 75953


In [20]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>",0)
vocab.insert_token("<unk>",1)
vocab.set_default_index(1)

In [23]:
print([vocab[token] for token in ['this','is','an','example']])

[11, 7, 35, 462]


In [25]:
## Step 3-A: define the functions for transformation
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x:1. if x=='pos' else 0.

In [27]:
import torch.nn as nn

In [28]:
## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
  label_list, text_list, lengths = [],[],[]
  for _label, _text in batch:
    label_list.append(label_pipeline(_label))
    processed_text = torch.tensor(text_pipeline(_text),dtype=torch.int64)
    text_list.append(processed_text)
    lengths.append(processed_text.size(0))
  label_list = torch.tensor(label_list)
  lengths = torch.tensor(lengths)
  padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
  return padded_text_list, label_list, lengths