In [1]:
from datasets import load_dataset
import tokenizers
import torch
import transformers
from torch.utils.data import DataLoader
from typing import List, Dict, Tuple
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

imdb_dataset = load_dataset("stanfordnlp/imdb")
split = imdb_dataset['train'].train_test_split(train_size=0.8, seed=42)
imdb_train_set, imdb_valid_set = split['train'], split['test']
imdb_test_set = imdb_dataset['test']
train_reviews = [review['text'] for review in imdb_train_set]
type(imdb_train_set)



datasets.arrow_dataset.Dataset

In [3]:
bert_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
# bert_encoding = bert_tokenizer(train_reviews[:3], padding=True, truncation=True, max_lenth=500, return_tensors="pt")
"""
bert_tokenizer.get_vocab() #get vocab
[PAD]
[UNK]
[CLS]
[SEP]

"""

'\nbert_tokenizer.get_vocab() #get vocab\n[PAD]\n[UNK]\n[CLS]\n[SEP]\n\n'

In [4]:
bert_tokenizer(imdb_train_set[0:3]['text'], padding=True, max_len=200,return_tensors="pt")['input_ids'].shape

torch.Size([3, 234])

In [5]:
imdb_train_set[0]

{'text': 'Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet\'s direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it\'s nice to look at for what it is. The chemistry between Michael Caine and Christopher Reeve is quite brilliant. The dynamics of their relationship are surprising. Caine is fantastic as always, and Reeve gets one of his few chances to really act.<br /><br />I confess that I\'ve never seen Ira Levin\'s play, but I hear that Jay Presson Allen\'s adaptation is faithful. The script is incredibly convoluted, and keeps you guessing. "Deathtrap" is an enormously entertaining film, and is recommended for nearly all fans of stage and screen.<br /><br />7.4 out of 10',
 'label': 1}

In [6]:
def collate_fn(
                batch: List[Dict[str:str, str:int]],
                tokenizer=bert_tokenizer
                ) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Out: encodings: (B, max_len)
        labels: (B, 1)
    """

    reviews = [review['text'] for review in batch]
    labels = [[review['label']] for review in batch]
    encodings = tokenizer(reviews, padding=True, truncation=True, max_len=200, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.float32)
    return encodings, labels

batch_size = 16
imdb_train_loader = DataLoader(imdb_train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
imdb_valid_loader = DataLoader(imdb_valid_set, batch_size=batch_size, collate_fn=collate_fn)
imdb_test_loader =  DataLoader(imdb_test_set, batch_size=batch_size, collate_fn=collate_fn)


In [7]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, vocab_size, n_layers=2, embed_dim=128, hidden_dim=64, pad_id=0, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.output = nn.Linear(2 * hidden_dim, 1)

    def forward(self, encodings):
        """
        encodings: (B, max_len)
        embeddings: (B, max_len, embed_dim)
        _outputs: (B, max_len, hidden_dim)
        hidden_state: (n_layers, B, hidden_dim) #(n_layers * 2 if bidirection, B, , hidden_dim)
        out (B, 1)
        """
        embeddings = self.embed(encodings['input_ids'])
        lengths = encodings["attention_mask"].sum(dim=1)
        packed = pack_padded_sequence(embeddings, lengths=lengths, batch_first=True, enforce_sorted=False)

        _outputs, hidden_state = self.gru(packed)
        "拿最后一个layer的输出 bidirection=False的时候"
        # output = self.output(hidden_state[-1,:,:])
        """
        拿最后两层 (n_layers * 2, B, hidden_dim) -> (2, B, hidden_dim) -> (B, 2 * hidden_dim)
        """
        n_dims = self.output.in_features
        top_states = hidden_state[-2:].permute(1, 0, 2).reshape(-1, n_dims)
        output = self.output(top_states)
        return output

model = SentimentAnalysisModel(vocab_size=len(bert_tokenizer.get_vocab()))

x, y = next(iter(imdb_train_loader))
model(x).shape

torch.Size([16, 1])

### Reusing Pretrained Embeddings and Language Models

In [8]:
bert_model = transformers.AutoModel.from_pretrained("bert-base-uncased")
type(bert_model.embeddings.word_embeddings)

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 499.35it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


torch.nn.modules.sparse.Embedding

In [9]:
class SentimentAnalysisModelPreEmbeds(nn.Module):
    def __init__(self, pretrained_embeddings: torch.nn.modules.sparse.Embedding, n_layers=2, hidden_dim=64, dropout=0.2):
        super().__init__()
        weights = pretrained_embeddings.weight.data
        self.embed = nn.Embedding.from_pretrained(weights, freeze=True)
        embed_dim = weights.shape[-1]
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.output = nn.Linear(2 * hidden_dim, 1)

    def forward(self, encodings):
        """
        encodings: (B, max_len)
        embeddings: (B, max_len, embed_dim)
        _outputs: (B, max_len, hidden_dim)
        hidden_state: (n_layers, B, hidden_dim) #(n_layers * 2 if bidirection, B, , hidden_dim)
        out (B, 1)
        """
        embeddings = self.embed(encodings['input_ids'])
        lengths = encodings["attention_mask"].sum(dim=1)
        packed = pack_padded_sequence(embeddings, lengths=lengths, batch_first=True, enforce_sorted=False)

        _outputs, hidden_state = self.gru(packed)
        "拿最后一个layer的输出 bidirection=False的时候"
        # output = self.output(hidden_state[-1,:,:])
        """
        拿最后两层 (n_layers * 2, B, hidden_dim) -> (2, B, hidden_dim) -> (B, 2 * hidden_dim)
        """
        n_dims = self.output.in_features
        top_states = hidden_state[-2:].permute(1, 0, 2).reshape(-1, n_dims)
        output = self.output(top_states)
        return output
model = SentimentAnalysisModelPreEmbeds(pretrained_embeddings=bert_model.embeddings.word_embeddings)

x, y = next(iter(imdb_train_loader))
model(x).shape

torch.Size([16, 1])

### Bert cls

In [10]:
bert_encoding = bert_tokenizer(train_reviews[:3], padding=True, max_len=200, truncation=True, return_tensors="pt")
bert_output = bert_model(**bert_encoding)
# bert_output

In [11]:
bert_output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [12]:
print(bert_output.last_hidden_state.shape)
bert_output.pooler_output.shape

torch.Size([3, 234, 768])


torch.Size([3, 768])

In [None]:
#show model config
# bert_model.config

device = torch.device("cuda")
class SentimentAnalysisModelBert(nn.Module):
    def __init__(self, n_layers=2, hidden_dim=64, dropout=0.2):
        super().__init__()
        self.bert = transformers.AutoModel.from_pretrained("bert-base-uncased")
        embed_dim = self.bert.config.hidden_size
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=False, dropout=dropout)
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, encodings):
        contextualized_embeddings = self.bert(**encodings).last_hidden_state
        lengths = encodings["attention_mask"].sum(dim=1)
        packed = pack_padded_sequence(contextualized_embeddings, lengths=lengths, batch_first=True, enforce_sorted=False)
        _outputs, hidden_state = self.gru(packed)
        output = self.output(hidden_state[-1,:,:])
        return output

model = SentimentAnalysisModelBert().to(device)

x, y = next(iter(imdb_train_loader))
model(x.to(device)).shape


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 510.18it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 96.31 MiB is free. Including non-PyTorch memory, this process has 3.48 GiB memory in use. Of the allocated memory 3.36 GiB is allocated by PyTorch, and 41.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)