In [1]:
!/usr/bin/gcsfuse spoken-squad-data data

Using mount point: /home/jupyter/data
2021/03/14 01:36:54.707940 Opening GCS connection...
2021/03/14 01:36:54.905357 Mounting file system...
2021/03/14 01:36:54.934078 File system has been successfully mounted.


In [2]:
import os
DATA_PATH = 'data'
if not os.path.exists(DATA_PATH):
  %mkdir $DATA_PATH


MODEL_PATH = 'trained_models'
if not os.path.exists(MODEL_PATH):
  %mkdir $MODEL_PATH

if not os.path.exists('utils.py'):
  !wget -q http://web.stanford.edu/class/cs224s/download/utils.py

!pip3 -q install pytorch_lightning
!pip3 install wandb -qqq
#!pip3 install pytorch-pretrained-bert pytorch-nlp pytorch_transformers
#!pip3 install pytorch_transformers
!pip3 install transformers==4.3.3
!pip3 install torchaudio==0.8.0



In [1]:
import wandb

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import numpy as np

import transformers
from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model
from transformers import RobertaTokenizerFast, RobertaModel, RobertaForQuestionAnswering

from squad_dataset import *

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
train_contexts, train_questions, train_answers = read_squad('data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('data/dev-v2.0.json')

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


In [6]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

In [3]:
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

for param in model.roberta.parameters():
    param.requires_grad = False   

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [None]:
for t, batch in enumerate(val_loader):
    print(batch)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    loss, outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    
    loss = outputs.loss
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
   

{'input_ids': tensor([[    0, 37257,   501,  ...,     1,     1,     1],
        [    0,  3762,  6680,  ...,     1,     1,     1],
        [    0, 23754,  4484,  ...,     1,     1,     1],
        ...,
        [    0,   713, 11577,  ...,     1,     1,     1],
        [    0,  4993,     5,  ...,     1,     1,     1],
        [    0,   673, 32027,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'start_positions': tensor([ 55, 170,  60,  20,  58,  55,  49,  16,   6,  41, 253,  65,  32,  63,
         22,  12]), 'end_positions': tensor([ 56, 176,  61,  22,  61,  57,  51,  18,   7,  42, 259,  70,  36,  73,
         27,  24])}


In [18]:
# Do not modify.

class LightningTextBERTQA(pl.LightningModule):
    """PyTorch Lightning class for training a BERT-QA model."""
    def __init__(self,learning_rate=1e-5, batch_size=16, weight_decay=1e-5):

        super().__init__()
        self.save_hyperparameters()
        self.lr = learning_rate
        self.batch_size = batch_size
        self.weight_decay = weight_decay   
        self.train_dataset, self.val_dataset= \
          self.create_datasets()

        self.model = self.create_model()
        self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
        self.output=[]

    def create_model(self):
        model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
        for param in model.roberta.parameters():
            param.requires_grad = False   
            
        return model

    def create_datasets(self):
        
        train_contexts, train_questions, train_answers = read_squad('data/train-v2.0.json')
        val_contexts, val_questions, val_answers = read_squad('data/dev-v2.0.json')

        add_end_idx(train_answers, train_contexts)
        add_end_idx(val_answers, val_contexts)

        
        train_encodings = self.tokenizer(train_contexts, train_questions, truncation=True, padding=True)
        val_encodings = self.tokenizer(val_contexts, val_questions, truncation=True, padding=True)


        add_token_positions(train_encodings, train_answers)
        add_token_positions(val_encodings, val_answers)

        train_dataset = SquadDataset(train_encodings)
        val_dataset = SquadDataset(val_encodings)

        return train_dataset, val_dataset

    def configure_optimizers(self):
        optim = torch.optim.AdamW(self.model.parameters(),
                                  lr=self.lr, weight_decay=self.weight_decay)
        return [optim], [] # <-- put scheduler in here if you want to use one


    def forward(self, input_ids, attention_masks, start_positions, end_positions):
        self.output = self.model(input_ids, attention_masks=attention_masks, start_positions=start_positions, end_positions=end_positions)
        return self.output.loss, self.output.start_logits, self.output.end_logits, self.output.hidden_states[0]

    def get_primary_task_loss(self, batch):
        """Returns ASR model losses, metrics, and embeddings for a batch."""
        input_ids, attention_masks = batch[0], batch[1]
        start_positions, end_positions = batch[2], batch[3]

        
        loss, start_logits,end_logits,_  = self.forward(
              input_ids, attention_masks, start_positions, end_positions)

        return loss, start_logits,end_logits,embedding

      # Overwrite TRAIN
    def training_step(self, batch):
        loss,_,_,_ = self.get_primary_task_loss(batch)
        self.log('train_loss', loss, prog_bar=True, on_step=True)
        return loss

      # Overwrite VALIDATION: get next minibatch
    def validation_step(self, batch):
        loss,_,_,_ = self.get_primary_task_loss(batch)
        return metrics

    
    def train_dataloader(self):
        # - important to shuffle to not overfit!
        # - drop the last batch to preserve consistent batch sizes
        loader = DataLoader(self.train_dataset, batch_size=self.batch_size,
                            shuffle=True, pin_memory=True, drop_last=True)
        return loader

    def val_dataloader(self):
        loader = DataLoader(self.val_dataset, batch_size=self.batch_size,
                            shuffle=False, pin_memory=True)
        return loader



In [16]:
WANDB_NAME = 'jefe-jeff' # Fill in your Weights & Biases ID here.

def run(system, config, ckpt_dir, epochs=1, monitor_key='val_loss', 
        use_gpu=False, seed=1337):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    SystemClass = globals()[system]
    system = SystemClass(**config)

    checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(MODEL_PATH, ckpt_dir),
    save_top_k=1,
    verbose=True,
    monitor=monitor_key, 
    mode='min')

    wandb.init(project='cs224s', entity=WANDB_NAME, name=ckpt_dir, 
         config=config, sync_tensorboard=True)
    wandb_logger = WandbLogger()

    if use_gpu:
        trainer = pl.Trainer(
        gpus=1, max_epochs=epochs, min_epochs=epochs,
        checkpoint_callback=checkpoint_callback, logger=wandb_logger)
    else:
        trainer = pl.Trainer(
        max_epochs=epochs, min_epochs=epochs,
        checkpoint_callback=checkpoint_callback, logger=wandb_logger)

    trainer.fit(system)
    result = trainer.test()


['.ipynb_checkpoints',
 '0_0.flac',
 '0_0_0.flac',
 '0_0_1.flac',
 '0_0_2.flac',
 '0_0_3.flac',
 '0_0_4.flac',
 '0_0_5.flac',
 '0_0_6.flac',
 '0_0_7.flac',
 '0_0_8.flac',
 '0_1.flac',
 '0_10.flac',
 '0_10_0.flac',
 '0_10_1.flac',
 '0_10_2.flac',
 '0_10_3.flac',
 '0_12.flac',
 '0_12_0.flac',
 '0_12_1.flac',
 '0_12_2.flac',
 '0_12_3.flac',
 '0_12_4.flac',
 '0_12_5.flac',
 '0_13.flac',
 '0_13_0.flac',
 '0_13_1.flac',
 '0_13_2.flac',
 '0_13_3.flac',
 '0_13_4.flac',
 '0_13_5.flac',
 '0_14.flac',
 '0_14_0.flac',
 '0_14_1.flac',
 '0_14_2.flac',
 '0_14_3.flac',
 '0_14_4.flac',
 '0_14_5.flac',
 '0_14_6.flac',
 '0_15.flac',
 '0_15_0.flac',
 '0_15_1.flac',
 '0_15_2.flac',
 '0_15_3.flac',
 '0_15_4.flac',
 '0_15_5.flac',
 '0_16.flac',
 '0_16_0.flac',
 '0_16_1.flac',
 '0_16_2.flac',
 '0_16_3.flac',
 '0_16_4.flac',
 '0_16_5.flac',
 '0_16_6.flac',
 '0_16_7.flac',
 '0_17.flac',
 '0_17_0.flac',
 '0_17_1.flac',
 '0_17_2.flac',
 '0_17_3.flac',
 '0_18.flac',
 '0_18_0.flac',
 '0_18_1.flac',
 '0_18_2.flac',


In [20]:
config = {
    'learning_rate': 1e-5, 
    'batch_size': 16, 
    'weight_decay': 0, 
}

run(system="LightningTextBERTQA", config=config, ckpt_dir='textbert_0gen', epochs=5, 
    use_gpu=True)