In [0]:
!pip install pytorch_pretrained_bert
!pip install transformers 

import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
#from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification, BertForNextSentencePrediction
from transformers import BertModel, BertTokenizer, BertPreTrainedModel, BertForMaskedLM
from tqdm import tqdm, trange
import io
import os
import numpy as np
import os
from glob import glob
import json
import subprocess
import pathlib 
from google.cloud import storage
import uuid
from google.colab import auth

auth.authenticate_user()


!gcloud config set project 'experiments-260319'

DATA_BUCKET = 'gs://cs229-project-data'
WORK_DIR = '/content'
pathlib.Path('/content/data/sts-b').mkdir(parents=True, exist_ok=True)
pathlib.Path('/content/output/sts-b').mkdir(parents=True, exist_ok=True)

DATA_DIR = os.path.join(WORK_DIR, 'data/sts-b/cs229-project-data')
OUTPUT_DIR = os.path.join(WORK_DIR, 'output')
EXEC_PATH = os.path.join(WORK_DIR, 'src/bert_finetune_with_pytorch.py')

#dirname = os.path.dirname(os.path.dirname(__file__))

#model_save_path = 'gs://gridspace-tts-data'
model_save_path = os.path.join(OUTPUT_DIR, 'sts-b')
train_data_path = os.path.join(DATA_DIR, 'sts-train.csv')
test_data_path = os.path.join(DATA_DIR, 'sts-test.csv')

!gsutil cp -r {DATA_BUCKET} {DATA_DIR}

MAX_LEN = 128
batch_size = 32

learning_rate = 3e-5
epochs = 3



Updated property [core/project].
Copying gs://cs229-project-data/LICENSE.txt...
Copying gs://cs229-project-data/correlation.pl...
Copying gs://cs229-project-data/readme.txt...
Copying gs://cs229-project-data/sts-dev.csv...
- [4 files][264.1 KiB/264.1 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://cs229-project-data/sts-test.csv...
Copying gs://cs229-project-data/sts-train.csv...
\ [6 files][  1.4 MiB/  1.4 MiB]                                                
Operation completed over 6 objects/1.4 MiB.                                      


In [0]:
print(os.getcwd())

/content


# New Section

In [0]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


class BertSimilarity(BertPreTrainedModel):
    def __init__(self, config):
        super(BertSimilarity, self).__init__(config)
        self.bert = BertModel(config)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        linear_output = self.linear(pooled_output)
        output = self.sigmoid(linear_output)

        return output


In [0]:
def main():

    
    torch.cuda.empty_cache()

    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
      raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(device_name))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    #model.cuda()
    scores_train =[]
    first_sent_train = []
    second_sent_train = []

    scores_test =[]
    first_sent_test = []
    second_sent_test = []

    sent_pairs = []

    with open(train_data_path, encoding='utf-8') as fin:
        train_data = fin.read().split('\n')
    train_data = [line for line in train_data if line.strip()]
    for line in train_data:
        pair = []
        line1 = line.split('\t')
        if float(line1[4]) <= 4.0:
            scores_train.append(0)
        else:
            scores_train.append(1)
        first_sent_train.append(line1[5])
        second_sent_train.append(line1[6])
        pair.append(str(line1[5]))
        pair.append(str(line1[6]))
        sent_pairs.append(pair)


    with open(test_data_path, encoding='utf-8') as fin:
        test_data = fin.read().split('\n')
    test_data = [line for line in test_data if line.strip()]
    for line in test_data:
        line1 = line.split('\t')
        if float(line1[4]) <= 4.0:
            scores_test.append(0)
        else:
            scores_test.append(1)
        first_sent_test.append(line1[5])
        second_sent_test.append(line1[6])

    pairs_train = []
    pairs_test = []
    segment_ids_train = []
    segment_ids_test = []
    tokenized_pairs_train = []
    tokenized_pairs_test = []

    for sent1, sent2 in zip(first_sent_train, second_sent_train):
        token1 = tokenizer.tokenize(sent1)
        token2 = tokenizer.tokenize(sent2)
        pair_tokens = []
        pair_segment_ids = []
        pair_tokens.append("[CLS] ")
        pair_segment_ids.append(0)
        for t in token1:
            pair_tokens.append(t)
            pair_segment_ids.append(0)
        pair_tokens.append('[SEP]')
        for t in token2:
            pair_tokens.append(t)
            pair_segment_ids.append(1)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(1)
        tokenized_pairs_train.append(pair_tokens)
        segment_ids_train.append(pair_segment_ids)

    for sent1, sent2 in zip(first_sent_test, second_sent_test):
        token1 = tokenizer.tokenize(sent1)
        token2 = tokenizer.tokenize(sent2)
        pair_tokens = []
        pair_segment_ids = []
        pair_tokens.append("[CLS] ")
        pair_segment_ids.append(0)
        for t in token1:
            pair_tokens.append(t)
            pair_segment_ids.append(0)
        pair_tokens.append('[SEP]')
        for t in token2:
            pair_tokens.append(t)
            pair_segment_ids.append(1)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(1)
        tokenized_pairs_test.append(pair_tokens)
        segment_ids_test.append(pair_segment_ids)

    print("the first tokenized pair:")
    print(tokenized_pairs_train[0])
    print("the first segment ids:")
    print(segment_ids_train[0])

    input_ids_train = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_train]
    input_ids_train = pad_sequences(input_ids_train, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    input_ids_test = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_test]
    input_ids_test = pad_sequences(input_ids_test, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    segment_ids_train = pad_sequences(segment_ids_train, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    segment_ids_test = pad_sequences(segment_ids_test, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    #encoded = [tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs]
    #input_ids2 = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs]).unsqueeze(0)

    attention_masks_train = []
    attention_masks_test = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_train:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks_train.append(seq_mask)
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks_test.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model

    train_inputs = torch.tensor(input_ids_train).to(torch.int64)
    validation_inputs = torch.tensor(input_ids_test).to(torch.int64)
    train_labels = torch.tensor(scores_train).float()
    print(train_labels[:100])
    validation_labels = torch.tensor(scores_test).float()
    train_masks = torch.tensor(attention_masks_train).to(torch.int64)
    validation_masks = torch.tensor(attention_masks_test).to(torch.int64)
    segment_ids_train = torch.tensor(segment_ids_train).to(torch.int64)
    segment_ids_test = torch.tensor(segment_ids_test).to(torch.int64)

    # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
    # with an iterator the entire dataset does not need to be loaded into memory

    train_data = TensorDataset(train_inputs, train_masks, train_labels, segment_ids_train)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, segment_ids_test)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    #BertPreTrainedModel = BertModel.from_pretrained('bert-base-uncased')


    model = BertSimilarity.from_pretrained('bert-base-uncased')
    model = model.cuda()

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
        ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=.1)



    # Store our loss and accuracy for plotting
    train_loss_set = []
    accuracy = {}

    # trange is a tqdm wrapper around the normal python range
    for _ in trange(epochs, desc="Epoch"):

        # Training

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_segment_ids = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            probs = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=b_segment_ids)
            loss_func = torch.nn.BCELoss()
            batch_loss = loss_func(probs, b_labels)

            train_loss_set.append(batch_loss)
            # Backward pass
            batch_loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()

            # Update tracking variables
            tr_loss += batch_loss
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        
        accuracy['train_loss'] = tr_loss / nb_tr_steps
       
        # Validation

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        train_loss, train_accuracy = 0, 0
        nb_train_steps, nb_train_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_segment_ids = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                sigmoid = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=b_segment_ids)

            # Move logits and labels to CPU
            sigmoid = sigmoid.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy = flat_accuracy(sigmoid, label_ids)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        
        for batch in train_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_segment_ids = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                sigmoid = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=b_segment_ids)

            # Move logits and labels to CPU
            sigmoid = sigmoid.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_train_accuracy = flat_accuracy(sigmoid, label_ids)

            train_accuracy += tmp_train_accuracy
            nb_train_steps += 1
        
        accuracy['valid_loss'] = eval_accuracy / nb_eval_steps
        
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        print("Train Accuracy: {}".format(train_accuracy / nb_train_steps))

    print("Saving to output folder")
    
    acc_filename = os.path.join(model_save_path, 'accuracy.pth')
    with open(acc_filename, 'wb') as f:
        torch.save(accuracy, f)
    f.close()
    
    train_loss_filename = os.path.join(model_save_path, 'trainloss.pth')
    with open(train_loss_filename, 'wb') as f:
        torch.save(train_loss_set, f)


    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    
    #storage_client = storage.Client()
    bucket_name = 'gs://cs229-models' + str(uuid.uuid1())
    #bucket = storage_client.get_bucket(bucket_name)
    !gsutil mb {bucket_name}
    !gsutil cp -r {model_save_path} {bucket_name}
    
    #cp_to_bucket_cmd = 'cp {} {}'.format(model_save_path, bucket_name)

    #subprocess.check_call(cp_to_bucket_cmd, shell=True)


In [0]:
main()

Found GPU at: /device:GPU:0
the first tokenized pair:
['[CLS] ', 'a', 'plane', 'is', 'taking', 'off', '.', '[SEP]', 'an', 'air', 'plane', 'is', 'taking', 'off', '.', '[SEP]']
the first segment ids:
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tensor([1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0.,
        0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
        0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


100%|██████████| 313/313 [00:00<00:00, 169614.62B/s]
100%|██████████| 440473133/440473133 [00:37<00:00, 11604730.86B/s]
t_total value of -1 results in schedule not being applied
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Train loss: 0.48385995626449585


Epoch:  33%|███▎      | 1/3 [03:30<07:00, 210.39s/it]

Validation Accuracy: 0.8359375
Train Accuracy: 0.8169973544973544
Train loss: 0.4790232479572296


Epoch:  67%|██████▋   | 2/3 [06:57<03:29, 209.33s/it]

Validation Accuracy: 0.8359375
Train Accuracy: 0.8171792328042328
Train loss: 0.45946332812309265


Epoch: 100%|██████████| 3/3 [10:23<00:00, 208.47s/it]

Validation Accuracy: 0.8359375
Train Accuracy: 0.8170882936507937
Saving to output folder





Creating gs://cs229-modelsaeb377e2-1be1-11ea-9888-0242ac1c0002/...
Copying file:///content/output/sts-b/pytorch_model.bin [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file:///content/output/sts-b/config.json [Content-Type=application/json]...
Copying file:///content/output/sts-b/trainloss.pth [Content-Type=applicatio