In [1]:
# Query the GPU attached to this runtime; the command reports a driver error when none is reachable.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 4.7MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 28.4MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 44.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

In [3]:
# Mount Google Drive under /content/gdrive; this prompts for an authorization code.
# Later cells use the relative path 'gdrive/My Drive/...', so they presumably
# expect the working directory to be /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


**Prediction**

In [4]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

import re
import numpy as np

# Seed NumPy, PyTorch, and every CUDA device's generator with the same fixed value.
np.random.seed(224)
torch.manual_seed(224)
torch.cuda.manual_seed_all(224)

In [5]:
def clean_text(tokenizer, text):
    """Normalise a raw review string before it is tokenized.

    Steps, in order: strip quote characters, turn ellipses and newlines into
    spaces, delete URLs, delete emoticons and a fixed set of punctuation
    characters, collapse repeated ``!`` / ``?``, drop a ``.`` or ``,`` that
    follows whitespace, squeeze runs of three or more identical letters to a
    single letter, collapse whitespace, strip, and lowercase.

    Args:
        tokenizer: Unused. Kept in the signature so existing callers that pass
            a tokenizer keep working.
        text (str): Raw input text.

    Returns:
        str: The cleaned, lower-cased text.
    """
    # NOTE(review): if the classifier was trained on text cleaned by a
    # different routine, predictions may shift -- confirm both stay in sync.

    # Quotes: '' and `` pairs become a double quote plus a space, stray
    # backticks become apostrophes, then every double quote is dropped.
    text = text.replace("''", '" ').replace("``", '" ')
    text = text.replace("`", "'")
    text = text.replace("\"", "")

    # Ellipses (in several spellings) and newlines become a single space.
    text = text.replace("...", " ").replace(". . .", " ").replace('..', ' ')
    text = text.replace("\n", " ")

    # URLs. `\S*` runs up to the next whitespace OR the end of the string, so
    # a link that is the very last thing in the text is removed too (a
    # whitespace look-ahead would leave it in place to be mangled below).
    text = re.sub(r'https?:\S*', '', text)
    # Require a word boundary and a literal dot after "www" so ordinary words
    # that merely contain the letters (e.g. "awww") are left alone.
    text = re.sub(r'\bwww\.\S*', '', text)

    # Emoticons come first so that ":P" loses its letter as well; afterwards
    # the individual punctuation characters are deleted.
    list_to_replace = [':(', '=)', ':)', ':P', '-', ',,', ':', ';', '/', '+', '~', '_', '*', '(', ')', '&', '=', '@']
    for elem in list_to_replace:
        text = text.replace(elem, '')

    # Collapse "!!!" -> "!" and "???" -> "?", then treat "?!" / "!?" as "?".
    text = re.sub(r'\!{2,}', '!', text)
    text = re.sub(r'\?{2,}', '?', text)
    text = text.replace('?!', '?').replace('!?', '?')

    # Deletes one whitespace character TOGETHER WITH the '.' or ',' after it.
    # NOTE(review): the punctuation mark itself is removed too -- confirm
    # that this (rather than only dropping the space) is the intent.
    text = re.sub(r'\s[.,]', '', text)

    # Three or more identical letters (or ?/!) in a row shrink to one,
    # e.g. "Veryyyyy" -> "Very".
    text = re.sub(r'([a-zA-Z?!])\1\1+', r'\1', text)

    # Final whitespace normalisation and lower-casing.
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.strip()
    text = text.lower()

    return text

In [6]:
def pad_sent(tokenizer, raw_text, max_text_len = 350):  #token number 0 is [PAD]
    """Turn cleaned text into fixed-length id, attention-mask and segment lists.

    The text is prefixed with "[CLS] ", tokenized, converted to ids, cut to
    ``max_text_len - 1`` entries, terminated with id 102 ([SEP]) and
    right-padded with id 0 ([PAD]) up to ``max_text_len``.

    Args:
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        raw_text (str): Text to encode.
        max_text_len (int): Length of each returned list.

    Returns:
        tuple: ``(token_ids, attention_mask, segment_ids)``. The mask is 1
        over real tokens and 0 over padding; the segment ids are all 0.
    """
    # Prepend the classification token, then tokenize and look up the ids.
    tokens = tokenizer.tokenize("[CLS] " + raw_text)
    ids = tokenizer.convert_tokens_to_ids(tokens)

    # Leave one slot free for the closing [SEP] (id 102).
    ids = ids[:max_text_len - 1] + [102]

    n_real = len(ids)
    n_pad = max_text_len - n_real  # how many [PAD] entries are still needed

    token_ids = ids + [0] * n_pad
    attention_mask = [1] * n_real + [0] * n_pad
    segment_ids = [0] * max_text_len

    return token_ids, attention_mask, segment_ids

In [7]:
def get_available_devices():
    """Pick the main compute device and list every visible GPU.

    When CUDA is available the first GPU is also made the current CUDA device.

    Returns:
        device (torch.device): Main device (GPU 0 or CPU).
        gpu_ids (list): IDs of all available GPUs; empty on a CPU-only host.
    """
    # CPU-only host: nothing to enumerate.
    if not torch.cuda.is_available():
        return torch.device('cpu'), []

    gpu_ids = list(range(torch.cuda.device_count()))
    device = torch.device(f'cuda:{gpu_ids[0]}')
    torch.cuda.set_device(device)
    return device, gpu_ids

In [8]:
# #### USED ONLY IF WE LOAD .pth file from save_state_dict()

# def load_model(model, checkpoint_path, gpu_ids, return_step=True):
#     """Load model parameters from disk.

#     Args:
#         model (torch.nn.DataParallel): Load parameters into this model.
#         checkpoint_path (str): Path to checkpoint to load.
#         gpu_ids (list): GPU IDs for DataParallel.
#         return_step (bool): Also return the step at which checkpoint was saved.

#     Returns:
#         model (torch.nn.DataParallel): Model loaded from checkpoint.
#         step (int): Step at which checkpoint was saved. Only if `return_step`.
#     """
#     device = f"cuda:{gpu_ids[0]}" if gpu_ids else 'cpu'  # gpu_ids holds ints, so format rather than concatenate
#     ckpt_dict = torch.load(checkpoint_path, map_location=device)

#     # Build model, load parameters
#     model.load_state_dict(ckpt_dict['model_state'])

#     if return_step:
#         step = ckpt_dict['step']
#         return model, step

#     return model

In [9]:
# ### USED ONLY IF WE LOAD .pth file from save_state_dict()

# class BertFineTune(nn.Module):
#     def __init__(self):
#         super(BertFineTune, self).__init__()
#         self.embed_model = BertModel.from_pretrained('gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased')
        
#         # initial Feed Forward Network with same size and tanh activation, and initial dropout
#         self.dense = nn.Linear(768, 768)
#         self.initial_activation = nn.Tanh()
#         self.dropout = nn.Dropout(p=0.1)
        
#         # stack more layer for better fitting and dropout
#         self.fc_1 = nn.Linear(768, 256)
#         self.tanh1 = nn.Tanh()
#         self.dropout2 = nn.Dropout(p=0.1)

#         # stack another layer
#         self.fc_2 = nn.Linear(256, 128)
#         self.tanh2 = nn.Tanh()

#         # does classification
#         self.classifier = nn.Linear(128, 5)

#         # layers initialization
#         nn.init.xavier_uniform_(self.dense.weight)  # layer initialization
#         nn.init.xavier_uniform_(self.fc_1.weight)  # layer initialization
#         nn.init.xavier_uniform_(self.fc_2.weight)  # layer initialization
#         nn.init.xavier_uniform_(self.classifier.weight)  # layer initialization

#     def forward(self, x, seg_id_tensor, attnmask_tensor):
#         # use bert to output embeddings
#         # take the BERT embedding
#         # bert_output[0] = last_layer_embedding  (batch_size, seq_len, hidden_size)
#         # bert_output[1] = pooler_output, [CLS] embedding further preprocessed by linear and Tanh layers
#         # bert_output[2] = tuple of length 13 (one for output of embedding layer and 12 output for each layer in the transformer) 
#         bert_output = self.embed_model(x, token_type_ids=seg_id_tensor, attention_mask=attnmask_tensor)
        
#         # sequence_output size = batch_size x sequence_length x hidden_size
#         sequence_output = bert_output[0]

#         # get embedding of [CLS] token
#         cls_embed = sequence_output[:, 0, :] # take CLS embedding
        
#         # post process the embedding layer by applying dense layer and tanh activation
#         pooled_output = self.dense(cls_embed) # take linear layer
#         pooled_output = self.initial_activation(pooled_output) # take activation

#         # perform dropout and stack linear layers and tanh afterwards 
#         after_dropout1 = self.dropout(pooled_output)
        
#         linear1 = self.fc_1(after_dropout1)
#         tanh1 = self.tanh1(linear1)
#         after_dropout2 = self.dropout2(tanh1)
#         linear2 = self.fc_2(after_dropout2)
#         tanh2 = self.tanh2(linear2)

#         # do classification
#         logits = self.classifier(tanh2)

#         return logits

In [20]:
def setup_model(model_path="gdrive/My Drive/Yelp_Sentiment_Analysis/best-model-180000.pth"):
    """Create the tokenizer and load the TorchScript classifier for inference.

    Args:
        model_path (str): Path of the TorchScript archive to load. The default
            is a relative path, so it is resolved against the current working
            directory.

    Returns:
        tuple: ``(tokenizer, model, device)`` -- the ``bert-base-uncased``
        tokenizer, the loaded model switched to eval mode, and the device the
        model was placed on.
    """
    # create tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # tokenizer = BertTokenizer.from_pretrained('gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased')

    # Only the main device is needed here; the GPU id list is not used.
    device, _ = get_available_devices()

    # load from final model
    # map_location remaps the stored tensors onto `device` while loading, so
    # an archive saved from a GPU session can still be opened on a CPU-only
    # runtime; it also makes a separate `.to(device)` unnecessary.
    model = torch.jit.load(model_path, map_location=device)

    # load from state_dict
    #model = load_model(BertFineTune(), 'gdrive/My Drive/Yelp_Sentiment_Analysis/save/train/baseline-01/step_180000.pth.tar', gpu_ids, return_step=False)

    model.eval()

    return tokenizer, model, device

In [21]:
def do_prediction(device, tokenizer, test_string, classifier=None):
    """Predict, print and return the star rating for one review.

    Args:
        device (torch.device): Device on which the input tensors are created.
        tokenizer: Tokenizer forwarded to ``clean_text`` and ``pad_sent``.
        test_string (str): Raw review text.
        classifier (callable, optional): Model invoked as
            ``classifier(token_ids, segment_ids, attention_mask)``. When
            omitted, the notebook-level ``model`` variable is used.

    Returns:
        int: Predicted rating, i.e. the arg-max class index plus one.
    """
    # Prefer an explicitly supplied model; otherwise fall back to the global
    # `model` so existing three-argument calls behave as before.
    net = model if classifier is None else classifier

    # test string + preprocess
    cleaned = clean_text(tokenizer, test_string)
    tokenized_ids, curr_attn, curr_seg_id = pad_sent(tokenizer, cleaned)

    with torch.no_grad():
        # Wrapping each list in an outer list gives the (1, seq_len) batch
        # shape directly, created on the target device.
        text = torch.tensor([tokenized_ids], device=device)
        attnmask = torch.tensor([curr_attn], device=device)
        seg_id = torch.tensor([curr_seg_id], device=device)

        # Forward -- note the argument order: ids, segment ids, attention mask.
        logits = net(text, seg_id, attnmask)

        # Class index with the highest score; assumes logits is
        # (1, num_classes) -- TODO confirm against the exported model.
        ypred = torch.argmax(logits, dim = 1)

    # Class indices start at 0, ratings at 1.
    rating = ypred.item() + 1
    print('rating is: ', rating, '*')

    return rating

In [22]:
import time

# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
tokenizer, model, device = setup_model()
end = time.perf_counter()

In [23]:
# Seconds taken by the setup_model() call in the previous cell.
end - start

1.0666265487670898

In [24]:
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
test_string = 'I Love the food here its really good'
do_prediction(device, tokenizer, test_string)
end = time.perf_counter()

rating is:  5 *


In [25]:
# Seconds taken by the single do_prediction() call in the previous cell.
end - start

1.7071433067321777

**Saving Model As JIT**

In [19]:
# # test string + preprocess
# test_string = 'I hate this good'
# test_string = clean_text(tokenizer, test_string)
# tokenized_ids, curr_attn, curr_seg_id = pad_sent(tokenizer, test_string)

# text = torch.tensor(tokenized_ids).to(device)
# attnmask = torch.tensor(curr_attn).to(device)
# seg_id = torch.tensor(curr_seg_id).to(device)

# text = torch.reshape(text, (1, -1))
# attnmask = torch.reshape(attnmask, (1, -1))
# seg_id = torch.reshape(seg_id, (1,-1))

In [20]:
# torch.jit.save(torch.jit.trace(model, (text, seg_id, attnmask)), "gdrive/My Drive/Yelp_Sentiment_Analysis/best-model-180000.pth")

In [21]:
# loaded_model = torch.jit.load("gdrive/My Drive/Yelp_Sentiment_Analysis/best-model-180000.pth")

**Saving Base Model**

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer.save_pretrained('gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased')

('gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased/vocab.txt',
 'gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased/special_tokens_map.json',
 'gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased/added_tokens.json')

In [None]:
# model = BertModel.from_pretrained('bert-base-uncased')
# model.save_pretrained('gdrive/My Drive/Yelp_Sentiment_Analysis/bert-base-uncased')