In [None]:
# Install into the kernel's own environment (%pip rather than !pip) and pin the
# version this notebook was run with, so a re-run gets the same library.
%pip install -q transformers==3.5.0

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 17.7MB/s eta 0:00:01[K     |▌                               | 20kB 7.4MB/s eta 0:00:01[K     |▊                               | 30kB 8.2MB/s eta 0:00:01[K     |█                               | 40kB 8.9MB/s eta 0:00:01[K     |█▎                              | 51kB 7.8MB/s eta 0:00:01[K     |█▌                              | 61kB 9.0MB/s eta 0:00:01[K     |█▊                              | 71kB 9.1MB/s eta 0:00:01[K     |██                              | 81kB 9.4MB/s eta 0:00:01[K     |██▎                             | 92kB 8.8MB/s eta 0:00:01[K     |██▌                             | 102kB 9.4MB/s eta 0:00:01[K     |██▊                             | 112kB 9.4MB/s eta 0:00:01[K     |███                             | 122kB 9.4M

In [None]:
import os
from typing import Tuple, List
from functools import partial

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

import sqlite3

In [None]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
bert_model_name = 'bert-base-cased'

# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# The attention masks built further down compare token ids against the pad id,
# and the rest of the notebook relies on that id being zero.
assert tokenizer.pad_token_id == 0, "Padding value used in masks is set to zero, please change it everywhere"

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by one linear layer with independent sigmoid outputs.

    Each of the ``num_classes`` outputs is squashed with a sigmoid on its own,
    and the training loss is binary cross-entropy.

    NOTE: no attributes beyond ``bert`` and ``classifier`` are created in
    ``__init__``. Instances restored from a pickle do not re-run ``__init__``,
    so ``forward`` must only rely on these two attributes.
    """

    def __init__(self, bert: "BertModel", num_classes: int):
        """
        Args:
            bert: encoder module; must expose ``config.hidden_size`` and return
                a sequence whose element at index 1 has shape (batch, hidden).
            num_classes: number of sigmoid outputs.
        """
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask:
                forwarded unchanged to the wrapped encoder.
            labels: optional targets with the same shape as the output
                (batch, num_classes). Any numeric dtype is accepted; they are
                cast to the dtype of the logits before the loss is computed.

        Returns:
            ``(loss, probabilities)``. ``loss`` is the integer ``0`` when
            ``labels`` is ``None``, otherwise a scalar tensor holding the mean
            binary cross-entropy. ``probabilities`` has shape
            (batch, num_classes) with values in [0, 1].
        """
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)
        pooled = outputs[1]  # (batch, hidden)
        logits = self.classifier(pooled)  # (batch, num_classes)
        probabilities = torch.sigmoid(logits)

        loss = 0
        if labels is not None:
            # Compute the loss from the logits rather than from the sigmoid
            # output: this is the numerically stable form of sigmoid + BCE.
            # Casting the labels lets integer 0/1 targets work as well.
            loss = nn.functional.binary_cross_entropy_with_logits(
                logits, labels.to(logits.dtype))
        return loss, probabilities

## Multi-class prediction (six toxicity labels)

In [None]:
def predict(input_string, model_path, predict_proba=False, max_len=120):
    """Score one text with the model stored at ``model_path``.

    Args:
        input_string: text to classify.
        model_path: path of a model file readable by ``torch.load``
            (presumably a pickled ``BertClassifier`` - confirm against the
            training notebook).
        predict_proba: when true, return the model's output tensor as-is
            instead of a label name.
        max_len: maximum number of token ids fed to the model, special tokens
            included. Longer inputs are cut and terminated with the
            tokenizer's separator token.

    Returns:
        The output tensor of the model when ``predict_proba`` is true.
        Otherwise the name of the highest-scoring label among 'toxic',
        'severe_toxic', 'obscene', 'threat', 'insult' and 'identity_hate'.
        Note that the top label is returned even when every score is low.
    """
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load model files from a trusted source.
    cpu = torch.device('cpu')
    model = torch.load(model_path, map_location=cpu)
    # Inference only: put the model in evaluation mode so layers such as
    # dropout do not randomise the scores.
    model.eval()

    token_ids = tokenizer.encode(input_string, add_special_tokens=True)
    if len(token_ids) > max_len:
        token_ids = token_ids[:max_len - 1] + [tokenizer.sep_token_id]

    # The model was mapped to the CPU above, so the inputs must live there
    # too; sending them to a GPU would raise a device-mismatch error.
    x = torch.LongTensor([token_ids]).to(cpu)  # batch of one sequence
    mask = (x != tokenizer.pad_token_id).float()

    with torch.no_grad():
        _, outputs = model(x, attention_mask=mask)

    if predict_proba:
        return outputs

    columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    best = int(np.argmax(outputs.cpu().numpy()))
    return columns[best]

In [None]:
# Location of the saved multi-label model file.
path = "/content/drive/My Drive/Module 3 shared folder/model.pt"
# Example text to classify.
input_string = "I think you are a poor jewish moron"

# With predict_proba left at its default, the call returns a label name.
predict(input_string=input_string, model_path=path)

'toxic'

## Binary prediction (toxic vs. non-toxic)

In [None]:
def predict_binary(input_string, model, predict_proba=False, threshold=.7, max_len=120):
    """Classify one text as "toxic" or "non-toxic" with an already-loaded model.

    Args:
        input_string: text to classify.
        model: a ``torch.nn.Module`` called as ``model(x, attention_mask=mask)``
            and returning ``(loss, outputs)``; the score is read from
            ``outputs[0][0]``.
        predict_proba: when true, return the score itself instead of a label.
        threshold: scores greater than or equal to this value are labelled
            "toxic".
        max_len: maximum number of token ids fed to the model, special tokens
            included. Longer inputs are cut and terminated with the
            tokenizer's separator token.

    Returns:
        The score (a NumPy scalar) when ``predict_proba`` is true, otherwise
        the string "toxic" or "non-toxic".
    """
    # Put the inputs on whatever device the model lives on. Using the global
    # `device` unconditionally fails when the model was loaded on the CPU
    # while a GPU is available.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else device

    token_ids = tokenizer.encode(input_string, add_special_tokens=True)
    if len(token_ids) > max_len:
        token_ids = token_ids[:max_len - 1] + [tokenizer.sep_token_id]

    x = torch.LongTensor([token_ids]).to(model_device)  # batch of one sequence
    mask = (x != tokenizer.pad_token_id).float()

    # Score in evaluation mode so layers such as dropout do not randomise the
    # result, then restore the mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
    finally:
        model.train(was_training)

    prob_toxic = outputs.cpu().numpy()[0][0]
    if predict_proba:
        return prob_toxic
    return "toxic" if prob_toxic >= threshold else "non-toxic"

In [None]:
# Location of the saved binary model file.
path = "/content/drive/My Drive/Module 3 shared folder/bert_2_classes.pth"
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load model files from a trusted source.
# Load onto `device` (chosen in the configuration cell) instead of always the
# CPU, so the model and the input tensors end up on the same device. Switch to
# evaluation mode so layers such as dropout do not randomise the scores.
model = torch.load(path, map_location=device).eval()

In [None]:
# Sample texts for trying out the binary classifier.
input_string = "I think you are a poor jewish moron"
input_string2 = "I really liked this particular subject"
input_string3 = "Maybe your mom forgot to tell you this but I don't worry I will. You do not deserve to live."
input_string4 = "This is quite ambiguous. I hoped I would find more accurate information here but I did not. This website is not that good after all."

# Score the fourth sample; predict_proba=True returns the score rather than a label.
predict_binary(input_string=input_string4, model=model, predict_proba=True)

0.000109714594