In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import torchtext.data as data
import pandas as pd

import string
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# The device string is written as "cuda:0" (no space after the colon); the previous
# "cuda: 0" spelling is not a canonical device string and newer PyTorch releases may
# reject it as invalid.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
# Fix the PyTorch RNG seed and ask cuDNN for deterministic kernels so that
# repeated runs of the notebook are as reproducible as possible.
SEED = 2020
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
# Input/output locations. Despite the *_DIR suffix these are CSV file paths, not directories.
DATA_DIR = "data/train.csv"                            # raw training data
PREPROCESS_DATA_DIR = "data/preprocessed.csv"          # cleaned training data (written when MAKE_PREPROCESSED_DATA is set)
PREPROCESS_TEST_DIR = "data/preprocessed_test2.csv"    # cleaned test data; not produced by any cell visible here

In [5]:
# Load the raw training CSV with pandas and show the first rows as a sanity check.
traindata = pd.read_csv(DATA_DIR)
print(traindata.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [6]:
# Run switches (0 = off, 1 = on).
MAKE_PREPROCESSED_DATA = 0  # when truthy, regenerate PREPROCESS_DATA_DIR from DATA_DIR
TRAIN_DATA = 0              # when truthy, train the model; when falsy, load the saved checkpoint instead

In [7]:
# Shared resources used by the text-cleaning code below.
punctuations = string.punctuation            # ASCII punctuation characters to strip
stopwords_list = stopwords.words("english")  # NLTK English stop words (passed to the TEXT field later)
# NOTE(review): spacy_tokenizer is not referenced by any cell visible in this notebook.
spacy_tokenizer = torchtext.data.utils.get_tokenizer('spacy')
lemmatizer = WordNetLemmatizer()             # used inside preprocessing()
# Patterns are compiled once here instead of on every preprocessing() call.
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def preprocessing(text, lemmatize_tokens=False):
    """Normalise one raw comment into a lowercase, space-separated string.

    Steps, in order: strip trailing newlines, remove HTML-like tags and URLs,
    lowercase, lemmatize, replace every punctuation character and digit with a
    space, then collapse all whitespace runs into single spaces.

    Args:
        text: Raw comment string.
        lemmatize_tokens: When False (the default) the whole lowercased string is
            passed to ``lemmatizer.lemmatize`` in a single call, exactly as this
            function has always done. NOTE(review): WordNetLemmatizer.lemmatize
            is documented as taking a single word, so this path most likely
            leaves multi-word comments unchanged; the default is kept so the
            output stays consistent with previously generated data and trained
            checkpoints. When True, each token is lemmatized individually after
            punctuation removal instead.

    Returns:
        The cleaned string, or the literal string "None" when nothing is left
        after cleaning.
    """
    cleaned = text.rstrip("\n")
    # Tags are removed before URLs, matching the original order of operations.
    cleaned = _HTML_TAG_PATTERN.sub(r'', cleaned)
    cleaned = _URL_PATTERN.sub(r'', str(cleaned))
    cleaned = cleaned.lower()

    if not lemmatize_tokens:
        # Legacy behaviour: one lemmatizer call on the entire string.
        cleaned = lemmatizer.lemmatize(cleaned)

    # Character-level filter: punctuation and digits become spaces.
    cleaned = "".join(
        ch if ch not in punctuations and not ch.isdigit() else " " for ch in cleaned
    )

    # Whitespace split + join collapses runs of spaces/newlines/tabs.
    tokens = str.split(cleaned)
    if lemmatize_tokens:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    cleaned = " ".join(tokens)

    # Empty results are replaced by a placeholder so the CSV never holds an empty text field.
    if cleaned == "":
        cleaned = "None"
    return cleaned

In [8]:
import csv
from tqdm import tqdm

# Regenerate the cleaned training CSV: copy every row of DATA_DIR to
# PREPROCESS_DATA_DIR with the comment column (index 1) passed through preprocessing().
if MAKE_PREPROCESSED_DATA:
    # newline="" on both files lets the csv module handle newlines embedded in quoted fields.
    with open(DATA_DIR, "r", newline="", encoding="utf8") as in_csv, \
            open(PREPROCESS_DATA_DIR, "w", newline="", encoding="utf8") as out_csv:
        reader = csv.reader(in_csv)
        writer = csv.writer(out_csv)
        # Carry the header over to the output. The file is later read with
        # skip_header=True, so without a header line the first data row would be dropped.
        header = next(reader, None)
        if header is not None:
            writer.writerow(header)
        for row in tqdm(reader):
            if len(row) < 2:
                # Blank or malformed line: there is no comment column to clean.
                continue
            row[1] = preprocessing(row[1])
            try:
                writer.writerow(row)
            except Exception as e:
                # Best effort: report the failing row and keep going.
                print(f"Skipping row {row[0]!r}: {e}")

In [9]:
from transformers import BertTokenizer

# Load the tokenizer (and its vocabulary) that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Longest input the checkpoint accepts, as reported by the tokenizer (the recorded output shows 512).
# NOTE(review): the message below says "words", but mytokenizer() applies this limit to
# its token list (reserving two slots for the special tokens) — confirm the intended unit.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased'] 
print(f"Maximum length that a single input can have using this model: {max_input_length} words")

Maximum length that a single input can have using this model: 512 words


In [10]:
# Use the special-token ids already defined by the transformers tokenizer
# instead of building a vocabulary from the training data.
init_token_idx = tokenizer.cls_token_id  # [CLS] id, prepended to each sequence
eos_token_idx = tokenizer.sep_token_id   # [SEP] id, appended to each sequence
pad_token_idx = tokenizer.pad_token_id   # padding id
unk_token_idx = tokenizer.unk_token_id   # unknown-token id

In [11]:
def mytokenizer(sentence):
    """Split ``sentence`` on whitespace and cap the token count for BERT.

    Two positions are reserved because one special token is added at the start
    and one at the end of every sequence, so at most ``max_input_length - 2``
    tokens are returned.

    NOTE(review): this is a plain whitespace split, not the tokenizer's own
    ``tokenize`` method — presumably words missing from the BERT vocabulary then
    map to the unknown-token id; confirm this is intended.
    """
    token_budget = max_input_length - 2
    return str.split(sentence)[:token_budget]

In [12]:
# Field describing the comment text: whitespace tokens are converted straight to
# BERT vocabulary ids, and the BERT special-token ids are used for start/end/pad/unk.
# NOTE(review): stop_words is passed together with use_vocab=False; torchtext's Field
# may only apply stop-word removal when use_vocab is True — confirm it has any effect here.
TEXT = data.Field(batch_first = True,
                  use_vocab= False, # so we dont need to build vocabulary from training data
                  tokenize = mytokenizer,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx,
                  stop_words=stopwords_list)
# One shared label field is reused for all six target columns; values are cast to float.
LABEL = data.LabelField(dtype = torch.float)
# Non-sequential field holding the test-set row ids (its vocab maps indices back to id strings).
ID2 = data.Field(sequential=False)

In [13]:
# Column-to-field mapping for the training CSV, listed in column order.
# The id column is paired with None — presumably torchtext skips such columns; verify.
FIELDS = [["id",None], ["text", TEXT], ["toxic",LABEL],["s_toxic",LABEL],
          ["obscene",LABEL],["threat",LABEL],["insult",LABEL],["id_hate",LABEL]]
# The test CSV only has an id and a text column; the id is kept so predictions can be labelled.
TEST_FIELDS = [["id",ID2], ["text", TEXT]]
# NOTE(review): skip_header=True discards the first line of each file, so both CSVs
# must actually start with a header row — otherwise one example is lost.
trainset = data.TabularDataset(PREPROCESS_DATA_DIR,
                              format = "csv",
                              fields=FIELDS,
                              skip_header=True)
testset = data.TabularDataset(PREPROCESS_TEST_DIR,
                             format="csv",
                             fields=TEST_FIELDS,
                             skip_header=True)

In [14]:
# Build the label vocabulary from the training set and the id vocabulary from the test set.
LABEL.build_vocab(trainset)
ID2.build_vocab(testset)
# Note: the first figure printed is the size of the BERT tokenizer vocabulary
# (len(tokenizer.vocab)), not a count derived from the training data.
print(f"Unique words in training set: {len(tokenizer.vocab)}")
print(f"Unique labels in training set: {len(LABEL.vocab)}")

Unique words in training set: 30522
Unique labels in training set: 2


In [15]:
# Inspect the ten most frequent test ids with their counts (sanity check on the id vocabulary).
print(ID2.vocab.freqs.most_common(10))

[('0000247867823ef7', 1), ('00013b17ad220c46', 1), ('00017563c3f7919a', 1), ('00017695ad8997eb', 1), ('0001ea8717f6de06', 1), ('00024115d4cbde0f', 1), ('000247e83dcc1211', 1), ('00025358d4737918', 1), ('00026d1092fe71cc', 1), ('0002eadc3b301559', 1)]


In [16]:
# Number of examples per batch for both training and inference.
BATCH_SIZE = 16

# Batch iterators that place tensors on `device`.
train_iter = data.BucketIterator(trainset,
                                           batch_size=BATCH_SIZE,
                                           device = device)
# Shuffling is disabled for the test set so predictions follow the file order.
test_iter = data.BucketIterator(testset,
                                batch_size= BATCH_SIZE,
                                shuffle=False,
                                device = device)

In [18]:
# Scratch check: pull a single batch from train_iter into `batc` and stop.
# NOTE(review): `batc` is not used by any later cell visible here — looks like leftover debugging.
for batc in train_iter:
    break

In [17]:
# Load the pre-trained model
# The 'bert-base-uncased' weights match the tokenizer loaded earlier; the model is
# later wrapped by BERTGRUSentiment.
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [18]:
class BERTGRUSentiment(nn.Module):
    """BERT encoder followed by a (multi-layer, optionally bidirectional) GRU and a linear head.

    The BERT forward pass runs under ``torch.no_grad()``, so no gradients flow
    into it; only the GRU and the output layer receive gradients.

    Args:
        bert: Encoder module; ``bert.config.to_dict()['hidden_size']`` gives the GRU
            input width and ``bert(text)[0]`` the per-token hidden states.
        hidden_dim: GRU hidden size per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability for the summary vector, and between GRU layers
            when ``n_layers`` is at least 2.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert

        encoder_width = bert.config.to_dict()['hidden_size']
        # Dropout between GRU layers only applies when layers are stacked.
        inter_layer_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(
            encoder_width,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # A bidirectional GRU yields one final state per direction, concatenated below.
        head_width = hidden_dim * 2 if bidirectional else hidden_dim
        self.out = nn.Linear(head_width, output_dim)
        self.dropout = nn.Dropout(dropout, inplace=False)

    def forward(self, text):
        """Return logits of shape [batch size, out dim] for ``text`` of shape [batch size, sent len].

        NOTE(review): BERT is called with the token ids only (no attention mask), so
        padded positions are processed like real tokens — confirm this is intended.
        """
        # token_states = [batch size, sent len, emb dim]
        with torch.no_grad():
            token_states = self.bert(text)[0]

        # final_states = [n layers * n directions, batch size, hid dim]
        _, final_states = self.rnn(token_states)

        if self.rnn.bidirectional:
            # Last layer: the final two entries are its two directions.
            summary = torch.cat((final_states[-2, :, :], final_states[-1, :, :]), dim=1)
        else:
            summary = final_states[-1, :, :]

        # summary = [batch size, hid dim (* 2 when bidirectional)]
        return self.out(self.dropout(summary))

In [19]:
# Model hyperparameters.
HIDDEN_DIM = 512       # GRU hidden size per direction
OUTPUT_DIM = 6         # one logit per label column (toxic ... id_hate)
N_LAYERS = 2           # stacked GRU layers
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)
model = model.to(device)
# NOTE(review): the optimizer is created over all model.parameters(), including BERT's;
# those are only frozen (requires_grad=False) in the following cell.
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-4, weight_decay = 1e-5)
# Sigmoid + binary cross-entropy on raw logits, applied independently to each of the six outputs.
criterion = nn.BCEWithLogitsLoss().to(device)
print(model)

BERTGRUSentiment(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [20]:
# Freeze the BERT model's parameters: every parameter whose name starts with
# 'bert' stops requiring gradients, leaving the remaining layers trainable.
for param_name, weights in model.named_parameters():
    if not param_name.startswith('bert'):
        continue
    weights.requires_grad = False
def count_parameters(model):
    """Return the number of scalar values in ``model``'s parameters that require gradients."""
    trainable_total = 0
    for weights in model.parameters():
        if weights.requires_grad:
            trainable_total += weights.numel()
    return trainable_total

# Report how many parameters remain trainable after freezing BERT.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,761,734 trainable parameters


In [21]:
def binary_accuracy(preds, y):
    """Return the fraction of correct binary predictions as a 0-d tensor.

    Args:
        preds: Raw logits of any shape; each is passed through a sigmoid and
            rounded to 0 or 1.
        y: Target tensor of 0/1 values, comparable element-wise with ``preds``.

    Returns:
        Mean correctness over every element, always in [0, 1].
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    # Average over all elements. Dividing by len(correct) — the size of the first
    # dimension only — over-counted for multi-label [batch, n_labels] inputs and
    # could yield values above 1. For 1-D inputs the result is unchanged.
    return correct.mean()

In [22]:
def get_labels(batch):
    """Assemble the six per-label tensors of ``batch`` into one target tensor.

    Each label attribute gets a new dimension at index 1 and the results are
    concatenated along that dimension, in the order
    toxic, s_toxic, obscene, threat, insult, id_hate.
    """
    label_names = ("toxic", "s_toxic", "obscene", "threat", "insult", "id_hate")
    columns = [getattr(batch, label_name).unsqueeze(1) for label_name in label_names]
    return torch.cat(columns, dim=1)

In [23]:
from tqdm import tqdm
def train_step(model, optimizer, criterion, batch):
    """Run one optimisation step on ``batch`` and return the loss as a Python float.

    The model is switched to training mode, gradients are cleared, the loss
    between the model outputs for ``batch.text`` and the stacked labels is
    back-propagated, and the optimizer takes one step.
    """
    model.train()
    optimizer.zero_grad()

    inputs = batch.text
    targets = get_labels(batch)

    logits = model(inputs)
    batch_loss = criterion(logits, targets)

    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

In [24]:
# Train for a fixed number of epochs, logging and checkpointing periodically.
if TRAIN_DATA:
    EPOCHS = 3
    LOG_EVERY = 400                          # iterations between progress lines / checkpoints
    CHECKPOINT_PATH = "model/modelBERT2.pt"  # same file the non-training path loads
    loss_list = []                           # per-iteration training loss, kept for later inspection
    print("Start training...")
    for epoch in range(EPOCHS):
        for i, batch in enumerate(train_iter):
            train_loss = train_step(model,optimizer, criterion, batch)
            loss_list.append(train_loss)

            if i % LOG_EVERY == 0:
                print(f"Epoch: [{epoch+1}/{EPOCHS}] | Iterations: [{i+1}/{len(train_iter)}] | Training loss: {train_loss:.3f}")
                torch.save(model.state_dict(), CHECKPOINT_PATH)
        # Also save at the end of every epoch; otherwise the weights learned after the
        # last periodic checkpoint (up to LOG_EVERY - 1 iterations) would never be written.
        torch.save(model.state_dict(), CHECKPOINT_PATH)
    print("Training Completed!")

Start training...
Epoch: [1/3] | Iterations: [1/9974] | Training loss: 0.693
Epoch: [1/3] | Iterations: [401/9974] | Training loss: 0.141
Epoch: [1/3] | Iterations: [801/9974] | Training loss: 0.102
Epoch: [1/3] | Iterations: [1201/9974] | Training loss: 0.009
Epoch: [1/3] | Iterations: [1601/9974] | Training loss: 0.065
Epoch: [1/3] | Iterations: [2001/9974] | Training loss: 0.035
Epoch: [1/3] | Iterations: [2401/9974] | Training loss: 0.023
Epoch: [1/3] | Iterations: [2801/9974] | Training loss: 0.002
Epoch: [1/3] | Iterations: [3201/9974] | Training loss: 0.054
Epoch: [1/3] | Iterations: [3601/9974] | Training loss: 0.002
Epoch: [1/3] | Iterations: [4001/9974] | Training loss: 0.051
Epoch: [1/3] | Iterations: [4401/9974] | Training loss: 0.155
Epoch: [1/3] | Iterations: [4801/9974] | Training loss: 0.063
Epoch: [1/3] | Iterations: [5201/9974] | Training loss: 0.011
Epoch: [1/3] | Iterations: [5601/9974] | Training loss: 0.017
Epoch: [1/3] | Iterations: [6001/9974] | Training loss: 0

RuntimeError: CUDA error: unspecified launch failure

In [25]:
# When training is skipped, restore the weights from the saved checkpoint.
if not TRAIN_DATA:
    # map_location moves the stored tensors onto the active device, so a checkpoint
    # written on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load("model/modelBERT2.pt", map_location=device))

In [26]:
torch.set_printoptions(precision=4)
def predict_test(model, test_iter):
    """Run ``model`` over every batch of ``test_iter`` and collect per-example probabilities.

    Args:
        model: Module returning one row of logits per example.
        test_iter: Iterable of batches exposing ``text`` and ``id`` attributes; the ids
            are indices into ``ID2.vocab.itos``.

    Returns:
        A list of ``[id_string, probabilities]`` pairs, where ``probabilities`` is a
        NumPy array holding the sigmoid of the model outputs for that example.
    """
    result = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_iter):
            batch_size = len(batch)
            text = batch.text.view(batch_size,-1).long()
            # reshape(-1) keeps the ids one-dimensional even when the batch holds a single
            # example; squeeze() would collapse that case to a 0-d tensor, which cannot be
            # iterated by zip() below.
            ids = batch.id.reshape(-1).cpu().tolist()
            output = torch.sigmoid(model(text)).cpu().numpy()
            for id_index, probabilities in zip(ids, output):
                result.append([ID2.vocab.itos[id_index], probabilities])
    return result

In [27]:
# Score the whole test set; `result` is a list of [id, probabilities] pairs.
result = predict_test(model, test_iter)

100%|████████████████████████████████████████████████████████████████████████████| 9573/9573 [1:04:01<00:00,  2.49it/s]


In [33]:
import csv
# Write the predictions to submission2.csv: a header row followed by one row per
# test example containing its id and the six predicted probabilities.
with open("submission2.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
    for line in tqdm(result):
        sample_id, scores = line[0], line[1]
        row_values = [sample_id]
        for k in range(6):
            row_values.append(scores[k])
        writer.writerow(row_values)

100%|██████████████████████████████████████████████████████████████████████| 153163/153163 [00:01<00:00, 130394.15it/s]


In [32]:
# Number of predictions produced (one per test example that was loaded).
print(len(result))

153163


In [28]:
import spacy
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en')

def predict_sentiment(model, tokenizer, sentence):
    """Return the model's per-label probabilities for a single sentence.

    Args:
        model: Module mapping a [1, sent len] tensor of token ids to logits.
        tokenizer: Object whose ``convert_tokens_to_ids`` maps tokens to vocabulary ids.
        sentence: Text to score; it is split with ``mytokenizer`` and wrapped in the
            start/end special-token ids before being fed to the model.

    Returns:
        A tensor of shape [1, n_outputs] with the sigmoid of the model outputs,
        detached from the autograd graph.
    """
    model.eval()
    tokens = mytokenizer(sentence)
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    # Add a batch dimension of size 1.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(0)
    # Inference only: skip building the autograd graph so the result carries no grad_fn
    # and no activation memory is retained.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction

In [29]:
# Run a sample comment through the same cleaning function used for the training data and show the result.
text = preprocessing("yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother too fuckin white boys get things right next time")
print(text)

yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother too fuckin white boys get things right next time


In [30]:
# Score the cleaned sample; the six values follow the label order used in get_labels
# (toxic, s_toxic, obscene, threat, insult, id_hate).
print(predict_sentiment(model,tokenizer,text))

tensor([[0.9917, 0.4062, 0.9639, 0.0687, 0.9240, 0.4244]], device='cuda:0',
       grad_fn=<SigmoidBackward>)
