In [1]:
import torch
import transformers
import random
import numpy as np
from transformers import AdamW
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader



# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report which accelerator is in use. Asking for a CUDA device name raises on a
# CPU-only machine, so only query it when CUDA was actually selected above.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
else:
    print('CUDA not available; running on CPU')

def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Applies ``seed`` to PyTorch (CPU and all CUDA devices), NumPy's global
    generator and Python's ``random`` module, and configures cuDNN with
    ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Integer seed handed to each generator.
    """
    # cuDNN flags: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Apply the same seed to each library's generator in turn.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seed_fn(seed)

GeForce GTX 1080 Ti


In [2]:
# Single seed for the whole notebook: used for the global RNGs and for the split.
SEED = 530
set_seed(SEED)

# Load the tweet table; the cells below read its `clean_tweet` and `class` columns.
df = pd.read_csv('data/cleaned_df.csv')

# Hold out 10% of the rows for testing. The seed is passed explicitly so the
# partition does not depend on whatever state the global NumPy RNG happens to
# be in when this cell runs (e.g. after out-of-order execution).
# NOTE: `df` is rebound to the training portion from here on.
df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)

# Plain Python lists of texts and labels for the tokenizer / dataset wrappers.
train_texts = df['clean_tweet'].tolist()
train_labels = df['class'].tolist()
test_texts = test_df['clean_tweet'].tolist()
test_labels = test_df['class'].tolist()

['you and them bitches that you came wit', 'so about this flappy bird game then', 'i missing something of my ofay friends day all scared by da recent happenings do not chu  lamar say you like da colored folk', '    bitch yall relationship am not shit but a theory', 'np shy glizzy  awwsome remix feat 2 chainz  aap rocky via  ', 'she let me play with her pussy then she lick it off my fingers', 'the whole afternoon has been about the deep voiced bitch', 'speaking of  a lot of them backpage hoes do not like fucking black guys so i never went thru with it', 'halloween over and bitches still playing roles', ' now that halloween is fast approaching please understand this we are a culture not a costume to mock and ridicule http']


In [3]:
# Load the tokenizer and a sequence-classification model from the same
# pretrained checkpoint. num_labels=3 sizes the classification output;
# presumably one logit per value of the `class` column -- confirm against the data.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection with
            one entry per example (e.g. ``input_ids``, ``attention_mask``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        # One tensor per encoding field, built from that field's idx-th row.
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [5]:
def load_data(train_texts, train_labels, test_texts, test_labels):
    """Tokenize the train and test texts and wrap them as datasets.

    Uses the module-level ``tokenizer`` and ``Dataset`` defined in earlier cells.

    Args:
        train_texts: List of training strings.
        train_labels: Labels aligned with ``train_texts``.
        test_texts: List of test strings.
        test_labels: Labels aligned with ``test_texts``.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of ``Dataset`` instances.
    """
    # padding=True pads each call's sequences to the longest one in that call.
    # truncation=True caps sequences at the tokenizer's maximum model length so
    # an unusually long text cannot exceed what the model accepts.
    train_encodings = tokenizer(train_texts, padding=True, truncation=True)
    test_encodings = tokenizer(test_texts, padding=True, truncation=True)

    train_dataset = Dataset(train_encodings, train_labels)
    test_dataset = Dataset(test_encodings, test_labels)

    return train_dataset, test_dataset
    


In [6]:
## training

# Fine-tuning hyperparameters, named here so they are easy to find and tune.
BATCH_SIZE = 64
LEARNING_RATE = 3e-5
NUM_EPOCHS = 5

train_dataset, test_dataset = load_data(train_texts, train_labels, test_texts, test_labels)
# Shuffle only the training batches; the test loader keeps dataset order so
# predictions stay aligned with test_labels.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Place the model on the target device first, then build the optimizer over
# its parameters.
model.to(device)
optim = AdamW(model.parameters(), lr=LEARNING_RATE)
model.train()

print("Started training")
start = time.time()

for epoch in range(NUM_EPOCHS):
    avg_loss = []  # per-batch losses for this epoch
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are passed in so the returned output carries the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
        loss = outputs.loss
        # .item() extracts the Python float from the scalar loss tensor.
        avg_loss.append(loss.item())
        loss.backward()
        optim.step()

    print("Epoch %d loss =" % epoch, np.mean(avg_loss))

# Stop the clock before writing the checkpoint so the reported figure covers
# training only, not disk I/O.
train_seconds = time.time() - start

model_path = "models"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print(f'Time taken to train: {train_seconds}')

Started training
Epoch 0 loss = 0.3199634667069659
Epoch 1 loss = 0.21984149016076993
Epoch 2 loss = 0.17475966050324263
Epoch 3 loss = 0.1262094376923618
Epoch 4 loss = 0.08595003313078067
Time taken to train: 536.8577296733856


In [12]:
## evaluation here
# Put the model in evaluation mode and collect one predicted class per test example.
model.eval()
test_preds = []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits
        # Softmax is monotonic, so the arg-max of the raw logits is the same
        # class the arg-max of the probabilities would pick; take it directly.
        pred = torch.argmax(logits, dim=1).tolist()
        test_preds.extend(pred)



[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 0, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 

In [13]:
from sklearn.metrics import f1_score

# scikit-learn metrics expect the ground truth first and the predictions
# second; pass both by keyword so the roles cannot be swapped silently.
fscore = f1_score(y_true=test_labels, y_pred=test_preds, average='micro')
print("The F1 score is:", fscore)

The F1 score is: 0.9100443727309399
