In [1]:
%%capture
# Install the `transformers` package imported below; the %%capture cell magic
# suppresses pip's output.
# NOTE(review): the version is unpinned, so a re-run may pull a different release
# than the one that produced the recorded outputs — consider pinning.
!pip install transformers

In [2]:
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import f1_score

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# torch.cuda.get_device_name raises when no CUDA device is present, so only
# query it on the GPU path; this keeps the CPU fallback above usable.
print(torch.cuda.get_device_name(0) if device.type == 'cuda' else 'CPU')

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Sets the cuDNN flags (deterministic on, benchmark off) and then seeds, in
    order, PyTorch, PyTorch CUDA (all devices), NumPy and Python's ``random``
    with the same value.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    cudnn_flags = {'deterministic': True, 'benchmark': False}
    for flag, value in cudnn_flags.items():
        setattr(torch.backends.cudnn, flag, value)

    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

Tesla T4


In [3]:
# Fix all RNG seeds before anything stochastic runs.
set_seed(530)

# Read the three splits from CSV files in the working directory.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv', 'test.csv')
)

# Pull the tweet text and class label columns out of each split as plain lists.
train_texts, train_labels = list(train_df['tweet'].values), list(train_df['class'].values)
dev_texts, dev_labels = list(dev_df['tweet'].values), list(dev_df['class'].values)
test_texts, test_labels = list(test_df['tweet'].values), list(test_df['class'].values)

In [4]:
# Preview the first rows (at most 20) of the training split.
train_df.head(20)

Unnamed: 0,tweet,class
0,@USER How dare they!? Please miss us with all ...,1
1,@USER She is!!! 😍,1
2,#Trump was walking towards a crowd excited to ...,0
3,@USER @USER Ford and the conservatives hates t...,1
4,@USER CONway say it louder for your fellow #MA...,0
5,@USER @USER @USER #Brilliant bigotry..,0
6,@USER I just think that he's more likely to be...,1
7,@USER YOU CAN'T TELL ME WHAT THE FUCK I CAN AN...,1
8,@USER @USER Fuck you cunt. Try this on for si...,0
9,@USER You will not win. Texas doesn’t need a l...,0


In [5]:
# Preview the first rows (at most 20) of the development split.
dev_df.head(20)

Unnamed: 0,tweet,class
0,@USER @USER @USER @USER @USER @USER She is bei...,1
1,@USER @USER Murda fucker,0
2,@USER @USER @USER Maybe he is just light in hi...,1
3,@USER @USER @USER @USER @USER i choose antifa URL,1
4,@USER I think this is funny that you're compla...,1
5,@USER @USER @USER @USER So you are ok with the...,0
6,@USER This isn’t about gun control anymore thi...,1
7,@USER And they have some of the strictest gun ...,0
8,@USER @USER @USER Completely agree. I grew up ...,1
9,Business is business. I love this story. URL,0


In [6]:
# Preview the first rows (at most 20) of the test split.
test_df.head(20)

Unnamed: 0,tweet,class
0,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,0
1,"#ConstitutionDay is revered by Conservatives, ...",1
2,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,1
3,#Watching #Boomer getting the news that she is...,1
4,#NoPasaran: Unity demo to oppose the far-right...,0
5,. . . What the fuck did he do this time?,0
6,#RAP is a form of ART! Used to express yoursel...,1
7,@USER Do you get the feeling he is kissing @US...,0
8,5 Tips to Enhance Audience Connection on Faceb...,1
9,#BiggBossTamil janani won the task. She is goi...,1


In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable collection with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [8]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint — the same checkpoint
# name the classifier is later initialised from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def load_data(train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels):
    """Tokenize the three splits and wrap each one in a ``Dataset``.

    Each split is tokenized in a single call with ``padding=True``, so it is
    padded to the longest sequence within that split. ``truncation=True`` caps
    every sequence at the tokenizer's maximum length, so an unusually long
    input cannot produce a sequence the model is unable to process; inputs
    already within the limit are encoded exactly as before.

    Args:
        train_texts, dev_texts, test_texts: Lists of raw strings, one per example.
        train_labels, dev_labels, test_labels: Labels aligned with the texts.

    Returns:
        A ``(train_dataset, dev_dataset, test_dataset)`` tuple of ``Dataset`` objects.
    """
    def _encode_split(texts, labels):
        # Tokenize one split and pair the encodings with its labels.
        encodings = tokenizer(texts, padding=True, truncation=True)
        return Dataset(encodings, labels)

    train_dataset = _encode_split(train_texts, train_labels)
    dev_dataset = _encode_split(dev_texts, dev_labels)
    test_dataset = _encode_split(test_texts, test_labels)

    return train_dataset, dev_dataset, test_dataset

In [16]:
# Tokenize the three splits, then wrap each dataset in a DataLoader.
# Only the training loader shuffles; dev and test keep their original order.
BATCH_SIZE = 64

train_dataset, dev_dataset, test_dataset = load_data(
    train_texts, train_labels,
    dev_texts, dev_labels,
    test_texts, test_labels,
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
# Two-label sequence classifier initialised from the bert-base-uncased
# checkpoint, plus the AdamW optimizer that updates all of its parameters.
NUM_LABELS = 2
LEARNING_RATE = 3e-5

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
)
optim = AdamW(model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
%%time

# training
# Move the model to `device`, run 5 passes over train_loader printing the mean
# batch loss after each pass, then save the model and tokenizer to "models".
model.to(device)
model.train()

for epoch in range(5):
    epoch_losses = []
    for batch in train_loader:
        optim.zero_grad()
        # Move the three tensors the forward pass consumes onto `device`.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True,
        )
        loss = outputs.loss
        epoch_losses.append(loss.item())
        loss.backward()
        optim.step()

    print("Epoch %d loss =" %epoch, np.mean(epoch_losses))

model_path = "models"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch 0 loss = 0.5072404743834613
Epoch 1 loss = 0.37421535059092514
Epoch 2 loss = 0.2797821102773442
Epoch 3 loss = 0.16097421380367508
Epoch 4 loss = 0.0908194950876389
CPU times: user 13min 41s, sys: 14min 20s, total: 28min 2s
Wall time: 27min 59s


In [17]:
# evaluation with development set
model.eval()
dev_preds = []
with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs=model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits
        prob = torch.softmax(logits, dim=1).tolist()
        pred = [p.index(max(p)) for p in prob]
        dev_preds += pred

In [18]:
# f1_score expects (y_true, y_pred): gold labels first, predictions second.
# Macro-F1 gives the same value either way; the order is fixed to match the API.
fscore = f1_score(dev_labels, dev_preds, average='macro')
print("The development F1 score is:", fscore)

The development F1 score is: 0.7437481619633877


In [14]:
# evaluation with testing set
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs=model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits
        prob = torch.softmax(logits, dim=1).tolist()
        pred = [p.index(max(p)) for p in prob]
        test_preds += pred

In [15]:
# f1_score expects (y_true, y_pred): gold labels first, predictions second.
fscore = f1_score(test_labels, test_preds, average='macro')
# This score is computed on the test split (test_preds / test_labels), so the
# message says "test" rather than "training".
print("The test F1 score is:", fscore)

The training F1 score is: 0.7751694273322651
