In [1]:
%%capture
!pip install transformers
!pip install emoji
!pip install wordsegment

In [8]:
import random
import numpy as np
import pandas as pd
import re
import emoji
import wordsegment
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import f1_score

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Asking for the CUDA device name raises on a machine without a GPU, so only
# query it when CUDA is actually the selected device.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; running on CPU")

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices) with ``seed``, and sets the cuDNN
    flags ``deterministic=True`` / ``benchmark=False``.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Random number generators.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

Tesla K80


In [3]:
# Seed all random number generators up front.
set_seed(530)

# One CSV file per split; the 'tweet' and 'class' columns are read below.
train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

# Pull the 'tweet' (text) and 'class' (label) columns of every split out
# into plain Python lists.
train_texts, train_labels = list(train_df['tweet'].values), list(train_df['class'].values)
dev_texts, dev_labels = list(dev_df['tweet'].values), list(dev_df['class'].values)
test_texts, test_labels = list(test_df['tweet'].values), list(test_df['class'].values)

In [10]:
def preprocess(tweet):
    """Normalize one raw tweet into a space-separated string of word tokens.

    Steps, in order: lowercase the text; collapse runs of three or more
    "@user " mentions down to exactly three; replace the stand-alone token
    "url" with "html"; translate emoji into words with ``emoji.demojize``;
    and re-segment the result with ``wordsegment.segment``.

    This notebook calls ``wordsegment.load()`` before the first use of this
    function.

    Args:
        tweet: The raw tweet text (``str``).

    Returns:
        The preprocessed tweet as one string, tokens joined by single spaces.
    """
    tweet = tweet.lower()
    # limit consecutive @user
    tweet = re.sub(r"(@user ){3,}", "@user @user @user ", tweet)
    # replace "url" with "html" for embedding; match whole tokens only so that
    # words which merely contain "url" (e.g. "hourly") are left intact
    tweet = re.sub(r"\burl\b", "html", tweet)
    # translate emoji into words
    tweet = emoji.demojize(tweet)
    # segment hashtag & emoji translations
    tweet = " ".join(wordsegment.segment(tweet))
    return tweet

In [21]:
%%time

# Initialise wordsegment before `preprocess` (which calls
# wordsegment.segment) is applied to any tweet.
wordsegment.load()

# Replace the raw text lists with their preprocessed versions, one per split.
train_texts = train_df['tweet'].apply(preprocess).tolist()
dev_texts = dev_df['tweet'].apply(preprocess).tolist()
test_texts = test_df['tweet'].apply(preprocess).tolist()

CPU times: user 39min 3s, sys: 13.9 s, total: 39min 17s
Wall time: 39min 12s


In [12]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (indexable by example position); ``labels`` is an indexable sequence
    whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [13]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint. It is used as a
# global by load_data and is saved next to the model after training.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def load_data(train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels):
    """Tokenize the three splits and wrap each one in a ``Dataset``.

    Uses the module-level ``tokenizer``. Every split is padded to the length
    of its own longest sequence, and sequences longer than the tokenizer's
    maximum length are truncated so they cannot exceed what the model accepts.

    Args:
        train_texts, dev_texts, test_texts: Lists of input strings.
        train_labels, dev_labels, test_labels: Labels aligned with the texts.

    Returns:
        A tuple ``(train_dataset, dev_dataset, test_dataset)``.
    """
    def _build(texts, labels):
        # One tokenize-and-wrap step shared by all three splits.
        encodings = tokenizer(texts, padding=True, truncation=True)
        return Dataset(encodings, labels)

    train_dataset = _build(train_texts, train_labels)
    dev_dataset = _build(dev_texts, dev_labels)
    test_dataset = _build(test_texts, test_labels)

    return train_dataset, dev_dataset, test_dataset

In [22]:
train_dataset, dev_dataset, test_dataset = load_data(
    train_texts, train_labels,
    dev_texts, dev_labels,
    test_texts, test_labels,
)

# Only the training batches are shuffled; dev/test keep their original order
# so the predictions collected later line up with dev_labels / test_labels.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [15]:
# BERT sequence classifier with two output labels, initialised from the
# 'bert-base-uncased' checkpoint.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# AdamW over all model parameters with a learning rate of 3e-5.
optim = AdamW(model.parameters(), lr=3e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
%%time

# training
model.to(device)
model.train()

for epoch in range(5):
    # Per-batch losses of this epoch; their mean is reported below.
    avg_loss = []
    for batch in train_loader:
        # Move the tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        optim.zero_grad()
        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True,
        )
        loss = outputs.loss
        loss.backward()
        optim.step()
        avg_loss.append(loss.item())

    print("Epoch %d loss =" % epoch, np.mean(avg_loss))

# Save the fine-tuned model and its tokenizer to the same directory.
model_path = "models"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch 0 loss = 0.4946779360108197
Epoch 1 loss = 0.38214835030509825
Epoch 2 loss = 0.2765058110104525
Epoch 3 loss = 0.1707656903461339
Epoch 4 loss = 0.09959018328550824
CPU times: user 19min 25s, sys: 14min 55s, total: 34min 21s
Wall time: 34min 14s


In [23]:
# evaluation with development set
model.eval()  # switch the model to evaluation mode
dev_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits
        # softmax is monotonic, so the arg-max of the raw logits is already
        # the predicted class; no need to materialise the probabilities.
        pred = torch.argmax(logits, dim=1).tolist()
        dev_preds.extend(pred)

In [24]:
# scikit-learn's signature is f1_score(y_true, y_pred): gold labels first,
# predictions second.
fscore = f1_score(dev_labels, dev_preds, average='macro')
print("The development F1 score is:", fscore)

The development F1 score is: 0.7460267957695059


In [19]:
# evaluation with testing set
model.eval()  # switch the model to evaluation mode
test_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits
        # softmax is monotonic, so the arg-max of the raw logits is already
        # the predicted class; no need to materialise the probabilities.
        pred = torch.argmax(logits, dim=1).tolist()
        test_preds.extend(pred)

In [20]:
# scikit-learn's signature is f1_score(y_true, y_pred): gold labels first,
# predictions second.
fscore = f1_score(test_labels, test_preds, average='macro')
# This score is computed on the test split, so label it as such.
print("The testing F1 score is:", fscore)

The training F1 score is: 0.7765610679418211
