In [1]:
%%capture
!pip install transformers
!pip install emoji
!pip install wordsegment

In [14]:
import random
import numpy as np
import pandas as pd
import re
import emoji
import wordsegment
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report the hardware in use. The CUDA device name can only be queried when a GPU
# is present; calling it unconditionally raises on a CPU-only machine.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device available; running on CPU')

def set_seed(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (the default generator plus all CUDA devices). It also switches cuDNN to
    deterministic mode and turns off cuDNN benchmarking.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    # Seed the three libraries' generators.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: deterministic mode on, benchmarking off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

Tesla K80


In [3]:
# Fix all RNG seeds before any data or model work happens.
set_seed(530)

# The three splits are read from CSV files in the working directory.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv', 'test.csv')
)

# Pull the 'tweet' and 'class' columns of each split out as plain lists.
train_texts, train_labels = list(train_df['tweet'].values), list(train_df['class'].values)
dev_texts, dev_labels = list(dev_df['tweet'].values), list(dev_df['class'].values)
test_texts, test_labels = list(test_df['tweet'].values), list(test_df['class'].values)

In [4]:
def preprocess(tweet):
    """Normalize one raw tweet into a space-separated string of segmented words.

    Steps, in order:
      1. Lowercase the text.
      2. Collapse any run of three or more consecutive "@user " mentions to
         exactly three.
      3. Rewrite the standalone placeholder token "url" as "html".
      4. Convert emoji to their textual form with ``emoji.demojize``.
      5. Split on single spaces and pass every piece through
         ``wordsegment.segment``, concatenating the resulting words.

    ``wordsegment.segment`` is applied to every token, not only hashtags and
    emoji names. Per the original author's note this also drops
    non-alphanumeric characters — NOTE(review): confirm against the
    wordsegment documentation.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The preprocessed tweet as a single string with words joined by spaces.
    """
    tweet = tweet.lower()
    # limit consecutive @user
    tweet = re.sub(r"(@user ){3,}", "@user @user @user ", tweet)
    # replace the "url" placeholder with "html" for embedding; match whole words
    # only, since a plain substring replace also rewrote words that merely
    # contain "url" (e.g. "curl" -> "chtml")
    tweet = re.sub(r"\burl\b", "html", tweet)
    # translate emoji into words
    tweet = emoji.demojize(tweet)
    # segment every space-separated piece (hashtags, emoji translations, plain words)
    tokens = []
    for token in tweet.split(" "):
        tokens.extend(wordsegment.segment(token))
    return " ".join(tokens)

In [5]:
%%time

# Load wordsegment's data before preprocess() calls wordsegment.segment().
wordsegment.load()

# Replace the raw text lists with their preprocessed versions, one split at a time.
train_texts = [preprocess(raw_tweet) for raw_tweet in train_df['tweet']]
dev_texts = [preprocess(raw_tweet) for raw_tweet in dev_df['tweet']]
test_texts = [preprocess(raw_tweet) for raw_tweet in test_df['tweet']]

CPU times: user 1min 47s, sys: 601 ms, total: 1min 47s
Wall time: 1min 47s


In [6]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (presumably the tokenizer's output — see the
            caller).
        labels: Indexable collection with one label per example; its length
            defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [7]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def load_data(train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels):
    """Tokenize the three splits and wrap each one in a ``Dataset``.

    Each split is tokenized on its own with the module-level ``tokenizer``.
    ``padding=True`` pads every sequence in a split to that split's longest
    sequence. ``truncation=True`` cuts any over-long text to the tokenizer's
    maximum model input length, so the model never receives a sequence longer
    than it supports; shorter texts are unaffected.

    Args:
        train_texts, dev_texts, test_texts: Lists of strings, one per example.
        train_labels, dev_labels, test_labels: Labels aligned with the texts.

    Returns:
        A ``(train_dataset, dev_dataset, test_dataset)`` tuple of ``Dataset``
        objects.
    """
    def _encode_split(texts, labels):
        # Tokenize one split and pair the encodings with its labels.
        encodings = tokenizer(texts, padding=True, truncation=True)
        return Dataset(encodings, labels)

    train_dataset = _encode_split(train_texts, train_labels)
    dev_dataset = _encode_split(dev_texts, dev_labels)
    test_dataset = _encode_split(test_texts, test_labels)

    return train_dataset, dev_dataset, test_dataset

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [8]:
# Tokenize all three splits and wrap them as datasets.
train_dataset, dev_dataset, test_dataset = load_data(
    train_texts, train_labels,
    dev_texts, dev_labels,
    test_texts, test_labels,
)

# Batches of 64. Only the training loader shuffles; dev and test keep their
# order so predictions stay aligned with dev_labels / test_labels.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [9]:
# BERT sequence classifier initialised from the 'bert-base-uncased' checkpoint,
# configured for two output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# AdamW over all model parameters with a learning rate of 3e-5.
optim = AdamW(model.parameters(), lr=3e-5)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
%%time

# training
model.to(device)
model.train()

# Five passes over the training set; the mean batch loss is printed per epoch.
for epoch in range(5):
    epoch_losses = []
    for batch in train_loader:
        # Move this batch's tensors onto the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients from the previous step, then forward / backward / update.
        optim.zero_grad()
        loss = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True,
        ).loss
        epoch_losses.append(loss.item())
        loss.backward()
        optim.step()

    print("Epoch %d loss =" % epoch, np.mean(epoch_losses))

# Save the fine-tuned weights and the tokenizer files into the same directory.
model_path = "models"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch 0 loss = 0.49180589146155085
Epoch 1 loss = 0.37868708898039427
Epoch 2 loss = 0.2757424150000919
Epoch 3 loss = 0.1703186868266626
Epoch 4 loss = 0.10419574272843765
CPU times: user 19min 42s, sys: 15min 34s, total: 35min 16s
Wall time: 35min 15s


In [11]:
# evaluation with development set
model.eval()
dev_preds = []
with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in dev_loader:
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            return_dict=True,
        ).logits
        # Turn logits into per-class probabilities; the predicted class is the
        # index of the largest probability (the first one on ties).
        for class_probs in torch.softmax(logits, dim=1).tolist():
            dev_preds.append(class_probs.index(max(class_probs)))

In [15]:
# Write the dev predictions to a text file, one label per line.
with open("dev_preds.txt", "w") as preds_file:
    preds_file.write("\n".join(str(label) for label in dev_preds))

# Print the macro-averaged F1, followed by the per-class report.
dev_fscore = f1_score(dev_labels, dev_preds, average='macro')
print("The baseline development f1 score is:", dev_fscore)
print(classification_report(dev_labels, dev_preds))

The baseline development f1 score is: 0.7469728473581214
              precision    recall  f1-score   support

           0       0.66      0.67      0.67       444
           1       0.83      0.82      0.83       880

    accuracy                           0.77      1324
   macro avg       0.75      0.75      0.75      1324
weighted avg       0.77      0.77      0.77      1324



In [16]:
# evaluation with testing set
model.eval()
test_preds = []
with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in test_loader:
        ids_on_device = batch['input_ids'].to(device)
        mask_on_device = batch['attention_mask'].to(device)
        result = model(ids_on_device, attention_mask=mask_on_device, return_dict=True)
        # Softmax over the class dimension, then take the index of the largest
        # probability in each row (the first one on ties).
        batch_probs = torch.softmax(result.logits, dim=1).tolist()
        test_preds.extend(row.index(max(row)) for row in batch_probs)

In [17]:
# Write the test predictions to a text file, one label per line.
with open("test_preds.txt", "w") as preds_file:
    preds_file.write("\n".join(str(label) for label in test_preds))

# Print the macro-averaged F1, followed by the per-class report.
test_fscore = f1_score(test_labels, test_preds, average='macro')
print("The baseline testing f1 score is:", test_fscore)
print(classification_report(test_labels, test_preds))

The baseline testing f1 score is: 0.7866008516647764
              precision    recall  f1-score   support

           0       0.68      0.70      0.69       240
           1       0.88      0.87      0.88       620

    accuracy                           0.83       860
   macro avg       0.78      0.79      0.79       860
weighted avg       0.83      0.83      0.83       860

