In [None]:
# install Hugging Face libraries and emoji
# (-q keeps pip output quiet; no versions are pinned here, so each run installs
#  whatever release is current at that time)
!pip install -q transformers
!pip install -q datasets
!pip install -q emoji

In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
import string
import re
import torch
import torch.nn as nn
import emoji
# from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, load_dataset
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertForSequenceClassification, BertTokenizer, BertTokenizerFast, Trainer, TrainingArguments

# set up notebook environment
# render matplotlib figures inline in the notebook output
%matplotlib inline
# show up to 100 characters per column when a DataFrame is displayed
pd.set_option('display.max_colwidth', 100)
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() raises when no GPU is present, so only query it
# on the CUDA path and report the CPU fallback explicitly instead of crashing.
print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

Tesla T4


In [None]:
# Load the training file and the file to predict on (both JSON Lines).
filepath = '/content/drive/My Drive/CS410_Text_Info_Systems/Final Project/ClassificationCompetition/data/'
trainname, testname = 'train.jsonl', 'test.jsonl'

df = pd.read_json(f'{filepath}{trainname}', lines=True)
# Encode the string label as an integer: 'SARCASM' -> 1, anything else -> 0.
df['label'] = df['label'].map(lambda raw_label: 1 if raw_label == 'SARCASM' else 0)

df_pred = pd.read_json(f'{filepath}{testname}', lines=True)

In [None]:
# data cleaning and emoji conversion
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation]) + '|\\’'
# NOTE: RE_PUNCTUATION is only needed by the disabled lower-casing /
# punctuation-stripping variant of this step; the pipeline below keeps case
# and punctuation.

def clean_response(responses):
  """Normalise raw response text before tokenization.

  Steps, in order: remove the literal '@USER' placeholder, convert emoji to
  their text names with emoji.demojize, replace every ':' with a space,
  collapse runs of spaces into one, and strip leading/trailing whitespace.

  Args:
    responses: pandas Series of raw response strings.

  Returns:
    pandas Series of cleaned strings with the same index as the input.
  """
  return (
      responses
      .str.replace('@USER', '', regex=False)  # plain substring, no regex needed
      .apply(lambda x: emoji.demojize(x).replace(':', ' '))
      # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
      # literal by default, which would silently stop collapsing spaces.
      .str.replace(' +', ' ', regex=True)
      .str.strip()
  )

df['text'] = clean_response(df['response'])
df_pred['text'] = clean_response(df_pred['response'])

In [None]:
# inspect the training frame, including the derived 'text' column
df

Unnamed: 0,label,response,context,text
0,1,@USER @USER @USER I don't get this .. obviously you do care or you would've moved right along .....,"[A minor child deserves privacy and should be kept out of politics . Pamela Karlan , you should ...",I don't get this .. obviously you do care or you would've moved right along .. instead you decid...
1,1,@USER @USER trying to protest about . Talking about him and his labels and they label themselves...,"[@USER @USER Why is he a loser ? He's just a Press Secretary, @USER @USER having to make up excu...",trying to protest about . Talking about him and his labels and they label themselves WTF does th...
2,1,"@USER @USER @USER He makes an insane about of money from the MOVIES , Einstein ! #LearnHowTheSys...",[Donald J . Trump is guilty as charged . The evidence is clear . If your Senator votes to acquit...,"He makes an insane about of money from the MOVIES , Einstein ! #LearnHowTheSystemWorks"
3,1,@USER @USER Meanwhile Trump won't even release his SAT scores and his Wharton professors said he...,"[Jamie Raskin tanked Doug Collins . Collins looks stupid . <URL>, @USER But not half as stupid a...",Meanwhile Trump won't even release his SAT scores and his Wharton professors said he was the dum...
4,1,"@USER @USER Pretty Sure the Anti-Lincoln Crowd Claimed That "" Democracy Was on the Ballot "" in 1...","[Man ... y ’ all gone “ both sides ” the apocalypse one day . <URL>, @USER They already did . Ob...","Pretty Sure the Anti-Lincoln Crowd Claimed That "" Democracy Was on the Ballot "" in 1860 , too . ..."
...,...,...,...,...
4995,0,@USER You don't . I have purchased a lot on Amazon ( check my entire spending history ) and toda...,[@USER Apologies for the inconvenience you faced with your order . We would like to take a close...,You don't . I have purchased a lot on Amazon ( check my entire spending history ) and today you ...
4996,0,@USER #Emotions you say 🤔 never knew that I think I ’ m just happy when I ’ m #eating and when I...,"[@USER 🤔 idk tho , I think I ’ m #hungry . But that ’ s definitely just a #mango and not a #sala...",#Emotions you say thinking_face never knew that I think I ’ m just happy when I ’ m #eating and ...
4997,0,"@USER @USER @USER You are so right ... "" Yes ! #Silence is not #Privacy is not ""","[@USER @USER @USER Peace to you , and two countries certainly seems more ideal than a greater nu...","You are so right ... "" Yes ! #Silence is not #Privacy is not """
4998,0,@USER @USER @USER Another lazy delusional voter who takes the word of corporatists at face value...,"[Bernie Sanders told Elizabeth Warren in private 2018 meeting that a woman can't win , sources s...","Another lazy delusional voter who takes the word of corporatists at face value , instead of doin..."


In [None]:
# shuffle data and train-test split for validation
SEED = 42       # fixed seed so the shuffle (and therefore the split) is reproducible
N_TRAIN = 5000  # if set to 5000, then use entire data set for training (final submission)
N_VALID = 500   # rows taken from the end of the shuffled frame for validation

df_shuffle = shuffle(df, random_state=SEED)
df_train = df_shuffle.head(N_TRAIN)
df_test = df_shuffle.tail(N_VALID)

# head() and tail() overlap whenever N_TRAIN + N_VALID exceeds the number of
# rows. In that case the validation rows were also trained on, so the metrics
# reported during training are not a true hold-out estimate. Say so loudly.
n_leaked = len(df_train.index.intersection(df_test.index))
if n_leaked:
  print(f'WARNING: {n_leaked} of {len(df_test)} validation rows are also in the training set; '
        'validation metrics will be optimistic. Lower N_TRAIN for an honest hold-out.')

In [None]:
# Download the pre-trained cased BERT checkpoint: tokenizer first, then the
# sequence-classification model.
# (Disabled alternatives: 'bert-base-uncased', with do_lower_case=True on the
#  tokenizer, hidden_dropout_prob=0.2 on the model, or BertTokenizerFast.)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# print(df['response'])
# print(df['response'].apply(lambda x: emoji.demojize(x).replace(':',' ')))

In [None]:
# # test pre-trained BERT tokenizer
# text_batch = ["That's funny!", emoji.demojize("Are you serious?!!🤣🤔")]
# print(emoji.demojize("Are you serious?!!🤣🤔"))
# encoding = tokenizer(text_batch, return_tensors='pt', padding='max_length', truncation=True, max_length=64)
# input_ids = encoding['input_ids']
# attention_mask = encoding['attention_mask']
# print(input_ids,attention_mask)

In [None]:
# tokenize the training and test data set
def tokenize(batch, max_length=64):
  """Tokenize the 'text' entry of a dataset example with the module-level tokenizer.

  Every sequence is padded to `max_length` and truncated to it, so all
  encoded rows have the same length.

  Args:
    batch: mapping with a 'text' entry (what Dataset.map passes in).
    max_length: padded/truncated sequence length; defaults to 64, the value
      this notebook was tuned with.

  Returns:
    The encoding produced by `tokenizer` for batch['text'].
  """
  return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=max_length)

# Build one Hugging Face Dataset per split from the 'text'/'label' columns,
# then tokenize every row of it.
train_dataset = Dataset.from_pandas(df_train[['text','label']])
train_dataset = train_dataset.map(tokenize)
test_dataset = Dataset.from_pandas(df_test[['text','label']])
test_dataset = test_dataset.map(tokenize)

# Return the selected columns as torch tensors when the datasets are indexed.
for split_dataset in (train_dataset, test_dataset):
  split_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [None]:
# compute additional metrics
# reference: https://huggingface.co/transformers/training.html
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall', where the last
        three are computed with binary averaging.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(y_true, y_hat, average='binary')
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    scores['f1'] = f_score
    scores['precision'] = prec
    scores['recall'] = rec
    return scores

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # optimisation (warmup_steps=500 and weight_decay=0.01 are left disabled)
    learning_rate=2e-5,
    num_train_epochs=4,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=50,
    # step-based evaluation, logging every 400 steps
    evaluation_strategy='steps',
    logging_steps=400,
)

# Wire the model, its training arguments, both datasets and the metric
# callback into a Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# perform training, comment out the next line if not re-training the model
# (runs the Trainer built from `model`, `training_args` and the two datasets)
trainer.train()

  return torch.tensor(x, **format_kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
400,0.519373,0.434224,0.79,0.812165,0.69419,0.978448
800,0.354357,0.165553,0.938,0.934183,0.920502,0.948276
1200,0.237919,0.076699,0.984,0.982759,0.982759,0.982759
1600,0.166114,0.056258,0.988,0.987069,0.987069,0.987069
2000,0.07759,0.054811,0.988,0.987124,0.982906,0.991379


TrainOutput(global_step=2000, training_loss=0.27107021522521974)

In [None]:
# Tokenize the prediction set with the same tokenize() helper; only the
# 'text' column is carried over, so there is no 'label' to expose.
pred_dataset = Dataset.from_pandas(df_pred[['text']])
pred_dataset = pred_dataset.map(tokenize)
pred_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

HBox(children=(FloatProgress(value=0.0, max=1800.0), HTML(value='')))




In [None]:
# Batch the prediction set for inference. No shuffle argument is passed, so
# batches follow the dataset order (the answer file relies on that order).
data_loader = torch.utils.data.DataLoader(pred_dataset, batch_size=32)

# Sanity check: shape of the full input-id tensor, then of the first batch.
print(pred_dataset['input_ids'].shape)
first_batch = next(iter(data_loader))
print(first_batch['input_ids'].shape)

torch.Size([1800, 64])
torch.Size([32, 64])


In [None]:
# perform prediction
# Make sure the model lives on the same device the batches are moved to
# (a no-op when it is already there).
model.to(device)
model.eval()
batch_preds = []
with torch.no_grad():
  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # outputs[0] holds the logits; the index of the max logit is the class.
    _, preds = torch.max(outputs[0], dim=1)
    batch_preds.append(preds.to('cpu').numpy())
# Concatenate once instead of np.append per batch, which re-copies the whole
# accumulator on every iteration. The empty float array in front keeps the
# original float dtype of `result`.
result = np.concatenate([np.zeros(0)] + batch_preds)

In [None]:
# number of predictions equal to 1 (the SARCASM class)
np.sum(result==1)

969

In [None]:
# peek at the first ten predictions
result[:10]

array([1., 1., 1., 0., 1., 1., 0., 1., 1., 1.])

In [None]:
# write prediction result to disk
# One line per prediction: 'twitter_<n>,<label>', numbered from 1 in prediction order.
with open('answer_pt_v6.txt', 'w') as f:
  for i, label in enumerate(result, start=1):
    pred = 'SARCASM' if label == 1 else 'NOT_SARCASM'
    # write(), not writelines(): writelines() expects an iterable of lines and
    # would walk the string one character at a time.
    f.write(f'twitter_{i},{pred}\n')

In [None]:
# save the model
# NOTE: this rebinds `filepath`, which earlier pointed at the data directory,
# to the model directory.
filepath = '/content/drive/My Drive/CS410_Text_Info_Systems/Final Project/ClassificationCompetition/model/'
# The disabled line below would pickle the whole model object to
# 'bert_ft_4epochs.pt' in that directory.
# torch.save(model, filepath + 'bert_ft_4epochs.pt') # uncomment to save new model

# Demo

In [None]:
# check if model is available
filepath = '/content/drive/My Drive/CS410_Text_Info_Systems/Final Project/ClassificationCompetition/model/'
dir = filepath.replace(' ','\ ')
!ls $dir

bert_ft_4epochs.pt


In [None]:
# read the model
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints you created or fully trust.
# NOTE: PyTorch >= 2.6 defaults to weights_only=True, which rejects a fully
# pickled model like this one; pass weights_only=False there (trusted files only).
# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint saved on a GPU also loads on a CPU-only runtime.
model_saved = torch.load(filepath + 'bert_ft_4epochs.pt', map_location=device)
model_saved.to(device)

# perform prediction
model_saved.eval()
demo_batch_preds = []
with torch.no_grad():
  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    outputs = model_saved(input_ids=input_ids, attention_mask=attention_mask)
    # outputs[0] holds the logits; the index of the max logit is the class.
    _, preds = torch.max(outputs[0], dim=1)
    demo_batch_preds.append(preds.to('cpu').numpy())
# Concatenate once instead of np.append per batch. The empty float array in
# front keeps the original float dtype of `result_1`.
result_1 = np.concatenate([np.zeros(0)] + demo_batch_preds)

In [None]:
# number of predictions from the reloaded model equal to 1 (the SARCASM class)
np.sum(result_1==1)

969

In [None]:
# peek at the first ten predictions from the reloaded model
result_1[:10]

array([1., 1., 1., 0., 1., 1., 0., 1., 1., 1.])

In [None]:
# write prediction result to disk
# One line per prediction: 'twitter_<n>,<label>', numbered from 1 in prediction order.
with open('answer_demo.txt', 'w') as f:
  for i, label in enumerate(result_1, start=1):
    pred = 'SARCASM' if label == 1 else 'NOT_SARCASM'
    # write(), not writelines(): writelines() expects an iterable of lines and
    # would walk the string one character at a time.
    f.write(f'twitter_{i},{pred}\n')

In [None]:
# retrieve submitted solution
f_sol = '/content/drive/My Drive/CS410_Text_Info_Systems/Final Project/ClassificationCompetition/result/answer.txt'.replace(' ','\ ')
!cp $f_sol .

In [None]:
# test for difference
# (diff prints nothing when the two files are identical)
!diff ./answer.txt ./answer_demo.txt

### References:
https://huggingface.co/transformers/v3.4.0/training.html

https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

https://medium.com/atheros/text-classification-with-transformers-in-tensorflow-2-bert-2f4f16eff5ad