In [None]:
# Colab setup: install the third-party packages imported below (transformers, emoji).
# NOTE(review): versions are not pinned, so a fresh run may pick up different releases — consider pinning.
!pip install -q transformers
!pip install -q emoji --upgrade

[K     |████████████████████████████████| 5.8 MB 12.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 65.4 MB/s 
[K     |████████████████████████████████| 182 kB 79.6 MB/s 
[K     |████████████████████████████████| 240 kB 9.6 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertConfig

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from textwrap import wrap
import re
from emoji import demojize
from collections import defaultdict
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Render figures inline, at high resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global plot settings: seaborn theme, custom colour palette, default figure size
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Fixed seed: seeds numpy and torch here, and is passed as `random_state` to train_test_split later
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Load the pretrained BERT tokenizer matching the chosen model variant
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'   # model 1: 'bert-base-cased'; model 2: 'bert-large-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Checkpoint location of the general-domain model, relative to `path` (the two are concatenated when saving/loading)
PRE_TRAINED_DICT_NAME = 'checkpoints/' + PRE_TRAINED_MODEL_NAME + '/pretrain_model_dict.bin'

In [None]:
# Mount Google Drive (Colab only) and choose the project root folder.
# `path` is used as a plain string prefix for data files and checkpoints, so keep the trailing slash.
# NOTE(review): the folder is user-specific — switch to the alternative below (or your own) when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/CS675 Machine Learning/Final_Project/' # Jintan
#path = '/content/drive/MyDrive/MLProject/' #Haochen's google drive

Mounted at /content/drive


**-----------------------------------------**

**Pretrain Data Setup**

**-----------------------------------------**

In [None]:
# Label the IMDB sentiment
# Load the review data for the general-domain training stage.
# NOTE(review): the file is read relative to the working directory, not from `path` — confirm where it lives.
df = pd.read_csv("imdb_reviews.csv") # due to the size of the dataset, we are not providing them in this notebook

def to_sentiment(rating):
  """Convert a review score into a sentiment class id.

  The score is cast to int; values of 2 or less map to 0, exactly 3 maps
  to 1, and anything above 3 maps to 2.
  """
  score = int(rating)
  if score == 3:
    return 1
  return 0 if score < 3 else 2

# Attach the numeric sentiment class derived from the review score
df['sentiment'] = df.score.apply(to_sentiment)

# The split / dataloader cells read the reviews from a frame called `data`,
# so expose the labelled frame under that name (previously this raised a NameError).
# NOTE(review): generate_dataloader reads `data.content` — confirm the CSV has a `content` column.
data = df

# Class names indexed by the ids produced by to_sentiment (0, 1, 2)
class_names = ['negative', 'neutral', 'positive']

**-----------------------------------------**

**Dataset and Model Setup**

**-----------------------------------------**

In [None]:
# dataset class 
class SentimentDataset(Dataset):
  """Map-style dataset that tokenizes one text per item.

  Each item is a tuple ``(input_ids, attention_mask, label)`` of tensors.
  ``input_ids`` and ``attention_mask`` are padded / truncated to exactly
  ``max_token_len`` entries so that the default DataLoader collation can
  stack them into ``(batch, max_token_len)`` tensors.

  Args:
    text_input: indexable collection of texts (each item is cast to ``str``).
    label: indexable collection of integer class ids, aligned with ``text_input``.
    tokenizer: callable tokenizer accepting ``max_length``, ``padding`` and
      ``truncation`` keyword arguments and returning a mapping with
      ``'input_ids'`` and ``'attention_mask'``.
    max_token_len: fixed sequence length of every returned item.
  """

  def __init__(self, text_input, label, tokenizer, max_token_len):
    # input and label
    self.text_input = text_input
    self.label = label

    # text tokenizer
    self.tokenizer = tokenizer

    # tokenizer property
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.text_input)

  def __getitem__(self, item):
    # get text and label
    text_input = str(self.text_input[item])

    # tokenize; pad short texts and truncate long ones so every item has the same length
    encoding = self.tokenizer(
      text_input,
      add_special_tokens=True,
      max_length=self.max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
    )

    # Return tensors rather than Python lists: the default collate function treats a
    # list as a sequence of separate fields and would hand back one tensor per token
    # position instead of a single (batch, max_token_len) tensor.
    input_ids = torch.tensor(encoding['input_ids'], dtype=torch.long)
    attention_mask = torch.tensor(encoding['attention_mask'], dtype=torch.long)
    label = torch.tensor(self.label[item], dtype=torch.long)

    return input_ids, attention_mask, label

In [None]:
# helper function to create dataset and dataloader
def generate_dataloader(data, tokenizer, max_token_len, batch_size, num_workers, shuffle=False):
  """Wrap a labelled DataFrame in a SentimentDataset and return a DataLoader over it.

  Args:
    data: DataFrame with a `content` column (texts) and a `sentiment` column (class ids).
    tokenizer: tokenizer forwarded to SentimentDataset.
    max_token_len: fixed sequence length forwarded to SentimentDataset.
    batch_size: number of items per batch.
    num_workers: number of DataLoader worker processes.
    shuffle: whether to reshuffle the data every epoch (default False, the previous behaviour).

  Returns:
    A torch DataLoader yielding the dataset's items in batches.
  """
  dataset = SentimentDataset(text_input=data.content.to_numpy(), label=data.sentiment.to_numpy(),
                             tokenizer=tokenizer, max_token_len=max_token_len)

  # the original passed an undefined name `ds` here, which raised a NameError
  return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)

In [None]:
# Fixed token length per example and number of sentiment classes
# NOTE: 160 was picked by analyzing the histogram of the sequence size
MAX_SEQ_LEN = 160
NUM_SENTIMENT = 3

# DataLoader settings
BATCH_SIZE = 16
NUM_WORKER = 2

# Hold out 30% of the rows, then halve the hold-out into validation and test
data_train, data_holdout = train_test_split(data, test_size=0.3, random_state=RANDOM_SEED)
data_val, data_test = train_test_split(data_holdout, test_size=0.5, random_state=RANDOM_SEED)

# One loader per split, all built with the same settings
train_data_loader, val_data_loader, test_data_loader = (
  generate_dataloader(split, tokenizer, MAX_SEQ_LEN, BATCH_SIZE, NUM_WORKER)
  for split in (data_train, data_val, data_test)
)

In [None]:
class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (size of the logit vector).
    dropout: dropout probability applied to the pooled BERT output (default 0.2).
  """

  def __init__(self, n_classes, dropout=0.2):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.dropout = nn.Dropout(p=dropout)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalized class scores, one row per input sequence."""
    # Read the pooled output by name instead of unpacking a 2-tuple: the tuple grows
    # (and unpacking fails) as soon as the encoder is configured to return extra outputs.
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
    pooled = self.dropout(encoded.pooler_output)
    return self.classifier(pooled)

# Build the classifier and move it to the GPU when one is available.
# (The original call passed an extra, undefined `dropout` argument and could not run.)
model = SentimentClassifier(NUM_SENTIMENT)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# number of passes over the training data
EPOCHS = 10

# AdamW over every parameter of the classifier
optimizer = AdamW(model.parameters(), lr=2e-5)

# linear learning-rate decay without warm-up, stepped once per batch # TODO
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps,
)

# cross-entropy over the class scores
loss_fn = nn.CrossEntropyLoss().to(device)

**-----------------------------------------**

**Define training & evaluation functions**

**-----------------------------------------**

In [None]:
def train(model, data_loader, loss_fn, optimizer, scheduler, device, n_examples):
  """Run one training pass over `data_loader`, updating the model after every batch.

  NOTE(review): a later cell defines another function that is also called `train`;
  once that cell runs, the name no longer refers to this function.

  Args:
    model: module called as model(input_ids=..., attention_mask=...) returning class scores.
    data_loader: iterable of (input_ids, attention_mask, label) tensor batches.
    loss_fn: loss taking (scores, labels).
    optimizer: optimizer stepped once per batch.
    scheduler: learning-rate scheduler stepped once per batch.
    device: device the batches are moved to.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    (accuracy, mean_loss): accuracy as a 0-d double tensor, mean batch loss as a float.
  """
  # enable weight update
  model = model.train()

  losses = []
  # kept as a tensor so the returned accuracy is a tensor even for an empty loader
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for input_ids, attention_mask, label in data_loader:
    # convert to acceleration unit
    # (the original read undefined names `text_input` / `targets` here and raised NameError)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    label = label.to(device)

    # forward
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # predicted class = highest score; loss against the true labels
    _, pred = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, label)

    # update gradient (with gradient-norm clipping at 1.0)
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

    # accumulate accuracy statistics of the current training epoch
    correct_predictions += torch.sum(pred == label)
    losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
def eval(model, data_loader, loss_fn, device, n_examples):
  """Evaluate the model on `data_loader` without updating any weights.

  NOTE(review): the name shadows Python's builtin `eval`; it is kept because other
  cells call this function by that name.

  Args:
    model: module called as model(input_ids=..., attention_mask=...) returning class scores.
    data_loader: iterable of (input_ids, attention_mask, label) tensor batches.
    loss_fn: loss taking (scores, labels).
    device: device the batches are moved to.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    (accuracy, mean_loss): accuracy as a 0-d double tensor, mean batch loss as a float.
  """
  # evaluation mode (e.g. disables dropout)
  model = model.eval()

  losses = []
  # kept as a tensor so the returned accuracy is a tensor even for an empty loader
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  # no gradients are needed for evaluation
  with torch.no_grad():
    for input_ids, attention_mask, label in data_loader:
      # convert to acceleration unit
      # (the original read undefined names `text_input`, `targets` and `preds` in this loop)
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      label = label.to(device)

      # forward
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)

      # predicted class = highest score; loss against the true labels
      _, pred = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, label)

      # accumulate accuracy statistics of the current evaluation pass
      correct_predictions += torch.sum(pred == label)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
def _train_one_epoch(model, data_loader, loss_fn, optimizer, scheduler, device, n_examples):
  """One optimisation pass over `data_loader`; returns (accuracy tensor, mean batch loss).

  The per-epoch routine from the earlier cell is also named `train` and is shadowed by
  the `train` defined below, so calling it by name from inside the loop would recurse
  into the loop itself. The per-epoch step therefore lives in this private helper.
  """
  model = model.train()

  losses = []
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for input_ids, attention_mask, label in data_loader:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    label = label.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, pred = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, label)

    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

    correct_predictions += torch.sum(pred == label)
    losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)


def train(EPOCHS, model, train_data_loader, val_data_loader, data_train, data_val, loss_fn, optimizer, scheduler, device, dict_name):
  """Train for `EPOCHS` epochs, validating after each and checkpointing the best model.

  Args:
    EPOCHS: number of epochs to run.
    model: model to train.
    train_data_loader / val_data_loader: batch iterables for training and validation.
    data_train / data_val: the underlying splits; only their lengths are used
      (as accuracy denominators).
    loss_fn, optimizer, scheduler, device: forwarded to the per-epoch routines.
    dict_name: checkpoint file name, appended to the global `path`.

  Returns:
    defaultdict(list) with keys 'train_acc', 'train_loss', 'val_acc', 'val_loss',
    one entry per epoch.
  """
  import os  # local import: only needed to create the checkpoint directory

  history = defaultdict(list)
  best_acc = 0
  checkpoint_file = path + dict_name

  for epoch in range(EPOCHS):

    print('Starting Epoch {}...'.format(epoch))

    # train
    train_acc, train_loss = _train_one_epoch(
      model,
      train_data_loader,
      loss_fn,
      optimizer,
      scheduler,
      device,
      len(data_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # evaluate
    val_acc, val_loss = eval(
      model,
      val_data_loader,
      loss_fn,
      device,
      len(data_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')

    # record
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # save model ckpt only when validation accuracy improves
    if val_acc > best_acc:
      checkpoint_dir = os.path.dirname(checkpoint_file)
      if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
      torch.save(model.state_dict(), checkpoint_file)
      # the original assigned `best_accuracy`, so `best_acc` stayed 0 and every epoch
      # overwrote the checkpoint regardless of whether it was better
      best_acc = val_acc
      print("new val acc best! saving model...")

  return history

history = train(EPOCHS, model, train_data_loader, val_data_loader, data_train, data_val, loss_fn, optimizer, scheduler, device, PRE_TRAINED_DICT_NAME)

**-----------------------------------------**

**Evaluate pre-trained model accuracy**

**-----------------------------------------**

In [None]:
# To resume from a saved checkpoint after reloading the notebook, uncomment the next line
# model.load_state_dict(torch.load("/content/drive/MyDrive/MLProject/checkpoints/bert-large-cased/pretrain_model_dict.bin"))

# accuracy and loss of the general-domain model on the held-out test split
n_test_examples = len(data_test)
test_acc, test_loss = eval(model, test_data_loader, loss_fn, device, n_test_examples)

print("Test accuracy {}".format(test_acc.item()))

**FINE-TUNING STARTS HERE**

**-----------------------------------------**

**Read fine-tune Data**

**-----------------------------------------**

In [None]:
def _load_labeled_csv(relative_path, title, max_rows=None):
  """Read a two-column labelled CSV located at `path` + `relative_path`.

  The columns are renamed to ['content', 'label']. When `max_rows` is given, rows
  from that position onwards are dropped. The title and the final row count are printed.
  """
  frame = pd.read_csv(path + relative_path)
  frame.columns = ['content', 'label']
  if max_rows is not None:
    frame = frame.drop(frame.index[max_rows:len(frame)])
  print(title)
  print(len(frame))
  return frame

# herbal
data_herbal = _load_labeled_csv("/data/Labeling_Sample_herbal_medicine.csv", "Herbal")

# integrative medicine (only the first 127 rows are kept)
data_im = _load_labeled_csv("/data/integrative_medicine_labeled_csv.csv", "integrative medicine", max_rows=127)

# qi gong tweet & reply
data_qg_tweet = _load_labeled_csv("/data/qi_gong_tweet.csv", "qi gong tweet")
data_qg_reply = _load_labeled_csv("/data/qi_gong_reply.csv", "qi gong reply")

# acupuncture tweet & reply
data_acup_tweet = _load_labeled_csv("/data/acup_tweet.csv", "acpuate tweet")
data_acup_reply = _load_labeled_csv("/data/acup_reply.csv", "acpuate reply")

# stack all sources row-wise into one frame
data = pd.concat(
  [data_herbal, data_im, data_qg_tweet, data_qg_reply, data_acup_tweet, data_acup_reply],
  axis=0,
)

In [None]:
# create label column in numerical form
# NOTE(review): this encoding (positive=0, neutral=1, negative=2) is the reverse of
# `to_sentiment` used for the general-domain data (0 for low scores, 2 for high scores),
# while the fine-tuned model starts from that general-domain checkpoint — confirm this is intended.
LABEL_TO_ID = {
  "positive": 0, "P": 0,
  "neutral": 1, "NU": 1, "IC": 1,
  "negative": 2, "NE": 2, "N": 2,
}

label_ids = data.label.map(LABEL_TO_ID)
unrecognized = label_ids.isna().to_numpy()
for label in data.label[unrecognized]:
  print("Unrecognized label {}".format(label))

# Previously an unrecognized label was reported but left out of the list, so the list
# became shorter than the frame and the column assignment failed. Drop those rows
# instead, keeping texts and labels aligned.
data = data[~unrecognized].copy()
numLabel = label_ids[~unrecognized].astype(int).tolist()
data["sentiment"] = numLabel

# per-class counts (used by the optional class-weighted loss)
pos_count = numLabel.count(0)
neu_count = numLabel.count(1)
neg_count = numLabel.count(2)

**-----------------------------------------**

**TCM Data Analysis and Cleaning**

**-----------------------------------------**

In [None]:
# brief analysis on hashtag
def find_hashtags(tweet):
    """Return every hashtag found in `tweet`, in order of appearance.

    A hashtag is '#' followed by at least one letter and then at least one more
    letter, digit, hyphen or underscore.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return [match.group(1) for match in hashtag_pattern.finditer(tweet)]
  
# list of hashtags per tweet
data['hashtag'] = data.content.apply(find_hashtags)

# flatten the per-tweet lists into a single column; naming the column up front
# also works when no tweet contains a hashtag
hashtag_list = data['hashtag'].to_list()
flat_hashtags_df = pd.DataFrame({'hashtag': [item for sublist in hashtag_list for item in sublist]})

print("Total hashtags: ", len(flat_hashtags_df['hashtag']))
# this is the number of distinct hashtags (it was previously printed as "Repeated hashtags")
print("Unique hashtags: ", len(flat_hashtags_df['hashtag'].unique()))

# 20 most frequent hashtags
if not flat_hashtags_df.empty:
  flat_hashtags_df['hashtag'].value_counts()[:20].plot(kind='barh');

In [None]:
# clean data: remove emojis, web links, mentions, hashtags, etc.
import emoji 

def clean(tweet):
  """Strip non-textual noise from a tweet and return the cleaned string.

  Removes hyperlinks, @mentions, #hashtags and emoji; replaces newlines, colons,
  underscores and the HTML entity for '&' with a space.
  """
  txt = re.sub(r"https?://\S+", "", tweet) #remove hyperlink
  txt = re.sub("\n", " ", txt)
  txt = re.sub(":", " ", txt)
  # consume the entity's trailing ';' as well, otherwise "&amp;" leaves a stray ';' behind
  txt = re.sub(r"&amp;?", " ", txt)
  txt = re.sub(r'@[A-Za-z0-9_]+[A-Za-z0-9-_]+', '', txt) #remove mention
  txt = re.sub(r'#[A-Za-z]+[A-Za-z0-9-_]+', '', txt) #remove hashtag
  txt = re.sub("_", " ", txt)
  txt = emoji.replace_emoji(txt, replace='')
  return txt

# overwrite the text column with its cleaned version, then preview the first rows
data['content'] = data['content'].map(clean)
data.head(10)

In [None]:
# check label distribution
# pass the column as `x=`: recent seaborn versions treat the first positional
# argument as `data`, not as the variable to count
sns.countplot(x=data.sentiment)
plt.xlabel('sentiment');

**-----------------------------------------**

**Prepare fine-tune**

**-----------------------------------------**

In [None]:
# create model for fine-tune
modelTCM = SentimentClassifier(NUM_SENTIMENT)

# load pretrained checkpoint (general domain)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well
modelTCM.load_state_dict(torch.load(path + PRE_TRAINED_DICT_NAME, map_location=device))
modelTCM = modelTCM.to(device)

In [None]:
# DataLoader settings
BATCH_SIZE = 16
NUM_WORKER = 2

# Hold out 30% of the rows, then halve the hold-out into validation and test
data_train, data_holdout = train_test_split(data, test_size=0.3, random_state=RANDOM_SEED)
data_val, data_test = train_test_split(data_holdout, test_size=0.5, random_state=RANDOM_SEED)

# One loader per split, all built with the same settings
train_data_loader, val_data_loader, test_data_loader = (
  generate_dataloader(split, tokenizer, MAX_SEQ_LEN, BATCH_SIZE, NUM_WORKER)
  for split in (data_train, data_val, data_test)
)

In [None]:
# Plot the label distribution of each split.
# Each split gets its own axes: drawing the three count plots one after another in the
# same cell puts them on the same axes, overlaying the bars and overwriting the title.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (split_name, split) in zip(axes, [('train', data_train), ('val', data_val), ('test', data_test)]):
  sns.countplot(x=split.sentiment, ax=ax)
  ax.set_xlabel('sentiment')
  ax.set_title(split_name)
fig.tight_layout();

In [None]:
# Run the code below if you start fine-tuning directly

# hyperparameters
EPOCHS = 10

# optimizer
# It must own the parameters of the model being fine-tuned. The original was built over
# `model.parameters()` (the general-domain model), so stepping it never updated modelTCM.
optimizer = AdamW(modelTCM.parameters(), lr=1e-5)

# learning rate scheduler # TODO
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# loss function
# uncomment code below if you want to use weight cross-entropy function
# total_count = pos_count + neu_count + neg_count
# class_weight = torch.tensor([total_count/pos_count,total_count/neu_count,total_count/neg_count])
# loss_fn = nn.CrossEntropyLoss(class_weight).to(device)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# checkpoint file for the fine-tuned weights, relative to `path`
FINE_TUNE_DICT_NAME = 'checkpoints/'+PRE_TRAINED_MODEL_NAME+'/fine_tune_model_dict.bin'

# fine-tune modelTCM on the TCM splits and keep the per-epoch metrics
history_ft = train(
  EPOCHS, modelTCM,
  train_data_loader, val_data_loader,
  data_train, data_val,
  loss_fn, optimizer, scheduler, device,
  FINE_TUNE_DICT_NAME,
)

**-----------------------------------------**

**Evaluate TCM Fine-tune Result**

**-----------------------------------------**

In [None]:
# To resume from a saved checkpoint after reloading the notebook, uncomment the next two lines
# modelTCM.load_state_dict(torch.load(path + 'checkpoints/'+PRE_TRAINED_MODEL_NAME+'/fine_tune_model_dict_shuffle_llr_gen_entire.bin'))
# modelTCM = modelTCM.to(device)

# accuracy and loss of the fine-tuned model on the held-out test split
n_test_examples = len(data_test)
test_acc, test_loss = eval(modelTCM, test_data_loader, loss_fn, device, n_test_examples)

print("Test accuracy {}".format(test_acc.item()))

**-----------------------------------------**

**Further Analysis**

**-----------------------------------------**

In [None]:
# check training and val accuracy
# the recorded accuracies are tensors; bring them to the CPU before plotting
for metric in ('train_acc', 'val_acc'):
  history_ft[metric] = [acc.cpu() for acc in history_ft[metric]]

for metric, legend_name in (('train_acc', 'train'), ('val_acc', 'validation')):
  plt.plot(history_ft[metric], label=legend_name)

plt.title('TCM: Fine Tune')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.ylim([0, 1]);

**Precision, recall, F-1**

In [None]:
# utility function for getting predictions
import torch.nn.functional as F

def get_predictions(model, data_loader, device=None):
  """Run the model over `data_loader` and collect predicted and true class ids.

  Args:
    model: module called as model(input_ids=..., attention_mask=...) returning class scores.
    data_loader: iterable of (input_ids, attention_mask, label) tensor batches.
    device: device the inputs are moved to. When omitted, the device of the model's
      first parameter is used (CPU if the model has no parameters).

  Returns:
    (predictions, labels): two 1-D CPU tensors with one entry per example, in loader
    order. Both are empty long tensors when the loader yields no batches.
  """
  # evaluation mode (e.g. disables dropout)
  model = model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  predictions = []
  labels = []

  # no gradients are needed for inference
  with torch.no_grad():
    for input_ids, attention_mask, label in data_loader:
      # convert to acceleration unit
      # (the original read an undefined name `text_input` here and raised NameError)
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)

      # forward
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)

      # predicted class = index of the highest score
      _, prediction = torch.max(outputs, dim=1)

      predictions.append(prediction.cpu())
      labels.append(label.cpu())

  if not predictions:
    return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

  return torch.cat(predictions), torch.cat(labels)

In [None]:
# check precision, recall, and F-1
y_pred, y_test = get_predictions(modelTCM, test_data_loader)

# names indexed by the numeric class id used for the fine-tuning labels (0, 1, 2)
target_names = ['positive', 'neutral', 'negative']

# list the class ids explicitly: otherwise the report only covers classes that occur in
# y_test / y_pred and fails when that number differs from len(target_names)
print(classification_report(y_test, y_pred, labels=list(range(len(target_names))), target_names=target_names))

**Confusion matrix**

In [None]:
# utility function for visualizing confusion matrix
def show_confusion_matrix(confusion_matrix, title='Confusion Matrix - w pretrain, w/o fine-tune'):
  """Draw a confusion matrix as an annotated heatmap.

  Args:
    confusion_matrix: table of integer counts; its row labels are shown on the
      'Ground Truth' axis and its column labels on the 'Prediction' axis.
    title: figure title. The default is the previously hard-coded text; pass a
      different one when plotting another model (e.g. the fine-tuned one).
  """
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Reds")
  # keep row labels horizontal and tilt column labels for readability
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.title(title)
  plt.xlabel('Prediction');
  plt.ylabel('Ground Truth')

In [None]:
# check confusion matrix
# list the class ids explicitly so the matrix always has one row/column per name,
# even when a class is missing from both y_test and y_pred
confusion_mat = confusion_matrix(y_test, y_pred, labels=list(range(len(target_names))))

confusion_mat = pd.DataFrame(confusion_mat, index=target_names, columns=target_names)
show_confusion_matrix(confusion_mat)