In [0]:
import torch
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score

In [0]:
# Training hyperparameters.
BATCH_SIZE = 12  # mini-batch size used by the training DataLoader
EPOCHS = 10      # number of full passes over the training split

# Seed every RNG up front, before any model weights are created.
# The notebook's later seeding cell runs only after
# BertForSequenceClassification.from_pretrained(), so without this the freshly
# initialised classification head would differ from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

In [38]:
# Read the labelled text samples (columns used later: `text`, `label`) and
# preview the first rows.
TRAIN_CSV = 'train_data.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,text,label
0,daily mood https://twitter.com/LIBGyal/status...,2
1,I am sorry you and others have to go through t...,2
2,Great point. But loyalty eventually has great...,1
3,Thank you so much sweet girl love you!!!,0
4,corona virus killing sports smh,2


In [0]:
# Emoji displayed for each integer class id; position in the tuple == class id.
_EMOJI_BY_CLASS = ("\u2764\uFE0F", "😄", "😔", "🥺", "😤")
label_to_emoji = dict(enumerate(_EMOJI_BY_CLASS))

In [0]:
# Stratified 85/15 split over the row index (not the text itself), so each
# split can be tagged back onto `df` by index afterwards.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

In [0]:
# Record which split each row belongs to; 'TBD' is a placeholder that every
# row should lose once both index sets have been applied.
df['type'] = 'TBD'
df.loc[X_val, 'type'] = 'val'
df.loc[X_train, 'type'] = 'train'

In [42]:
# Sanity check: number of rows per (label, split) pair.
df.groupby(by=['label', 'type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,type,Unnamed: 2_level_1
0,train,1715
0,val,303
1,train,1728
1,val,305
2,train,1740
2,val,307
3,train,1724
3,val,304
4,train,1718
4,val,304


In [0]:
# Tokenizer matching the uncased BERT checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
def _encode_split(split_name):
  """Tokenize the `text` of every row in `df` whose `type` equals `split_name`.

  Returns the tokenizer's batch encoding as PyTorch tensors, with special
  tokens added, an attention mask included, and padding to length 256.
  """
  return tokenizer.batch_encode_plus(
      df[df.type==split_name].text.values,
      add_special_tokens=True,
      return_attention_mask=True,
      pad_to_max_length=True,
      max_length=256,
      return_tensors='pt'
  )


tokens_train = _encode_split('train')
tokens_val = _encode_split('val')

# Training tensors: token ids, attention masks and integer class labels.
input_ids_train, attention_masks_train = tokens_train['input_ids'], tokens_train['attention_mask']
labels_train = torch.tensor(df[df.type=='train'].label.values)

# Validation tensors, built the same way from the 'val' rows.
input_ids_val, attention_masks_val = tokens_val['input_ids'], tokens_val['attention_mask']
labels_val = torch.tensor(df[df.type=='val'].label.values)

In [0]:
# Bundle (input ids, attention mask, label) per example; the training and
# evaluation loops rely on this positional order.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
val_tensors = (input_ids_val, attention_masks_val, labels_val)

dataset_train = TensorDataset(*train_tensors)
dataset_val = TensorDataset(*val_tensors)

In [0]:
# Pretrained BERT encoder with a sequence-classification head sized to the
# number of emoji classes; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_to_emoji),
    output_hidden_states=False,
    output_attentions=False,
)

In [0]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
loader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation keeps dataset order so predictions line up with the 'val' rows
# of `df`; it uses a fixed batch size of 32.
val_sampler = SequentialSampler(dataset_val)
loader_val = DataLoader(dataset_val, sampler=val_sampler, batch_size=32)

In [0]:
# AdamW over all model parameters (encoder and classification head alike).
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [0]:
# Linear decay of the learning rate to zero with no warm-up.
# scheduler.step() is called once per batch in the training loop, so the total
# number of scheduler steps is (batches per epoch) * EPOCHS. Using the number
# of *samples* here instead would stretch the schedule by a factor of
# BATCH_SIZE and the learning rate would barely decay during training.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(loader_train)*EPOCHS
)

In [0]:
def calc_f1_score(preds, labels):
  """Return the macro-averaged F1 score of `preds` against `labels`.

  `preds` holds one row of per-class scores per example; the predicted class
  is the argmax along axis 1. `labels` holds the true class ids and is
  flattened before comparison.
  """
  predicted_classes = np.argmax(preds, axis=1).flatten()
  return f1_score(labels.flatten(), predicted_classes, average='macro')

In [0]:
def acc_per_class(preds, labels):
  """Print, for every class present in `labels`, its emoji and its accuracy.

  Accuracy for a class is the fraction of examples whose true label is that
  class and whose argmax prediction (axis 1 of `preds`) is the same class.
  Nothing is returned.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  actual = labels.flatten()

  for cls in np.unique(actual):
    # Predictions restricted to the examples that truly belong to `cls`.
    predicted_for_cls = predicted[actual==cls]
    n_correct = len(predicted_for_cls[predicted_for_cls==cls])
    n_total = len(actual[actual==cls])
    print('Class: ', label_to_emoji[cls])
    print('Accuracy: ', n_correct/n_total)

In [0]:
# Put every random number generator used below (Python, NumPy, PyTorch CPU
# and all CUDA devices) into a known state.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
  seed_rng(SEED)

In [53]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU,
# then move the model's parameters onto the chosen device.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

print(device)

cuda


In [0]:
def evaluate(loader_val):
  """Run the global `model` over `loader_val` without computing gradients.

  Each batch is expected to be (input_ids, attention_mask, labels), the order
  in which the datasets in this notebook are built.

  Returns a tuple of:
    - the loss averaged over the number of batches,
    - a NumPy array of logits for every example, in loader order,
    - a NumPy array of the matching true labels.
  """
  model.eval()

  running_loss = 0
  logits_per_batch, labels_per_batch = [], []

  for batch in loader_val:
    on_device = [tensor.to(device) for tensor in batch]
    input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    # With labels supplied, the first output is the loss and the second the logits.
    running_loss += outputs[0].item()
    logits_per_batch.append(outputs[1].detach().cpu().numpy())
    labels_per_batch.append(labels.cpu().numpy())

  mean_loss = running_loss/len(loader_val)

  all_logits = np.concatenate(logits_per_batch, axis=0)
  all_labels = np.concatenate(labels_per_batch, axis=0)

  return mean_loss, all_logits, all_labels

In [0]:
# Fine-tune for EPOCHS passes over the training loader. After every epoch the
# weights are checkpointed and the model is scored on the validation loader.
for epoch in tqdm(range(1, EPOCHS+1)):
  model.train()

  loss_train_total = 0

  progress_bar = tqdm(loader_train,
                      desc=f'Epoch {epoch}',
                      leave=False,
                      disable=False)
  for batch in progress_bar:
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2]
    }

    outputs = model(**inputs)
    loss = outputs[0]
    batch_loss = loss.item()
    loss_train_total += batch_loss
    loss.backward()

    # Cap the global gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()  # one scheduler step per batch

    # Show the batch loss as returned by the model. `batch` is the tuple of
    # (input_ids, attention_mask, labels), so dividing by len(batch) would
    # divide by 3 regardless of batch size and under-report the loss.
    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

  torch.save(model.state_dict(), f'Model_epoch{epoch}.model')

  tqdm.write(f'\nEpoch {epoch}')

  # Average of the per-batch losses over the epoch.
  loss_train_avg = loss_train_total/len(loader_train)
  tqdm.write(f'Training loss: {loss_train_avg}')

  val_loss, y_pred, y_true = evaluate(loader_val)
  val_f1 = calc_f1_score(y_pred, y_true)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'f1-score: {val_f1}')

In [0]:
# Score the validation set once more with the final weights; only the logits
# and the true labels are kept, the loss is discarded.
final_eval = evaluate(loader_val)
_, pred, true = final_eval

In [0]:
# Turn each row of logits into the emoji of its highest-scoring class.
pred_emoji = [label_to_emoji[class_id] for class_id in np.argmax(pred, axis=1).flatten()]

In [0]:
# Pair every validation text with its predicted emoji and write the table to
# disk. The validation loader is sequential, so row order matches `pred_emoji`.
val_texts = df[df.type=='val'].text.values
df_results = pd.DataFrame({'text': val_texts, 'pred': pred_emoji})
df_results.to_csv('test_results.csv')