In [None]:
! pip install transformers

In [1]:
import torch
import torch.nn as nn
import transformers
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
# Single seed shared by NumPy, PyTorch and the train/val/test split further down.
RANDOM_SEED = 42
# Seed NumPy's global random state.
np.random.seed(RANDOM_SEED)
# Seed PyTorch's default generator; as the cell's last expression its return value is echoed.
torch.manual_seed(RANDOM_SEED)

In [4]:
# Only the first N_ROWS rows of the file are used.
N_ROWS = 2000

# Passing `names` labels the two columns and makes pandas treat the first line
# of the file as data rather than as a header. `nrows` stops parsing after
# N_ROWS rows instead of loading the whole file and slicing it afterwards.
df = pd.read_csv('./train.csv', names=['label', 'review'], nrows=N_ROWS)

In [None]:
# Longest review, measured in whitespace-separated words. The columns are named
# when the CSV is loaded, so the text must be selected as 'review' (the integer
# key 0 raises KeyError). This is a word count, not a tokenizer token count, so
# treat it only as a rough guide when choosing a maximum sequence length.
max_len = max((len(text.split()) for text in df['review']), default=0)

max_len

In [None]:
# 80 / 10 / 10 partition: set aside 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test frames.
train_df, holdout_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    shuffle=True,
    random_state=RANDOM_SEED,
)

(train_df.shape, val_df.shape)

In [None]:
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one review per lookup.

  Args:
    review: Indexable sequence of raw review strings.
    label: Indexable sequence of integer class ids, aligned with `review`.
    tokenizer: Object exposing a Hugging Face style `encode_plus` method.
    max_len: Token length every example is truncated or padded to.
  """

  def __init__(self, review, label, tokenizer, max_len):
    self.review = review
    self.label = label
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.review)

  def __getitem__(self, idx):
    """Tokenize the review at position `idx`.

    Returns:
      dict with the raw text under 'review', the flattened token ids under
      'input_id', the flattened mask under 'attention_mask', and the class id
      as a long tensor under 'label'.
    """
    review = self.review[idx]
    encoding = self.tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        # Use the length given to the constructor instead of a fixed 64.
        max_length=self.max_len,
        truncation=True,
        return_token_type_ids=False,
        # Replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    return {
        'review': review,
        # return_tensors='pt' adds a leading batch axis; flatten() drops it so
        # the DataLoader can stack examples itself.
        'input_id': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(self.label[idx], dtype=torch.long)
    }

In [None]:
# Pretrained checkpoint name; read by both the tokenizer and the classifier's encoder.
bert_model = 'bert-base-uncased'

In [None]:

tokenizer = transformers.BertTokenizer.from_pretrained(bert_model)

# Sequence length handed to every dataset.
MAX_LEN = 128


def _make_dataset(frame):
  """Wrap one split's 'review' and 'label' columns in a CustomDataset.

  The columns are named when the CSV is loaded, so they are selected by name;
  the integer keys 0 / 1 raise KeyError on these frames. `.values` yields plain
  arrays, so the dataset can index by position even though each split keeps
  the shuffled row index of the original frame.
  """
  # NOTE(review): CrossEntropyLoss downstream needs class ids in
  # 0..n_class-1 - confirm the label encoding used in the CSV.
  return CustomDataset(
      review=frame['review'].values,
      label=frame['label'].values,
      tokenizer=tokenizer,
      max_len=MAX_LEN)


train_data = _make_dataset(train_df)
val_data = _make_dataset(val_df)
test_data = _make_dataset(test_df)

In [None]:
# Sanity check: number of examples in the training dataset.
len(train_data)

In [None]:
BATCH_SIZE = 32

# Only the training split is shuffled; validation and test keep their order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
    for split in (val_data, test_data)
)

In [None]:
class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer."""

  def __init__(self, n_class):
    """Load the encoder named by the global `bert_model` and build the head.

    Args:
      n_class: Number of output classes (width of the final linear layer).
    """
    super().__init__()
    self.bert = transformers.BertModel.from_pretrained(bert_model)
    encoder_width = self.bert.config.hidden_size
    self.dropout = nn.Dropout(p=0.3)
    self.linear = nn.Linear(encoder_width, n_class)
    # Registered but never applied in forward(), which returns unnormalized scores.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return one row of `n_class` unnormalized scores per input sequence."""
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the encoder output is the pooled sequence representation.
    return self.linear(self.dropout(encoder_out[1]))

In [None]:
# Number of output classes for the classification head.
n_class = 2
# The optimizer created below is built from this instance's parameters.
model = SentimentClassifier(n_class)

In [None]:
LR = 1e-5
EPOCHS = 10
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
TTL_STEPS = len(train_loader) * EPOCHS

# Expects raw class scores and integer class ids.
criterian = torch.nn.CrossEntropyLoss()

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped by recent transformers releases. PyTorch's version always
# applies bias correction (the old correct_bias=True); eps and weight_decay
# are set explicitly to the old class's defaults so the update rule is unchanged.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)
# Linear decay of the learning rate to zero over TTL_STEPS, with no warm-up phase.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=0, num_training_steps=TTL_STEPS)

In [None]:
def train_model(model, data_loader, criterian, optimizer, scheduler, batch_size, max_len, n_samples):
  """Run one training pass over `data_loader`.

  Args:
    model: Module called as `model(input_ids, attention_mask)`.
    data_loader: Iterable of dicts with 'input_id', 'attention_mask' and 'label'.
    criterian: Loss callable taking (predictions, labels).
    optimizer: Optimizer stepped once per batch.
    scheduler: Learning-rate scheduler stepped once per batch.
    batch_size: Accepted for call compatibility; not used here.
    max_len: Accepted for call compatibility; not used here.
    n_samples: Denominator used for the accuracy.

  Returns:
    (accuracy, mean of the per-batch losses).
  """
  model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    targets = batch['label']

    # Forward pass.
    scores = model(batch['input_id'], batch['attention_mask'])
    loss = criterian(scores, targets)

    # Backward pass; gradients are clipped to norm 1.0 before the update and
    # cleared afterwards so the next batch starts from zero.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    # Bookkeeping for the epoch summary.
    batch_losses.append(loss.item())
    predicted = torch.max(scores, dim=1)[1]
    n_correct += torch.sum(predicted == targets)

  accuracy = n_correct / n_samples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, criterian, batch_size, max_len, n_samples):
  """Score `model` on `data_loader` without updating any weights.

  Args:
    model: Module called as `model(input_ids, attention_mask)`.
    data_loader: Iterable of dicts with 'input_id', 'attention_mask' and 'label'.
    criterian: Loss callable taking (predictions, labels).
    batch_size: Accepted for call compatibility; not used here.
    max_len: Accepted for call compatibility; not used here.
    n_samples: Denominator used for the accuracy.

  Returns:
    (accuracy, mean of the per-batch losses).
  """
  model.eval()
  batch_losses = []
  n_correct = 0

  # No gradients are tracked while scoring.
  with torch.no_grad():
    for batch in data_loader:
      targets = batch['label']
      scores = model(batch['input_id'], batch['attention_mask'])

      batch_losses.append(criterian(scores, targets).item())
      predicted = torch.max(scores, dim=1)[1]
      n_correct += torch.sum(predicted == targets)

  accuracy = n_correct / n_samples
  return accuracy, np.mean(batch_losses)

In [None]:
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_acc = 0

for epoch in range(EPOCHS):
  print(f'epoch: {epoch+1}/{EPOCHS}')

  # Train the same `model` instance the optimizer was built from. Creating a
  # new SentimentClassifier here would hand train_model a model whose
  # parameters the optimizer does not own, so no epoch would ever improve it.
  train_acc, train_loss = train_model(model, train_loader, criterian, optimizer, scheduler, BATCH_SIZE, MAX_LEN, len(train_df))

  val_acc, val_loss = eval_model(model, val_loader, criterian, BATCH_SIZE, MAX_LEN, len(val_df))

  # Store plain Python floats rather than 0-dim tensors so the history is easy
  # to plot or serialize.
  train_acc, val_acc = float(train_acc), float(val_acc)

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  print(f'train_acc: {train_acc}, train_loss: {train_loss}, val_acc: {val_acc}, val_loss: {val_loss}')

  # Checkpoint whenever validation accuracy improves; the score is part of the file name.
  if val_acc > best_acc:
    best_model_name = f'best_model_{val_acc}.bin'
    torch.save(model.state_dict(), best_model_name)
    best_acc = val_acc