In [1]:
pip install transformers



In [0]:
import pandas as pd
import numpy as np
import torch
import transformers as ppb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

In [3]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1bFYRBWjAqXFxrRnA6XGqRY2ffh6gsAgG'
downloaded = drive.CreateFile({'id': file_id})
# Save the Drive file's content locally under the name 'IMDB Dataset.csv'.
downloaded.GetContentFile('IMDB Dataset.csv') 
# Load the local CSV into a DataFrame and preview its first rows.
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,length of text,sentiment
0,Match 1: Tag Team Table Match Bubba Ray and Sp...,13704,positive
1,There's a sign on The Lost Highway that says:<...,12988,positive
2,"(Some spoilers included:)<br /><br />Although,...",12930,positive
3,"Back in the mid/late 80s, an OAV anime by titl...",12129,positive
4,**Attention Spoilers**<br /><br />First of all...,10363,positive


In [4]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(50000, 3)

In [5]:
# Count how many reviews carry each sentiment label to check the class balance.
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

We are going to transform the sentiment into a binary dummy variable (1 = positive, 0 = negative) so that the analysis is easier.

In [0]:
# Encode the label as 1 (positive) / 0 (negative) with a single vectorised
# lookup instead of calling np.where once per row.
# The mapping also accepts already-encoded values, so re-running this cell is
# harmless (the previous version turned every label into 0 on a second run).
# NOTE(review): labels outside this mapping now raise on the integer cast
# instead of being silently encoded as 0.
sentiment_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
data['sentiment'] = data['sentiment'].map(sentiment_codes).astype('int64')

In [7]:
# Re-count the labels after encoding to confirm the 0/1 classes are still balanced.
data['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

The dataset is balanced, so we don't need to resample it.
We split it into training, validation and test sets (90% / 5% / 5%) so that we can evaluate our results on held-out data.

In [0]:
# Hold out 10% of the reviews, then split that hold-out evenly into validation
# and test sets (90% / 5% / 5% overall).
# A fixed seed keeps the split, and therefore every metric reported below,
# reproducible across kernel restarts.
RANDOM_SEED = 42

df_train, df_holdout = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

We need to preprocess the data by tokenizing it, so that it can be understood by our model. To do that, we are going to use the tokenizer of the pre-trained BERT Transformer from the `transformers` library.

In [0]:
class GPReviewDataset(Dataset):
  """Map-style dataset that yields one tokenized review and its label per index.

  Args:
    reviews: Indexable collection of review texts.
    targets: Indexable collection of labels, aligned with ``reviews``.
    tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len: Number of tokens every encoded review is padded or truncated to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews in the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the review at position ``item``.

    Returns:
      A dict with the raw text (``review_text``), the 1-D ``input_ids`` and
      ``attention_mask`` tensors, and the label as a ``torch.long`` tensor
      (``targets``).
    """
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      # Explicit padding/truncation replaces the deprecated
      # `pad_to_max_length=True`, which relied on implicit truncation.
      # Every sample is exactly `max_len` tokens, so batches can be stacked.
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      # The tokenizer returns (1, max_len) tensors; flatten to (max_len,).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

After that, we need to create a data loader that batches the dataset produced by our pre-processing.

In [0]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a reviews DataFrame in a batched ``DataLoader``.

  Args:
    df: DataFrame with a ``review`` (text) and a ``sentiment`` (label) column.
    tokenizer: Tokenizer handed to ``GPReviewDataset``.
    max_len: Token length every review is padded/truncated to.
    batch_size: Number of samples per batch.
    shuffle: Reshuffle the samples at every epoch. Defaults to ``False``,
      which keeps the original behaviour; pass ``True`` for training data.
    num_workers: Number of loader worker processes (default 4, as before).

  Returns:
    A ``DataLoader`` over a ``GPReviewDataset`` built from ``df``.
  """
  ds = GPReviewDataset(
    # Plain arrays are indexed by position, so the shuffled DataFrame index
    # left behind by the train/test split does not matter.
    reviews=df['review'].to_numpy(),
    targets=df['sentiment'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

The max length is the length of the tokenized sentence. Normally, BERT accepts a maximum length of 512 tokens. However, because of memory constraints, we only use 200.

In [0]:
batch_size = 32
max_len = 200

# `tokenizer` was never defined in this notebook, so a fresh kernel failed
# here with a NameError. It is loaded from the same 'bert-base-uncased'
# checkpoint as the BERT model used below, so the token ids match its vocabulary.
tokenizer = ppb.BertTokenizer.from_pretrained('bert-base-uncased')

train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)
test_data_loader = create_data_loader(df_test, tokenizer, max_len, batch_size)

BERT is already pre-trained, so we can use transfer learning to get some results. In this case, we add a dropout layer and a fully-connected layer with two neurons for our output.

In [0]:
class SentimentClassifier(nn.Module):
  """Pre-trained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: Number of output classes (size of the last logits dimension).
  """

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = ppb.BertModel.from_pretrained('bert-base-uncased')
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class logits for a batch.

    Args:
      input_ids: Token ids, batch size x sequence length.
      attention_mask: Attention mask with the same shape as ``input_ids``.

    Returns:
      Logits with one row per sample and ``n_classes`` columns.
    """
    encoder_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Index rather than tuple-unpack: recent `transformers` releases return a
    # dict-like ModelOutput whose iteration yields key names, so
    # `_, pooled = self.bert(...)` would bind `pooled` to a string.
    # Position 1 is the pooled output for both the tuple and ModelOutput forms.
    pooled_output = encoder_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [0]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [0]:
# Two output classes (negative / positive); parameters live on `device`.
model = SentimentClassifier(n_classes=2).to(device)

We are going to try our model on a sample batch.

In [0]:
# Pull the first batch from the training loader to use as a quick sanity check.
test = next(iter(train_data_loader))

In [27]:
# Move the sampled batch's model inputs to the compute device.
input_ids = test['input_ids'].to(device)
attention_mask = test['attention_mask'].to(device)

# Both tensors should share the same 2-D shape.
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([32, 200])
torch.Size([32, 200])


In [28]:
# Sanity check only: disable dropout and gradient tracking so the displayed
# probabilities are deterministic and no autograd graph is kept in memory.
# Later cells set the model mode explicitly before they run.
model.eval()
with torch.no_grad():
  sample_probs = nn.functional.softmax(model(input_ids, attention_mask), dim=1)
sample_probs

tensor([[7.7979e-03, 9.9220e-01],
        [3.1158e-02, 9.6884e-01],
        [9.9909e-01, 9.1435e-04],
        [8.4817e-03, 9.9152e-01],
        [9.9760e-01, 2.4016e-03],
        [9.9746e-01, 2.5438e-03],
        [6.9988e-01, 3.0012e-01],
        [9.9321e-01, 6.7899e-03],
        [9.9776e-01, 2.2382e-03],
        [9.9863e-01, 1.3716e-03],
        [2.8701e-03, 9.9713e-01],
        [8.4338e-01, 1.5662e-01],
        [9.8252e-01, 1.7477e-02],
        [9.9950e-01, 5.0212e-04],
        [9.8962e-01, 1.0382e-02],
        [1.6586e-03, 9.9834e-01],
        [9.9681e-01, 3.1917e-03],
        [9.9908e-01, 9.2120e-04],
        [9.7856e-01, 2.1435e-02],
        [9.9893e-01, 1.0741e-03],
        [7.4931e-03, 9.9251e-01],
        [9.9921e-01, 7.9241e-04],
        [1.6838e-03, 9.9832e-01],
        [1.1401e-02, 9.8860e-01],
        [3.3195e-03, 9.9668e-01],
        [1.4358e-02, 9.8564e-01],
        [9.4923e-01, 5.0769e-02],
        [9.9863e-01, 1.3750e-03],
        [8.3647e-03, 9.9164e-01],
        [5.747

In [58]:
# Labels of the sampled batch, to compare against the probabilities above.
test['targets']

tensor([1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        1, 1, 0, 0, 1, 0, 0, 1])

We can see that, on a sample batch, the results seem coherent. We will now fine-tune the pre-trained model to get the best results possible.

In [0]:
EPOCHS = 2

# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer and is
# not available in recent `transformers` releases, so use `torch.optim.AdamW`.
# `eps` and `weight_decay` are set to the old transformers defaults.
# NOTE(review): PyTorch's AdamW always applies bias correction, unlike the
# previous `correct_bias=False` setting -- confirm the change is acceptable.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_data_loader) * EPOCHS

# Linear decay of the learning rate over `total_steps`, with no warm-up phase.
scheduler = ppb.get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [0]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
    model: Module called with ``input_ids`` and ``attention_mask`` keywords.
    data_loader: Iterable of batches (dicts with ``input_ids``,
      ``attention_mask`` and ``targets`` tensors).
    loss_fn: Callable taking ``(logits, targets)`` and returning a scalar loss.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: a float64 tensor and the mean of the batch losses.
  """
  model.train()

  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    targets = batch["targets"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device)
    )

    loss = loss_fn(logits, targets)
    predicted = logits.max(dim=1).indices

    n_correct += (predicted == targets).sum()
    batch_losses.append(loss.item())

    # Backpropagate, clip the gradient norm to 1.0, then update the weights
    # and the learning rate before clearing gradients for the next batch.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [0]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating its weights.

  Args:
    model: Module called with ``input_ids`` and ``attention_mask`` keywords.
    data_loader: Iterable of batches (dicts with ``input_ids``,
      ``attention_mask`` and ``targets`` tensors).
    loss_fn: Callable taking ``(logits, targets)`` and returning a scalar loss.
    device: Device the batch tensors are moved to.
    n_examples: Denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: a float64 tensor and the mean of the batch losses.
  """
  model.eval()

  batch_losses = []
  n_correct = 0

  # Gradients are not needed for evaluation.
  with torch.no_grad():
    for batch in data_loader:
      targets = batch["targets"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device)
      )

      batch_losses.append(loss_fn(logits, targets).item())
      n_correct += (logits.max(dim=1).indices == targets).sum()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [23]:
%%time


# Fine-tune for EPOCHS passes over the training data, reporting loss and
# accuracy on the training and validation sets after every epoch.
for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training loader; accuracy is computed
  # against the full size of the training split.
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the model on the validation loader; eval_model runs under
  # torch.no_grad(), so no weights are updated here.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()



Epoch 1/2
----------
Train loss 0.26523301232534685 accuracy 0.8909555555555556
Val   loss 0.2094722317083727 accuracy 0.9184

Epoch 2/2
----------
Train loss 0.13220838201381185 accuracy 0.9544444444444445
Val   loss 0.2571580213600699 accuracy 0.9128000000000001

CPU times: user 55min 43s, sys: 36min 36s, total: 1h 32min 20s
Wall time: 1h 33min 8s


The results are actually pretty good on the training and validation sets. Let's see the performance on the test set.

In [24]:
# Final check on the held-out test set; the loss returned by eval_model is
# discarded and only the accuracy is kept.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Convert the accuracy tensor to a plain Python float for display.
test_acc.item()

0.906

The accuracy on the test set is almost the same as on the validation set, so we don't appear to have a serious overfitting problem.

If we wanted to improve our model, we should first set max_length to 512. However, most of the reviews are longer than that. Therefore, we could split the reviews into parts and add a layer to our model to recombine the results.