In [None]:
# Mount Google Drive inside the Colab runtime; the dataset paths used later in
# this notebook live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Install Hugging Face transformers into the runtime (provides BertModel,
# BertTokenizer, AdamW and get_linear_schedule_with_warmup imported below).
# NOTE(review): the version is not pinned, so a fresh runtime may pull a release
# whose API differs from the one this notebook was written against — TODO pin.
!pip install transformers



In [None]:
# Sanity check: list the project folder on Drive to confirm the mount worked
# and the Dataset directory is present.
!ls "/content/drive/MyDrive/DeepLearning/Common Literacy"

 Common_Literacy_AVGW2V_POS.ipynb
 Common_Literacy_AVGW2V_TFIDFAVGW2V_POS.ipynb
 CommonLiteracyBertModels.ipynb
 CommonLiteracyBertModels_VV.ipynb
 Common_Literacy_BOW_TFIDF_AvgW2V100D.ipynb
 Common_Literacy_BOW_TFIDF_AvgW2V100D_SVD.ipynb
 Common_Literacy_BOW_TFIDF_AVGW2V300D.ipynb
 CommonLiteracy_CNN_Model.ipynb
 CommonLiteracy_CNN_Model_Word2Vec_TFIDF.ipynb
'Copy of CommonLiteracyBertModels_VV.ipynb'
 Dataset


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

%matplotlib inline

# Single seed shared by NumPy, PyTorch and the train/val/test splits.
RANDOM_SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed):
  _seed_fn(RANDOM_SEED)

# Use the first CUDA device when one is visible, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [None]:
# Absolute Drive paths of the competition CSV files (valid only after the
# Drive mount above has succeeded).
train ='/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/train.csv'
test = '/content/drive/MyDrive/DeepLearning/Common Literacy/Dataset/test.csv'

In [None]:
# Only the training CSV is loaded; the test CSV is incomplete, so the
# evaluation splits are carved out of the training data later on.
df_train = pd.read_csv(train)

In [None]:
# Hyper-parameters shared by the cells below.
# MAX_LEN: token length passed to the tokenizer as max_length for every excerpt.
MAX_LEN = 350
# BATCH_SIZE: number of excerpts per DataLoader batch.
BATCH_SIZE = 8
# PRE_TRAINED_MODEL_NAME: checkpoint name used for both the BERT encoder and its tokenizer.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# EPOCHS: number of passes over the training loader; also sizes the LR schedule.
EPOCHS = 16

### Data Preprocessing

In [None]:
# Load the pre-trained BERT encoder and its matching tokenizer. Both come from
# the checkpoint named by PRE_TRAINED_MODEL_NAME (a *cased* checkpoint), so the
# vocabulary used for tokenizing agrees with the encoder's embeddings.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Special Token Names and IDs

In [None]:
# Show each special token of the tokenizer together with its vocabulary id.
special_token_rows = [
  ("This is SEP token:", tokenizer.sep_token, tokenizer.sep_token_id),
  ("This is CLS token:", tokenizer.cls_token, tokenizer.cls_token_id),
  ("This is PAD token:", tokenizer.pad_token, tokenizer.pad_token_id),
  ("This is UNK token:", tokenizer.unk_token, tokenizer.unk_token_id),
]
for caption, token_text, token_id in special_token_rows:
  print(caption, token_text, token_id)

This is SEP token: [SEP] 102
This is CLS token: [CLS] 101
This is PAD token: [PAD] 0
This is UNK token: [UNK] 100


### Choosing Sequence Length

In [None]:
class Common_Literacy_Dataset(Dataset):
  """Map-style dataset that tokenizes one excerpt per item for BERT regression.

  Args:
    reviews: indexable sequence of excerpt texts.
    targets: indexable sequence of regression targets, aligned with ``reviews``.
    tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: fixed token length of every encoded excerpt.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of excerpts in the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the excerpt at position ``item``.

    Returns:
      dict with the raw text (``review_text``), the 1-D ``input_ids`` and
      ``attention_mask`` tensors, and the target as a 0-D tensor (``targets``).
    """
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      # Without truncation, an excerpt longer than max_len yields a longer
      # tensor than its neighbours and the default batch collation fails.
      truncation=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      # return_tensors='pt' adds a leading batch axis of size 1; drop it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target)
    }

In [None]:
# Keep only the regression target and the raw excerpt text.
selected_columns = ['target', 'excerpt']
df = df_train[selected_columns]
print(df.columns)

# First hold out 20% of all rows as the test split, then take 20% of the
# remainder as the validation split; both splits reuse RANDOM_SEED.
CL_train_val, CL_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
CL_train, CL_val = train_test_split(CL_train_val, test_size=0.2, random_state=RANDOM_SEED)

Index(['target', 'excerpt'], dtype='object')


In [None]:
# Display the (rows, columns) shape of the train / validation / test splits.
CL_train.shape, CL_val.shape, CL_test.shape

((1813, 2), (454, 2), (567, 2))

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame split in a ``DataLoader`` of tokenized excerpts.

  Args:
    df: DataFrame with an ``excerpt`` (text) and a ``target`` column.
    tokenizer: tokenizer forwarded to ``Common_Literacy_Dataset``.
    max_len: fixed token length of every encoded excerpt.
    batch_size: number of excerpts per batch.
    shuffle: reshuffle the rows at every epoch. Defaults to ``False`` so
      existing callers keep their sequential order; pass ``True`` for the
      training split.
    num_workers: number of loader worker processes (default 4, as before).

  Returns:
    A ``DataLoader`` yielding dicts with ``review_text``, ``input_ids``,
    ``attention_mask`` and ``targets``.
  """
  ds = Common_Literacy_Dataset(
    # to_numpy() gives positional indexing, independent of the shuffled
    # DataFrame index left behind by train_test_split.
    reviews=df['excerpt'].to_numpy(),
    targets=df['target'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Build one loader per split (train, validation, test) with identical
# tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
  create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
  for split_frame in (CL_train, CL_val, CL_test)
)

  cpuset_checked))


In [None]:
class Common_Literacy_Regressor(nn.Module):
  """BERT encoder followed by dropout and a single-output linear head.

  The encoder is the module-level ``bert_model``; it is wrapped, not copied.
  """

  def __init__(self):
    super().__init__()
    self.bert = bert_model
    self.drop = nn.Dropout(p=0.3)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(in_features=hidden_size, out_features=1)
    # Cast every floating-point parameter and buffer (including the wrapped
    # encoder's) to float64.
    self.double()

  def forward(self, input_ids, attention_mask):
    """Return one regression value per input row, shape ``(batch, 1)``."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element (the pooled output) feeds the head.
    _sequence_output, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    regularized = self.drop(pooled_output)
    return self.out(regularized)

In [None]:
# Build the regressor and move all of its parameters to the selected device.
model = Common_Literacy_Regressor().to(device)

In [None]:
# Mean-squared-error objective, averaged over the elements of each batch.
loss_fn = nn.MSELoss(reduction='mean')
loss_fn = loss_fn.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs); no warm-up steps are used.
# NOTE(review): optimizer and scheduler are stateful — re-run this cell before
# restarting training, otherwise the schedule continues from where it stopped.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over ``data_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches (dicts with ``input_ids``,
      ``attention_mask`` and ``targets`` tensors).
    loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: unused; kept so existing callers keep working.

  Returns:
    ``(train_losses, no_train_steps)``: the sum of the per-batch losses and the
    number of batches processed. Divide the former by the latter for the mean.
  """
  model = model.train()

  train_losses = 0
  no_train_steps = 0

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    # Start every step from clean gradients.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    # The model emits (batch, 1) while targets are (batch,). Passing them as-is
    # makes the loss broadcast to a (batch, batch) grid and compare every
    # prediction with every target, so flatten both to 1-D first.
    predictions = outputs.view(-1)
    targets = targets.view(-1).to(predictions.dtype)

    loss = loss_fn(predictions, targets)
    train_losses += loss.item()
    no_train_steps += 1

    loss.backward()

    # Clip the global gradient norm to 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  return train_losses, no_train_steps

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Compute the loss over ``data_loader`` without updating the model.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches (dicts with ``input_ids``,
      ``attention_mask`` and ``targets`` tensors).
    loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
    device: device the batch tensors are moved to.
    n_examples: unused; kept so existing callers keep working.

  Returns:
    ``(val_losses, no_val_steps)``: the sum of the per-batch losses and the
    number of batches processed.
  """
  model = model.eval()

  val_losses = 0
  no_val_steps = 0

  # Gradients are not needed for evaluation.
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      # The model emits (batch, 1) while targets are (batch,). Flatten both so
      # the loss compares element-wise instead of broadcasting to
      # (batch, batch).
      predictions = outputs.view(-1)
      targets = targets.view(-1).to(predictions.dtype)

      loss = loss_fn(predictions, targets)

      val_losses += loss.item()
      no_val_steps += 1

  return val_losses, no_val_steps

In [None]:
from collections import defaultdict

# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
# Lowest validation RMSE seen so far. Starting at +inf guarantees the first
# epoch is checkpointed and later epochs only replace it when they improve.
best_rmse = float('inf')

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_losses, no_train_steps = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(CL_train)
  )

  # train_epoch returns the summed batch losses; average them, then take the
  # root of this epoch's own MSE.
  MSE_train = train_losses / no_train_steps
  RMSE_train = np.sqrt(MSE_train)

  print(f'Train MSE {MSE_train} RMSE {RMSE_train}')

  val_losses, no_val_steps = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(CL_val)
  )

  MSE_val = val_losses / no_val_steps
  RMSE_val = np.sqrt(MSE_val)

  print(f'Validation MSE {MSE_val} RMSE {RMSE_val}')

  history['train_MSE'].append(MSE_train)
  history['train_RMSE'].append(RMSE_train)
  history['val_MSE'].append(MSE_val)
  history['val_RMSE'].append(RMSE_val)

  # RMSE is an error: lower is better, so checkpoint only on improvement.
  if RMSE_val < best_rmse:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_rmse = RMSE_val

Epoch 1/16
----------


  cpuset_checked))
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Train MSE 0.9569834749103096 RSME 0.979500395394024


  return F.mse_loss(input, target, reduction=self.reduction)


Validation MSE 1.1254132575391411 RSME 1.0608549653647954
Epoch 2/16
----------
Train MSE 0.9577925230467235 RSME 0.979500395394024
Validation MSE 1.1254132575391411 RSME 1.0608549653647954
Epoch 3/16
----------
Train MSE 0.9573438319475264 RSME 0.979500395394024
Validation MSE 1.1254132575391411 RSME 1.0608549653647954
Epoch 4/16
----------
Train MSE 0.9572165232665533 RSME 0.979500395394024
Validation MSE 1.1254132575391411 RSME 1.0608549653647954
Epoch 5/16
----------


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f894c2aedd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1301, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 45, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 921, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.7/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 


KeyboardInterrupt: ignored

In [None]:
# Plot the per-epoch RMSE curves recorded in `history` by the training loop.
plt.plot(history['train_RMSE'], label='train RMSE')
plt.plot(history['val_RMSE'], label='validation RMSE')

plt.title('Training history')
plt.ylabel('RMSE')
plt.xlabel('Epoch')
plt.legend()
# Anchor the axis at zero but let the upper limit autoscale: RMSE is not
# bounded by 1, and a fixed [0, 1] window hides any curve above it.
plt.ylim(bottom=0);