In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv
/kaggle/input/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/vocab.txt


In [2]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import transformers
from transformers import get_linear_schedule_with_warmup, AdamW, get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import get_polynomial_decay_schedule_with_warmup
from sklearn.metrics import mean_squared_error
import torch.nn.functional as F

In [3]:
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
sample_sub = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

In [4]:
config = {
    'model': '../input/bert-base-uncased',
    'max_length': 512,
    'train_batch_size': 8,
    'valid_batch_size': 8,
    'epochs': 5,
    'loss_fn': nn.SmoothL1Loss(),
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'target_classes': ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
}
print('done')

done


In [5]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config['model'])
# tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [7]:
class train_valid_dataset():
    
    def __init__(self, dataset):
        self.data = dataset
        self.classes = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
        self.maxlen = config['max_length']
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        full_text = str(self.data['full_text'][index])
        
        tokenizer_dict = tokenizer(full_text,
                                   None,
                                   add_special_tokens=True,
                                   max_length=self.maxlen,
                                   truncation=True,
                                   padding='max_length'
        )
        
        ids = tokenizer_dict['input_ids']
        token_type_ids = tokenizer_dict['token_type_ids']
        attention_mask = tokenizer_dict['attention_mask']
        label = self.data.loc[index, self.classes].to_list()
        
        return {'ids': torch.tensor(ids, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'mask': torch.tensor(attention_mask, dtype=torch.long),
                'target': torch.tensor(label, dtype=torch.float)
        }

In [8]:
class essay_model(nn.Module):
    
    def __init__(self, config, num_labels=6):
        super(essay_model, self).__init__()
        self.bert_model = transformers.AutoModel.from_pretrained(config['model'])
        self.fc1 = nn.Linear(self.bert_model.config.hidden_size, 64)
        self.fc2 = nn.Linear(64, num_labels)
    
    def forward(self, ids, token_type_ids, mask):
        _, outputs = self.bert_model(ids, token_type_ids, mask, return_dict=False)
        outputs = self.fc1(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [9]:
def training_function(dataloader, model, device, optimizer):
    
    model.train()
    loss_sum = 0.
    total = 0
    iterator = tqdm(enumerate(dataloader), total=len(dataloader))
    for index, data in iterator:
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        targets = data['target']
#         print("ids:", ids)
#         print("tt ids:",token_type_ids)
#         print("mask:", mask)
#         print("target:",targets)
        
        ids = ids.squeeze().to(device, dtype=torch.long)
        token_type_ids = token_type_ids.squeeze().to(device, dtype=torch.long)
        mask = mask.squeeze().to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        outputs = model(ids, token_type_ids, mask)
        
#         print(outputs.size())
#         print(targets.size())
        
        loss = config['loss_fn'](outputs, targets)
        loss_sum += loss.item()
        
        loss.backward()
        optimizer.step()
        total += targets.shape[0]
        
        del ids, token_type_ids, mask, targets, outputs, loss
        
    return loss_sum/total

In [10]:
def validation_function(dataloader, model, device):
    
    model.eval()
    loss_sum = 0.
    total = 0
    iterator = tqdm(enumerate(dataloader), total=len(dataloader))
    for index, data in iterator:
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        targets = data['target']
        
        ids = ids.squeeze().to(device, dtype=torch.long)
        token_type_ids = token_type_ids.squeeze().to(device, dtype=torch.long)
        mask = mask.squeeze().to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        
        with torch.no_grad():
            outputs = model(ids, token_type_ids, mask)
        
        loss = config['loss_fn'](outputs, targets)
        loss_sum += loss.item()
        total += targets.shape[0]
        
        del ids, token_type_ids, mask, targets, outputs, loss
        
    return loss_sum/total

In [11]:
device = config['device']

loss_training = []
loss_validation = []

train_data, valid_data = train_test_split(train, test_size=0.2, random_state=77, shuffle=True)
train_data, valid_data = train_data.reset_index(), valid_data.reset_index()

model = essay_model(config)
model.to(device)
train_dataset = train_valid_dataset(train_data)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=config['train_batch_size'])
valid_dataset = train_valid_dataset(valid_data)
valid_dataloader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=config['valid_batch_size'])
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0},]
optimizer = AdamW(optimizer_parameters, lr=0.00001)
    
for epoch in range(config['epochs']):
    print("#################### EPOCH: %d ####################" % (epoch+1))
    train_loss = training_function(train_dataloader, model, device, optimizer)
    valid_loss = validation_function(valid_dataloader, model, device)
    loss_training.append(train_loss)
    loss_validation.append(valid_loss)
    print("Training Loss: %f, Validation Loss: %f" % (train_loss, valid_loss))

Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#################### EPOCH: 1 ####################




  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Training Loss: 0.073811, Validation Loss: 0.017518
#################### EPOCH: 2 ####################


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Training Loss: 0.015877, Validation Loss: 0.015443
#################### EPOCH: 3 ####################


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Training Loss: 0.014057, Validation Loss: 0.015198
#################### EPOCH: 4 ####################


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Training Loss: 0.012376, Validation Loss: 0.015833
#################### EPOCH: 5 ####################


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Training Loss: 0.011315, Validation Loss: 0.019044


In [12]:
loss_df = pd.DataFrame({
    'epoch': list(range(1, config['epochs'] + 1)),
    'train_loss': loss_training,
    'validation_loss': loss_validation
})
loss_df

Unnamed: 0,epoch,train_loss,validation_loss
0,1,0.073811,0.017518
1,2,0.015877,0.015443
2,3,0.014057,0.015198
3,4,0.012376,0.015833
4,5,0.011315,0.019044


In [13]:
class test_dataset:
    
    def __init__(self, dataset):
        self.config = config
        self.tokenizer = tokenizer
        self.maxlen = config['max_length']
        self.essay = dataset['full_text'].values
    
    def __len__(self):
        return len(self.essay)
    
    def __getitem__(self, item):
        
        essay = str(self.essay[item])
        tokenizer_dict = self.tokenizer(
            essay,
            add_special_tokens=True,
            truncation=True,
            max_length=self.maxlen,
            padding='max_length'
        )
        ids = tokenizer_dict['input_ids']
        token_type_ids = tokenizer_dict['token_type_ids']
        attention_mask = tokenizer_dict['attention_mask']
        
        return {'ids': torch.tensor(ids, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'mask': torch.tensor(attention_mask, dtype=torch.long),
        }

In [14]:
def testing_function(dataloader, model, device):
    
    predictions = []
    model.eval()
    iterator = tqdm(enumerate(dataloader), total=len(dataloader))
    for index, data in iterator:
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        
        with torch.no_grad():
            outputs = model(ids, token_type_ids, mask)
        predictions.append(outputs.detach().cpu())
    
    preds = np.concatenate(predictions)
    return preds

In [15]:
test.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [16]:
testing_dataset = test_dataset(test)
test_dataloader = torch.utils.data.DataLoader(dataset=testing_dataset, batch_size=config['valid_batch_size'])
predictions = testing_function(test_dataloader, model, device)

print(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

[[3.1206613 3.065545  3.2405572 3.1494546 3.1377652 3.0697966]
 [2.8227742 2.7086482 2.9540188 2.7542455 2.6414194 2.7558405]
 [4.049753  3.9873178 4.038941  4.0979686 4.0884857 4.0921454]]


In [17]:

sample_sub[config['target_classes']] = predictions
sample_sub.to_csv('submission.csv', index=False)