In [1]:
import pathlib
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import (
    AdamW,
    BertConfig,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

In [2]:
# Directory holding the raw CSV files, relative to this notebook.
input_dir = pathlib.Path("../data/raw")

# Read the three CSV files in one pass; the unpacking order matches the tuple of names.
train, test, smpl_sub = (
    pd.read_csv(input_dir / file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Preview the first rows of the training frame to check the columns that were read.
train.head(n=5)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
class CommonLitDataset(Dataset):
    """Map-style dataset that pairs tokenized excerpts with regression targets.

    Args:
        target: Sequence of numeric targets, aligned position-by-position
            with ``excerpt`` (a list, array, or pandas Series).
        excerpt: Sequence of texts; each item is passed through ``str``
            before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True)`` that returns a mapping
            containing ``"input_ids"`` and ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, target, excerpt, tokenizer, max_len):
        self.target = target
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    @staticmethod
    def _value_at(values, position):
        """Return the element at integer ``position`` of ``values``.

        A pandas Series indexed with ``[]`` looks up by *label*, which fails
        (or silently returns the wrong row) once the index is no longer
        0..n-1, e.g. after filtering or a train/validation split. ``.iloc``
        is used whenever the container offers it; plain sequences fall back
        to ordinary indexing.
        """
        positional = getattr(values, "iloc", None)
        if positional is None:
            return values[position]
        return positional[position]

    def __getitem__(self, item):
        """Tokenize the excerpt at position ``item`` and return model inputs.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` as
            ``torch.long`` tensors and ``"target"`` as a ``torch.double``
            scalar tensor.
        """
        text = str(self._value_at(self.excerpt, item))
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # Only the keys that are actually returned are read. Not touching
        # "token_type_ids" keeps tokenizers that do not emit it usable.
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        target = self._value_at(self.target, item)

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            # NOTE(review): the target stays float64 to preserve the existing
            # contract; a float32 model output may need a cast before the loss.
            "target": torch.tensor(target, dtype=torch.double),
        }
    
    
class CommonLitModel(nn.Module):
    """BERT encoder followed by a single-output linear regression head.

    Args:
        model_path: Checkpoint name or path handed to
            ``BertModel.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        # Size the head from the loaded checkpoint rather than hard-coding
        # 768, so checkpoints with a different hidden width also work.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids=None):
        """Run the encoder and project its second output to one value per row.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Optional; forwarded to the encoder unchanged.

        Returns:
            Output of the linear head applied to the encoder's second return
            value (the pooled output per the Transformers BertModel docs).
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element is used here.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(pooled)

In [5]:
# Checkpoint identifier shared by the tokenizer and the encoder.
model_path = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_path)
# NOTE(review): `model` is rebound to a CommonLitModel in a later cell, so this
# bare encoder load looks redundant — confirm nothing in between relies on it.
model = BertModel.from_pretrained(model_path)

In [6]:
# Wrap the training frame so each item is a tokenized excerpt plus its target,
# then batch it in shuffled groups of 32.
dataset = CommonLitDataset(
    target=train["target"],
    excerpt=train["excerpt"],
    tokenizer=tokenizer,
    max_len=100,
)
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [7]:
# Create an explicit iterator so a single batch can be pulled by hand for inspection.
dataloader_iter = iter(data_loader)

In [9]:
# Pull one batch with the built-in next(): `.next()` is not part of the
# Python 3 iterator protocol and newer PyTorch DataLoader iterators no longer
# provide it.
smpl_data = next(dataloader_iter)

In [12]:
# Build the regression model and push the sample batch through it as a smoke test.
model = CommonLitModel(model_path)
z = model(
    ids=smpl_data["input_ids"],
    mask=smpl_data["attention_mask"],
)

In [13]:
# Show the raw model outputs for the sample batch.
z

tensor([[0.4939],
        [0.4185],
        [0.5105],
        [0.3803],
        [0.5780],
        [0.4830],
        [0.5224],
        [0.6072],
        [0.6168],
        [0.2091],
        [0.5405],
        [0.5386],
        [0.5783],
        [0.5864],
        [0.5090],
        [0.4866],
        [0.2648],
        [0.6567],
        [0.6368],
        [0.0676],
        [0.5711],
        [0.5017],
        [0.5236],
        [0.5658],
        [0.3929],
        [0.4598],
        [0.3017],
        [0.5262],
        [0.6805],
        [0.4418],
        [0.5334],
        [0.5256]], grad_fn=<AddmmBackward>)