# CommonLit Readability Prize

***
## My Workflow

#### 1. Import & Install library
#### 2. Check out my data
#### 3. Preprocessing data
#### 4. Feature Engineering
#### 5. Define Model
#### 6. Evaluate
#### 7. Submission

# 1. Import & Install library

In [1]:
import numpy as np
import pandas as pd
import math
import time
import gc

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

# 2. Check out my data

In [2]:
# Load the sample submission (later filled with predictions and written out as
# submission.csv) and the test excerpts to score.
submission_df = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
# Last expression of the cell: displays the first rows of the test set.
test_df.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read later by the weight-loading cell and by evaluate().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device :", device)

Device : cuda


# 3. Preprocessing data

In [4]:
def cleaning_data(df):
    """Add a ``cleaning`` column with possessive suffixes stripped from ``excerpt``.

    Each excerpt has the following substrings removed, in this order: an
    ASCII apostrophe followed by ``s``, a typographic apostrophe (’) followed
    by ``s``, the ASCII form once more, and finally a literal backslash
    followed by the typographic form.

    Args:
        df: DataFrame with a string ``excerpt`` column. It is modified in
            place: the ``cleaning`` column is added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    # The last pattern used to be spelled with a single backslash in front of
    # the typographic apostrophe, which is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). "\\’s" is the same runtime string,
    # written explicitly.
    suffixes = ("'s", '’s', "'s", "\\’s")

    def strip_suffixes(text):
        # Apply the removals sequentially so the result matches running one
        # full-column pass per pattern.
        for suffix in suffixes:
            text = text.replace(suffix, '')
        return text

    df['cleaning'] = df.excerpt.apply(strip_suffixes)

    return df

In [5]:
# Add the `cleaning` column to the test set (cleaning_data mutates its
# argument and returns the same DataFrame).
test_df = cleaning_data(test_df)
# NOTE: a bare expression that is not the last line of a cell is not displayed.
test_df.shape
test_df.head()

Unnamed: 0,id,url_legal,license,excerpt,cleaning
0,c0f722661,,,My hope lay in Jack's promise that he would ke...,My hope lay in Jack promise that he would keep...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...,Dotty continued to go to Mrs. Gray every night...
2,0df072751,,,It was a bright and cheerful scene that greete...,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...,Debugging is the process of finding and resolv...


# 4. Feature Engineering

In [6]:
# Samples per batch; read by every DataLoader factory defined below.
BATCH_SIZE = 16
# Token budget per excerpt; RobertaDataset pads/truncates to MAX_LEN + 2.
MAX_LEN = 248
# Paths passed to AutoConfig/AutoModel.from_pretrained and
# AutoTokenizer.from_pretrained respectively.
ROBERTA_PATH = "../input/roberta-transformers-pytorch/roberta-base"
TOKENIZER_PATH = "../input/roberta-transformers-pytorch/roberta-base"
# Weights file restored into MainModel via load_state_dict.
model_path = '../input/commonlit-roberta-model-weight/roberta5.pth'

### 4-1) Define Dataset

In [7]:
# Module-level tokenizer; RobertaDataset reads this name when it encodes text.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [8]:
class RobertaDataset(Dataset):
    """Tokenized view over the ``cleaning`` column of a DataFrame.

    All texts are encoded once, at construction time, with the module-level
    ``tokenizer``; every sequence is padded or truncated to ``MAX_LEN + 2``
    tokens.

    Args:
        df: DataFrame with a ``cleaning`` column and, when ``Is_target`` is
            true, a ``target`` column.
        Is_target: whether each item should also carry its ``target`` value.
    """

    def __init__(self, df, Is_target=False):
        super().__init__()

        self.df = df
        self.Is_target = Is_target
        # Force every entry to ``str`` before handing the list to the tokenizer.
        self.text = df['cleaning'].apply(str).tolist()

        if self.Is_target:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # The two extra positions are for the <s> and </s> special tokens.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN + 2,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict of tensors for the sample at ``index``.

        Keys are ``'input_ids'`` and ``'attention_mask'``, plus ``'target'``
        when the dataset was built with ``Is_target=True``.
        """
        encoded = self.encoded
        item = {
            'input_ids': torch.tensor(encoded['input_ids'][index]),
            'attention_mask': torch.tensor(encoded['attention_mask'][index]),
        }

        if self.Is_target:
            item['target'] = self.target[index]

        return item

In [9]:
# Encode the test excerpts; no targets are attached (Is_target=False).
test_dataset = RobertaDataset(test_df, Is_target=False)

In [10]:
# Sanity-check the first encoded sample: raw token ids, the text decoded back
# from those ids, and the attention mask.
print("[[ input_ids ]]")
print(test_dataset[0]['input_ids'],'\n\n')

print("[[ Decode ]]")
print(tokenizer.decode(test_dataset[0]['input_ids']), '\n\n')

print("[[ Attention Mask ]]")
print(test_dataset[0]['attention_mask'],'\n\n')

[[ input_ids ]]
tensor([    0,  2387,  1034,  4477,    11,  2722,  4198,    14,    37,    74,
          489,    10,  4520,  1109,  6574,    11,     5,  2853,   527,     7,
         4704,   162,    15,   127,   768,     4,   374,    10,   699,   363,
           42,  1109,    21,  7097,    31,     5,  3375,     6,    53,  7421,
           50,    97,    38,  1447,     7,   185,    88,  1316,     5,   194,
            9,     5,  1650,     4,    20,   935,    21,   455,     9,  4803,
          417,  4048, 37517,     6,    61,    74, 19930,     5,   471,  6991,
            9,    10, 29964,  9719, 20731,    10,  6317,  1314, 13258,     4,
        18680,    14,    42,   505,   754,   393,  2756,     7,   162,   454,
           38,    21,  1950,    10,   887,     9,    10,  7245,    31,     5,
         3375,     4,  1892,     6,    71,   546,    11, 25876,    13,     5,
        30943,  1109,     6,     5,  4854,     9,   127,  1068,  2322,   162,
            6,     8,    38, 12856,     4, 50118

### 4-2) Define DataLoader

In [11]:
def get_train_val_loaders(train_dataset, val_dataset):
    """Build the training and validation DataLoaders.

    Both loaders use the module-level ``BATCH_SIZE`` and load data in the
    main process (``num_workers=0``).

    Args:
        train_dataset: dataset iterated during training.
        val_dataset: dataset iterated during validation.

    Returns:
        dict with keys ``"train"`` and ``"val"`` mapping to the two loaders.
    """
    # Training: reshuffle every epoch and drop the final short batch so each
    # step sees exactly BATCH_SIZE samples.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        drop_last=True)

    # Validation: keep every sample, in a fixed order. The previous settings
    # (shuffle=True, drop_last=True, copied from the training loader) silently
    # discarded up to BATCH_SIZE - 1 samples, and a different subset each
    # epoch, so validation metrics were computed on an incomplete, varying set.
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        drop_last=False)

    dataloaders_dict = {"train": train_loader, "val": val_loader}
    return dataloaders_dict


def get_test_loader(dataset):
    """Return an inference DataLoader over ``dataset``.

    Batches hold ``BATCH_SIZE`` samples and are produced in dataset order
    (no shuffling), loaded in the main process.
    """
    loader_options = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    return torch.utils.data.DataLoader(dataset, **loader_options)

In [12]:
# Unshuffled loader over the encoded test set, consumed by evaluate().
test_loader = get_test_loader(test_dataset)

# 5. Define Model

In [13]:
class MainModel(nn.Module):
    """Pretrained encoder followed by attention pooling and a linear head.

    The encoder is loaded from ``ROBERTA_PATH`` with hidden-state output
    enabled, hidden dropout set to 0.0 and ``layer_norm_eps`` set to 1e-7.
    The last layer's hidden states are weighted by a small scoring network
    (softmax over dim 1), summed over dim 1, and projected to one value.
    """

    def __init__(self):
        super().__init__()

        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update(overrides)

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # 768 is RoBERTa's hidden size. Submodule names and their positions
        # inside each Sequential determine the state_dict keys, so they must
        # stay as they are for saved weights to load.
        scoring_layers = [
            nn.Dropout(0.2),
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*scoring_layers)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        """Return one score per input sequence as a flat tensor.

        Args:
            input_ids: token ids, forwarded to the encoder unchanged.
            attention_mask: mask forwarded to the encoder unchanged.
                NOTE(review): the mask is not applied to the pooling weights.
        """
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)
        # Hidden states of the final encoder layer
        # (presumably (batch, seq_len, 768) -- TODO confirm).
        token_states = encoder_out.hidden_states[-1]

        # One weight per position, normalised along dim 1.
        token_weights = self.attention(token_states)

        # Weighted sum over dim 1 yields a single context vector per sample.
        pooled = (token_weights * token_states).sum(dim=1)

        # Reduce the context vector to the prediction score and flatten.
        return self.regressor(pooled).view(-1)

In [14]:
# Run a garbage-collection pass before the model is instantiated.
gc.collect()

# Build the network, then restore the saved weights, mapping stored tensors
# onto `device`. The final expression's result is what the cell displays.
model = MainModel() 
model.load_state_dict(torch.load(model_path, map_location=device))    

<All keys matched successfully>

# 6. Evaluate

In [15]:
def evaluate(model, loader):
    """Run inference over ``loader`` and collect one prediction per sample.

    The model is moved to the module-level ``device`` and put in eval mode;
    the forward passes run with gradients disabled.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per sample in the batch.
        loader: iterable of dicts holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        list of Python floats, in the order the loader yields the samples.
    """
    predictions = []

    model.to(device)
    model.eval()

    for data in loader:
        ids = data['input_ids'].to(device, dtype=torch.int64)
        masks = data['attention_mask'].to(device, dtype=torch.int64)

        with torch.no_grad():
            preds = model(ids, masks)

        # Convert the whole batch in one go rather than calling `.item()` per
        # element, which forces a separate device-to-host sync for every
        # prediction. reshape(-1) keeps (N,) and (N, 1) outputs equivalent.
        predictions.extend(preds.reshape(-1).tolist())

    return predictions

# 7. Submission

In [16]:
# Score every test excerpt with the restored model.
predictions = evaluate(model, test_loader)

# NOTE(review): the assignment is positional -- it assumes the rows of
# sample_submission.csv are in the same order as test.csv; confirm.
submission_df['target'] = predictions
submission_df.head()

Unnamed: 0,id,target
0,c0f722661,-0.628001
1,f0953f0a5,-0.680984
2,0df072751,-0.792474
3,04caf4e0c,-2.373531
4,0e63f8bea,-2.570577


In [17]:
# Drop the reference to the model and run a garbage-collection pass.
del model
gc.collect()

# Write the predictions without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)