<a href="https://colab.research.google.com/github/huhji/NLP-Basic/blob/main/Transformer_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer & RoBERTa Basics

## Import Libraries

In [None]:
!pip install transformers
!pip install wandb

In [None]:
import os
import random
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import wandb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import get_cosine_schedule_with_warmup
from transformers import (AutoModel,AutoModelForMaskedLM, AutoTokenizer, LineByLineTextDataset,
                         DataCollatorForLanguageModeling,Trainer, TrainingArguments,)

## Load Data

In [None]:
# Mount Google Drive at /content/gdrive/ so the files stored there can be read from Colab.
# NOTE(review): later cells use the relative path "./gdrive/..."; that presumably relies on
# the working directory being /content — confirm.
from google.colab import drive 
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Folder (relative to the current working directory) that holds the competition CSV files.
data_path = "./gdrive/MyDrive/Kaggle_CommonLit"

# Read the three CSV files in one pass: training set, test set and submission template.
train_data, test_data, sample_sub = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Dump every train and test excerpt into one plain-text file for masked-LM fine-tuning.
# Excerpts are separated by a newline: the file is consumed line by line further down
# (LineByLineTextDataset), so joining with '.' would glue the last line of one excerpt
# to the first line of the next and yield samples that span two unrelated documents.
excerpts = train_data.excerpt.tolist() + test_data.excerpt.tolist()
text = '\n'.join(excerpts)

# Explicit encoding so the file content does not depend on the platform default.
with open(os.path.join(data_path, 'text.txt'), 'w', encoding='utf-8') as f:
    f.write(text)

## Load pretrained roBERTa

In [None]:
# Checkpoint identifier shared by the tokenizer and the masked-language-model weights.
model_name = 'roberta-base'

# Tokenizer first, then the model with its masked-LM head; both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
# Every non-empty line of text.txt becomes one training sample, truncated to `block_size` tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=os.path.join(data_path, "text.txt"),
    block_size=64,
)

# Dynamic masking: 15% of the tokens are selected for the masked-LM objective in each batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# No evaluation dataset is handed to the Trainer, so the evaluation-only options
# (eval_steps, metric_for_best_model, greater_is_better, load_best_model_at_end) are omitted:
# they cannot take effect without evaluation, and recent transformers releases refuse
# load_best_model_at_end=True when the evaluation and save strategies do not match.
training_args = TrainingArguments(
    output_dir=os.path.join(data_path, "clrp_roberta_base_chk"),  # Trainer output folder under data_path
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    prediction_loss_only=True,
    report_to="none",  # disable experiment-tracker integrations for this pre-training step
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Run the masked-language-model fine-tuning with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss


TrainOutput(global_step=201, training_loss=1.8009212361046332, metrics={'train_runtime': 433.9027, 'train_samples_per_second': 0.463, 'total_flos': 614253569481216.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 4096, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 102400, 'train_mem_gpu_alloc_delta': 1041950208, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6346656768})

In [None]:
# Save the fine-tuned masked-LM weights and config.
# NOTE(review): the second argument starts with "/", so os.path.join discards `data_path`
# and the model is written to "/clrp_roberta_base_chk" on the local filesystem, not inside
# the Drive folder. The cells that list and reload this checkpoint build the path the same
# way, so all of them must be changed together if the model should live under `data_path`.
trainer.save_model(os.path.join(data_path, "/clrp_roberta_base_chk"))

In [None]:
# Show the files of the saved checkpoint.
# NOTE(review): because of the leading "/", this path resolves to "/clrp_roberta_base_chk"
# (os.path.join drops `data_path` when a later component is absolute).
os.listdir(os.path.join(data_path, '/clrp_roberta_base_chk'))

['config.json', 'pytorch_model.bin', 'training_args.bin']

In [None]:
# List every entry directly under the data folder.
import glob
glob.glob(data_path + "/*")

['./gdrive/MyDrive/Kaggle_CommonLit/sample_submission.csv',
 './gdrive/MyDrive/Kaggle_CommonLit/train.csv',
 './gdrive/MyDrive/Kaggle_CommonLit/test.csv',
 './gdrive/MyDrive/Kaggle_CommonLit/cleaned_train.csv',
 './gdrive/MyDrive/Kaggle_CommonLit/cleaned_test.csv',
 './gdrive/MyDrive/Kaggle_CommonLit/text.txt',
 './gdrive/MyDrive/Kaggle_CommonLit/clrp_roberta_base_chk',
 './gdrive/MyDrive/Kaggle_CommonLit/Transformer_basic.ipynb']

In [None]:
# Number of bins from Sturges' formula: floor(1 + log2(n_rows)).
num_bins = int(np.floor(1 + np.log2(len(train_data))))
# Discretise the continuous target into `num_bins` equal-width intervals; labels=False
# stores the integer bin index. The bin index is used later as the stratification label.
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# Plain numpy views of the bin indices and of the raw regression target.
bins = train_data.bins.to_numpy()
target = train_data.target.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between two equally sized sequences.

    Implemented with numpy only, so it does not depend on `mean_squared_error`,
    which this notebook never imports.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        The RMSE as a numpy float.
    """
    # Flatten both inputs so that (n,) and (n, 1) arrays are compared element by element
    # instead of being broadcast against each other.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
train_data

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,bins
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,7
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,7
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,6
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,5
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,8
...,...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900,11
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648,8
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866,8
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128,7


## K-fold

In [None]:
from sklearn.model_selection import StratifiedKFold
# Hyper-parameters of the fold-wise fine-tuning; passed to wandb so the run records them.
hyperparameters = {
    'lr': 2e-5,
    'wd': 0.01,
    'batch_size': 16,
    'valid_step': 10,
    'max_len': 256,
    'epochs': 3,
    'nfolds': 5,
    'seed': 42,
}
wandb.init(project="roBERTa_01", config=hyperparameters)
config = wandb.config

# One output directory per fold (model0, model1, ...) for that fold's best checkpoint.
for fold_id in range(config['nfolds']):
    os.makedirs(f'model{fold_id}', exist_ok=True)

def seed_everything(seed=42):
    """Seed Python, numpy and torch (CPU and CUDA) and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # The interpreter reads PYTHONHASHSEED at start-up, so this only affects Python
    # processes launched after this call; it is exported for completeness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; harmless when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can differ between
    # runs, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

# -1 marks "not assigned"; each row receives the index of the fold in which it is validation data.
train_data['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
fold_column = train_data.columns.get_loc('Fold')
# Stratify on the binned target so each fold sees a similar target distribution.
for k, (_, valid_idx) in enumerate(kfold.split(X=train_data,y=bins)):
    # split() yields positional row indices, so write through .iloc; label-based .loc
    # gives the same result only while the frame keeps its default 0..n-1 index.
    train_data.iloc[valid_idx, fold_column] = k

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,bins,Fold
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,7,0
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,7,2
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,6,3
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,5,2
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,8,1


In [None]:
class CLRPDataset(Dataset):
    """Training dataset yielding ``(tokenizer output, target)`` pairs for each excerpt.

    Args:
        df: Frame with an ``excerpt`` column (text) and a ``target`` column (number).
        tokenizer: Callable applied to one excerpt with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Keep the two columns as numpy arrays so items are fetched by position.
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        tokenizer_options = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encode = self.tokenizer(self.excerpt[idx], **tokenizer_options)
        # The regression target is returned as a float32 scalar tensor.
        target = torch.tensor(self.targets[idx],dtype=torch.float)
        return encode, target

In [None]:
class AttentionHead(nn.Module):
    """Additive-attention pooling over dimension 1 of the input features.

    Each position is scored with ``V(tanh(W(x)))``; the scores are soft-maxed along
    dimension 1 and used to form a weighted sum of the input features.

    Args:
        in_features: Size of the last dimension of the input features.
        hidden_dim: Size of the intermediate projection used for scoring.
        num_targets: Unused; kept so existing calls keep working.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The pooled vector is a weighted sum of the *input* features, so its width is
        # `in_features`, not `hidden_dim` (the two only coincide when they are equal).
        self.out_features = in_features

    def forward(self, features):
        """Pool ``features`` along dimension 1.

        Args:
            features: Tensor whose dimension 1 is the axis to pool over and whose last
                dimension has size ``in_features`` (e.g. batch x sequence x hidden).

        Returns:
            Tensor with dimension 1 summed out, e.g. batch x ``in_features``.
        """
        att = torch.tanh(self.W(features))

        # One scalar score per position.
        score = self.V(att)

        # Normalise across positions. No padding mask is applied here, so padded
        # positions also receive attention weight.
        attention_weights = torch.softmax(score, dim=1)

        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

In [None]:
class Model(nn.Module):
    """Pretrained encoder + attention pooling + linear layer producing one value per input.

    Args:
        path: Checkpoint name or directory passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self,path):
        super(Model,self).__init__()
        self.roberta = AutoModel.from_pretrained(path)
        # Take the layer width from the loaded encoder instead of hard-coding 768, so
        # checkpoints with a different hidden size work as well.
        hidden_size = self.roberta.config.hidden_size
        self.head = AttentionHead(hidden_size,hidden_size,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size,1)

    def forward(self,**xb):
        """Run the encoder on the keyword inputs and return the regression output."""
        # Element 0 of the encoder output is the per-token hidden state sequence.
        x = self.roberta(**xb)[0]
        x = self.head(x)
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [None]:
def run(fold,verbose=True,scheduler_epochs=10):
    """Fine-tune the regression model with ``fold`` held out and keep the best checkpoint.

    Rows of the module-level ``train_data`` with ``Fold != fold`` are used for training
    and rows with ``Fold == fold`` for validation. Validation runs every
    ``config['valid_step']`` batches and on the last batch of each epoch; whenever the
    mean validation loss is not worse than the best so far, the weights are written to
    ``./model{fold}/model{fold}.bin`` and the tokenizer to ``./model{fold}``.

    Args:
        fold: Index of the validation fold.
        verbose: Print a message whenever the validation loss improves.
        scheduler_epochs: Number of epochs the cosine schedule is stretched over
            (``num_training_steps = scheduler_epochs * len(train_dl)``). The default of
            10 is the value that used to be hard-coded. NOTE(review): it is larger than
            ``config['epochs']`` (3 in this notebook), so the learning rate never decays
            all the way; pass ``config['epochs']`` for a full decay.

    Returns:
        The lowest mean validation loss reached for this fold.
    """

    def loss_fn(outputs,targets):
        """Root-mean-squared error over the flattened batch."""
        outputs = outputs.view(-1)
        targets = targets.view(-1)
        return torch.sqrt(nn.MSELoss()(outputs,targets))

    def to_device(batch,device):
        """Flatten every tensor of the batch to (batch_size, -1) and move it to ``device``."""
        return {key:val.reshape(val.shape[0],-1).to(device) for key,val in batch.items()}

    def evaluate(model,valid_loader,loss_fn,device):
        """Return the mean of the per-batch losses over ``valid_loader``."""
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for inputs,targets in valid_loader:
                outputs = model(**to_device(inputs,device))
                valid_loss += loss_fn(outputs,targets.to(device)).item()
        return valid_loss / len(valid_loader)

    def train_and_evaluate_loop(train_loader,valid_loader,model,tokenizer,loss_fn,device,optimizer,
                                epoch,fold,best_loss,valid_step=10,lr_scheduler=None):
        """Train for one epoch with periodic validation; return the updated best loss."""
        train_loss = 0
        last_step = len(train_loader) - 1
        for step,(inputs,targets) in enumerate(train_loader):
            # evaluate() switches to eval mode, so training mode is restored on every step.
            model.train()
            optimizer.zero_grad()
            outputs = model(**to_device(inputs,device))
            loss = loss_fn(outputs,targets.to(device))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

            # Validate every `valid_step` batches and once more at the end of the epoch.
            if (step % valid_step == 0) or (step == last_step):
                valid_loss = evaluate(model,valid_loader,loss_fn,device)
                if valid_loss <= best_loss:
                    if verbose:
                        print(f"epoch:{epoch} | Train Loss:{train_loss/(step+1)} | Validation loss:{valid_loss}")
                        print(f"Validation loss Decreased from {best_loss} to {valid_loss}")

                    best_loss = valid_loss
                    torch.save(model.state_dict(),f'./model{fold}/model{fold}.bin')
                    tokenizer.save_pretrained(f'./model{fold}')

        return best_loss

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    # The leading "/" makes os.path.join drop `data_path`, so this resolves to
    # "/clrp_roberta_base_chk" — the location the fine-tuned masked-LM was saved to.
    MODEL_PATH = os.path.join(data_path, "/clrp_roberta_base_chk")
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    model = Model(MODEL_PATH)
    model.to(device)

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    optimizer = optim.AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])

    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,
                                                   num_training_steps=scheduler_epochs * len(train_dl))

    print(f"Fold: {fold}")
    best_loss = 9999  # sentinel larger than any expected loss
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_loss = train_and_evaluate_loop(train_dl,valid_dl,model,tokenizer,loss_fn,
                                            device,optimizer,epoch,fold,best_loss,
                                            valid_step=config['valid_step'],lr_scheduler=lr_scheduler)

    return best_loss

In [None]:
# Train one model per cross-validation fold, then close the wandb run.
for f in range(config['nfolds']):
    run(f)
wandb.finish()

Some weights of the model checkpoint at /clrp_roberta_base_chk were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /clrp_roberta_base_chk and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Fold: 0
Epoch Started:0
epoch:0 | Train Loss:1.4068840742111206 | Validation loss:1.2161705642938614
Validation loss Decreased from 9999 to 1.2161705642938614
epoch:0 | Train Loss:1.0451854250647805 | Validation loss:0.9129453400770823
Validation loss Decreased from 1.2161705642938614 to 0.9129453400770823
epoch:0 | Train Loss:0.9762883299872989 | Validation loss:0.7206566606958708
Validation loss Decreased from 0.9129453400770823 to 0.7206566606958708
epoch:0 | Train Loss:0.9129054296401239 | Validation loss:0.7085926350620058
Validation loss Decreased from 0.7206566606958708 to 0.7085926350620058
epoch:0 | Train Loss:0.8570564294733652 | Validation loss:0.6572062960929341
Validation loss Decreased from 0.7085926350620058 to 0.6572062960929341
epoch:0 | Train Loss:0.7870819641918433 | Validation loss:0.6549994680616591
Validation loss Decreased from 0.6572062960929341 to 0.6549994680616591
epoch:0 | Train Loss:0.7580922531410003 | Validation loss:0.6315210031138526
Validation loss Dec

Some weights of the model checkpoint at /clrp_roberta_base_chk were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /clrp_roberta_base_chk and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Fold: 1
Epoch Started:0
epoch:0 | Train Loss:1.1066762208938599 | Validation loss:1.2153713073995378
Validation loss Decreased from 9999 to 1.2153713073995378
epoch:0 | Train Loss:1.0005325566638599 | Validation loss:0.8558776817388005
Validation loss Decreased from 1.2153713073995378 to 0.8558776817388005
epoch:0 | Train Loss:0.8771779054687137 | Validation loss:0.7444211981362767
Validation loss Decreased from 0.8558776817388005 to 0.7444211981362767
epoch:0 | Train Loss:0.8128736411652914 | Validation loss:0.6984637007117271
Validation loss Decreased from 0.7444211981362767 to 0.6984637007117271
epoch:0 | Train Loss:0.7719919447805367 | Validation loss:0.6567064598202705
Validation loss Decreased from 0.6984637007117271 to 0.6567064598202705
epoch:0 | Train Loss:0.7133531163276081 | Validation loss:0.6417399495840073
Validation loss Decreased from 0.6567064598202705 to 0.6417399495840073
epoch:0 | Train Loss:0.7083028777615055 | Validation loss:0.5952000377906693
Validation loss Dec

Some weights of the model checkpoint at /clrp_roberta_base_chk were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /clrp_roberta_base_chk and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Fold: 2
Epoch Started:0
epoch:0 | Train Loss:1.4777355194091797 | Validation loss:1.2196833540995915
Validation loss Decreased from 9999 to 1.2196833540995915
epoch:0 | Train Loss:1.0937653888355603 | Validation loss:0.9396527641349368
Validation loss Decreased from 1.2196833540995915 to 0.9396527641349368
epoch:0 | Train Loss:0.9802610874176025 | Validation loss:0.7521521904402309
Validation loss Decreased from 0.9396527641349368 to 0.7521521904402309
epoch:0 | Train Loss:0.8291968906798014 | Validation loss:0.6972550774614016
Validation loss Decreased from 0.7521521904402309 to 0.6972550774614016
epoch:0 | Train Loss:0.7935750157225365 | Validation loss:0.6855436282025443
Validation loss Decreased from 0.6972550774614016 to 0.6855436282025443
epoch:0 | Train Loss:0.7723565775840009 | Validation loss:0.5768228529228104
Validation loss Decreased from 0.6855436282025443 to 0.5768228529228104
epoch:0 | Train Loss:0.7585545778274536 | Validation loss:0.5683875911765628
Validation loss Dec

Some weights of the model checkpoint at /clrp_roberta_base_chk were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /clrp_roberta_base_chk and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Fold: 3
Epoch Started:0
epoch:0 | Train Loss:1.254278540611267 | Validation loss:1.1313355614741643
Validation loss Decreased from 9999 to 1.1313355614741643
epoch:0 | Train Loss:0.9709238843484358 | Validation loss:0.8835806647936503
Validation loss Decreased from 1.1313355614741643 to 0.8835806647936503
epoch:0 | Train Loss:0.9141639371713003 | Validation loss:0.8484592371516757
Validation loss Decreased from 0.8835806647936503 to 0.8484592371516757
epoch:0 | Train Loss:0.8656892170829158 | Validation loss:0.6269961024324099
Validation loss Decreased from 0.8484592371516757 to 0.6269961024324099
epoch:0 | Train Loss:0.8117559573999266 | Validation loss:0.6070783568753136
Validation loss Decreased from 0.6269961024324099 to 0.6070783568753136
epoch:0 | Train Loss:0.7220978457250713 | Validation loss:0.5685998242762353
Validation loss Decreased from 0.6070783568753136 to 0.5685998242762353
epoch:0 | Train Loss:0.6964055839741584 | Validation loss:0.5159831982519891
Validation loss Decr

Some weights of the model checkpoint at /clrp_roberta_base_chk were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /clrp_roberta_base_chk and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

Fold: 4
Epoch Started:0
epoch:0 | Train Loss:1.4734455347061157 | Validation loss:1.3700453614195187
Validation loss Decreased from 9999 to 1.3700453614195187
epoch:0 | Train Loss:1.1426657167347996 | Validation loss:0.951818413204617
Validation loss Decreased from 1.3700453614195187 to 0.951818413204617
epoch:0 | Train Loss:1.0877674136843 | Validation loss:0.93420120411449
Validation loss Decreased from 0.951818413204617 to 0.93420120411449
epoch:0 | Train Loss:1.0160935348080051 | Validation loss:0.811138242483139
Validation loss Decreased from 0.93420120411449 to 0.811138242483139
epoch:0 | Train Loss:1.0052358086516218 | Validation loss:0.7504057751761543
Validation loss Decreased from 0.811138242483139 to 0.7504057751761543


In [None]:
# List the contents of the Drive data folder (the same folder `data_path` points to).
os.listdir('./gdrive/MyDrive/Kaggle_CommonLit/')

['sample_submission.csv',
 'train.csv',
 'test.csv',
 'cleaned_train.csv',
 'cleaned_test.csv',
 'clrp_roberta_base_chk',
 'text.txt',
 'Transformer_basic.ipynb']

In [None]:
import random
import os
import numpy as np
import torch

# Inference-time settings. This rebinds `config`, replacing the wandb config used for training.
# NOTE(review): the keys differ from the training config ('learning_rate' instead of 'lr',
# and no 'wd' or 'valid_step'), so run() raises KeyError if it is called after this cell.
config = {
    'learning_rate':2e-5,
    'batch_size':32,
    'epochs':10,
    'nfolds':5,
    'seed':42,
    'max_len':256,
}

def seed_everything(seed=42):
    """Seed Python, numpy and torch (CPU and CUDA) and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # The interpreter reads PYTHONHASHSEED at start-up, so this only affects Python
    # processes launched after this call; it is exported for completeness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; harmless when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can differ between
    # runs, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
from torch.utils.data import Dataset

class CLRPDataset(Dataset):
    """Inference dataset: yields only the tokenizer output for each excerpt (no target).

    Args:
        df: Frame with an ``excerpt`` column.
        tokenizer: Callable applied to one excerpt with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Optional padding/truncation length. When omitted, the module-level
            ``config['max_len']`` is read each time an item is fetched, as before.
    """

    def __init__(self,df,tokenizer,max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self,idx):
        # An explicit max_len wins; otherwise fall back to the global config.
        max_len = config['max_len'] if self.max_len is None else self.max_len
        encode = self.tokenizer(self.excerpt[idx],return_tensors='pt',
                                max_length=max_len,
                                padding='max_length',truncation=True)
        return encode

    def __len__(self):
        return len(self.excerpt)

In [None]:
import torch.nn as nn

class AttentionHead(nn.Module):
    """Additive-attention pooling over dimension 1 of the input features.

    Each position is scored with ``V(tanh(W(x)))``; the scores are soft-maxed along
    dimension 1 and used to form a weighted sum of the input features.

    Args:
        in_features: Size of the last dimension of the input features.
        hidden_dim: Size of the intermediate projection used for scoring.
        num_targets: Unused; kept so existing calls keep working.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The pooled vector is a weighted sum of the *input* features, so its width is
        # `in_features`, not `hidden_dim` (the two only coincide when they are equal).
        self.out_features = in_features

    def forward(self, features):
        """Pool ``features`` along dimension 1.

        Args:
            features: Tensor whose dimension 1 is the axis to pool over and whose last
                dimension has size ``in_features`` (e.g. batch x sequence x hidden).

        Returns:
            Tensor with dimension 1 summed out, e.g. batch x ``in_features``.
        """
        att = torch.tanh(self.W(features))

        # One scalar score per position.
        score = self.V(att)

        # Normalise across positions. No padding mask is applied here, so padded
        # positions also receive attention weight.
        attention_weights = torch.softmax(score, dim=1)

        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

In [None]:
class Model(nn.Module):
    """Pretrained encoder + attention pooling + linear layer producing one value per input.

    Args:
        path: Checkpoint name or directory passed to ``AutoModel.from_pretrained``.
            Defaults to the previously hard-coded ``'../input/roberta-base'``; pass
            another location (e.g. ``'roberta-base'``) when that directory does not exist.
    """

    def __init__(self,path='../input/roberta-base'):
        super(Model,self).__init__()
        self.roberta = AutoModel.from_pretrained(path)
        # Take the layer width from the loaded encoder instead of hard-coding 768.
        hidden_size = self.roberta.config.hidden_size
        self.head = AttentionHead(hidden_size,hidden_size,1)
        self.dropout = nn.Dropout(0.1)
        # The pooled vector has the encoder's hidden size.
        self.linear = nn.Linear(hidden_size,1)

    def forward(self,**xb):
        """Run the encoder on the keyword inputs and return the regression output."""
        # Element 0 of the encoder output is the per-token hidden state sequence.
        x = self.roberta(**xb)[0]
        x = self.head(x)
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [None]:
from tqdm import tqdm
def get_prediction(df,path,device='cuda'):
    """Predict one value per row of ``df`` with the fold model stored at ``path``.

    Args:
        df: Frame with an ``excerpt`` column.
        path: File holding a ``state_dict`` saved with ``torch.save``.
        device: Device the model and the batches are moved to.

    Returns:
        1-D numpy array of predictions in the row order of ``df``.
    """
    model = Model()
    # map_location lets a checkpoint saved from a GPU be restored on whichever
    # device was requested, including CPU-only machines.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('roberta-base')

    test_ds = CLRPDataset(df,tokenizer)
    test_dl = DataLoader(test_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True)

    predictions = list()
    # Inference only: no_grad avoids building the autograd graph for every batch.
    with torch.no_grad():
        # Iterating the loader directly lets tqdm know the number of batches.
        for inputs in tqdm(test_dl):
            # Flatten every tensor to (batch_size, -1) before moving it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.cpu().detach().numpy().ravel().tolist()
            predictions.extend(outputs)

    torch.cuda.empty_cache()
    return np.array(predictions)

In [None]:
# Predict with every fold model (./model0/model0.bin, ./model1/model1.bin, ...) and
# average the results; the number of models follows config['nfolds'] (5 here).
fold_predictions = [
    get_prediction(test_data, f'./model{fold}/model{fold}.bin')
    for fold in range(config['nfolds'])
]

predictions = sum(fold_predictions) / len(fold_predictions)

1it [00:00,  5.42it/s]
1it [00:00,  5.59it/s]
1it [00:00,  6.00it/s]
1it [00:00,  5.69it/s]
1it [00:00,  5.69it/s]


In [None]:
# Start from the submission template loaded earlier as `sample_sub`; the notebook does not
# define `sample` anywhere before this cell. A copy keeps the loaded template unchanged.
sample = sample_sub.copy()
sample['target'] = predictions
sample.to_csv('submission.csv',index=False)

In [None]:
sample

Unnamed: 0,id,target
0,c0f722661,-0.511399
1,f0953f0a5,-0.540758
2,0df072751,-0.504366
3,04caf4e0c,-2.530457
4,0e63f8bea,-1.932504
5,12537fe78,-1.339906
6,965e592c0,0.219967
