In [None]:
import os
import math
import random
import time
import re
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
# Arguments: the optimizer, the number of warmup steps (during which the learning rate ramps up), and the total number of training steps.
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold,StratifiedKFold
import gc

# Enable automatic garbage collection.
gc.enable()

In [None]:
# Cross-validation and training hyperparameters.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 4
# Every excerpt is padded/truncated to this many tokens by the tokenizer call in LitDataset.
MAX_LEN = 300
# Evaluation schedule as (val_rmse threshold, eval period in steps) pairs, checked in order by train():
# the first pair whose threshold is <= the current validation RMSE sets the period. With these values
# the model is evaluated every 16 steps while val RMSE >= 0.465 and every 8 steps once it drops below,
# trading evaluation cost against the chance of catching the best checkpoint.
EVAL_SCHEDULE = [(0.465,16),(-1., 8)]
# Directory holding the pretrained model weights/config and the tokenizer files.
ROBERTA_PATH = "../input/clrp-roberta-large/clrp_roberta_large"
TOKENIZER_PATH = "../input/clrp-roberta-large/clrp_roberta_large"
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for reproducible runs.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Only affects interpreters started after this point (e.g. DataLoader
    # worker processes); the hash seed of the running kernel is already fixed.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # Pin the autotuner off as well: if an earlier cell enabled benchmarking,
    # cuDNN could pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [None]:
# Load the training data and add a binned copy of the target, used later to stratify the CV folds.
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
# Number of bins: floor(1 + log2(number of rows)).
num_bins = int(np.floor(1 + np.log2(len(train_df))))
# labels=False stores integer bin indices in the `bins` column; retbins=True also returns the bin edges.
train_df['bins'],bins=pd.cut(train_df['target'],num_bins,labels=False,retbins=True)

In [None]:
# Load the test excerpts and the sample submission file that is filled in at the end of the notebook.
test_df= pd.read_csv('../input/commonlitreadabilityprize/test.csv')
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Remove incomplete entries, if any: rows whose target and standard error are both 0.
incomplete_rows = (train_df["target"] == 0) & (train_df["standard_error"] == 0)
train_df = train_df.loc[~incomplete_rows].reset_index(drop=True)

In [None]:
# Module-level tokenizer; LitDataset reads this global when it encodes the excerpts.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [None]:
class LitDataset(Dataset):
    """Dataset of tokenized excerpts, optionally paired with regression labels.

    All excerpts are tokenized once at construction time with the module-level
    ``tokenizer`` and padded/truncated to ``MAX_LEN`` tokens.

    Args:
        df: DataFrame with an ``excerpt`` column; ``target`` and
            ``standard_error`` columns are also required unless
            ``inference_only`` is True.
        inference_only: When True, no labels are read and items contain only
            the model inputs.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df["excerpt"].tolist()

        # Labels exist only for training/validation data.
        if not inference_only:
            self.target = torch.tensor(df["target"].values, dtype=torch.float32)
            self.standard_error = torch.tensor(df["standard_error"].values,
                                               dtype=torch.float32)

        # Encode the whole frame up front so __getitem__ is a cheap lookup.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for inference, otherwise
        ``(input_ids, attention_mask, target, standard_error)``."""
        input_ids, attention_mask = (
            torch.tensor(self.encoded[key][index])
            for key in ("input_ids", "attention_mask")
        )

        if self.inference_only:
            return (input_ids, attention_mask)

        return (input_ids, attention_mask,
                self.target[index], self.standard_error[index])

In [None]:
class LitModel(nn.Module):
    """Pretrained transformer encoder with attention pooling and two regression heads.

    The last hidden layer of the encoder is pooled into one vector per sequence
    by a learned attention over tokens. ``regressor1`` predicts the target and
    ``regressor2`` predicts its standard error from that same vector.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        # See https://huggingface.co/transformers/main_classes/configuration.html#transformers.PretrainedConfig
        config.update({"output_hidden_states": True,  # return the hidden states of every layer
                       "hidden_dropout_prob": 0.0,    # disable dropout in the embeddings, encoder and pooler
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Take the width from the checkpoint instead of hard-coding 1024 so the
        # heads also fit encoders of a different size.
        hidden_size = config.hidden_size

        # Scores every token of the sequence; Softmax(dim=1) normalizes the
        # scores across the token axis.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 700),
            nn.Tanh(),
            nn.Linear(700, 1),
            nn.Softmax(dim=1)
        )

        self.regressor1 = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

        self.regressor2 = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Run the encoder and both heads.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Same shape as ``input_ids``; 1 for real tokens and
                0 for padding.

        Returns:
            Tuple ``(target_pred, std_pred)`` with the outputs of ``regressor1``
            and ``regressor2``.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # Raw (pre-softmax) attention score per token.
        scores = self.attention[:-1](last_layer_hidden_states)

        # Zeroing the hidden states of padded tokens does not remove them from
        # the softmax: they still receive a bias-driven score and therefore a
        # non-zero weight. Push their scores to the most negative finite value
        # so their weight underflows to zero instead.
        is_padding = attention_mask.unsqueeze(-1) == 0
        scores = scores.masked_fill(is_padding, torch.finfo(scores.dtype).min)

        weights = self.attention[-1](scores)

        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # regressor2 outputs the predicted standard_error.
        return self.regressor1(context_vector), self.regressor2(context_vector)

In [None]:
def eval_mse(model, data_loader):
    """Evaluates the mean squared error of the |model| on |data_loader|.

    Args:
        model: Callable returning ``(target_pred, std_pred)`` for
            ``(input_ids, attention_mask)``; it is switched to eval mode.
        data_loader: Yields ``(input_ids, attention_mask, target,
            standard_error)`` batches.

    Returns:
        Tuple ``(target_mse, std_mse)``: the MSE of the target predictions and
        of the standard-error predictions, averaged over the samples that were
        actually evaluated.

    Raises:
        ValueError: If |data_loader| yields no samples.
    """
    model.eval()
    squared_error = nn.MSELoss(reduction="sum")
    target_sse = 0.0
    std_sse = 0.0
    num_samples = 0

    with torch.no_grad():
        for input_ids, attention_mask, target, standard_error in data_loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)
            standard_error = standard_error.to(DEVICE)

            target_pred, std_pred = model(input_ids, attention_mask)

            target_sse += squared_error(target_pred.flatten(), target).item()

            # The standard-error MSE is tracked for monitoring only.
            std_sse += squared_error(std_pred.flatten(), standard_error).item()

            # Count what was really seen: dividing by len(dataset) would
            # understate the error for loaders that drop the last batch.
            num_samples += input_ids.shape[0]

    if num_samples == 0:
        raise ValueError("eval_mse received a data_loader with no samples")

    return target_sse / num_samples, std_sse / num_samples

In [None]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    The loader must yield ``(input_ids, attention_mask)`` batches. Only the
    first model output (the target prediction) is kept; the standard-error
    prediction is discarded.
    """
    model.eval()

    # Preallocate one slot per sample and fill it batch by batch.
    predictions = np.zeros(len(data_loader.dataset))
    cursor = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            target_pred, _unused_std_pred = model(input_ids.to(DEVICE),
                                                  attention_mask.to(DEVICE))

            batch_end = cursor + target_pred.shape[0]
            predictions[cursor:batch_end] = target_pred.flatten().cpu()
            cursor = batch_end

    return predictions

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS,alpha=1):
    """Train |model|, periodically validate it and checkpoint the best weights.

    The loss of a batch is ``(target SSE + alpha * standard-error SSE) / batch
    size``. Validation runs every ``eval_period`` optimizer steps; the period
    starts at ``EVAL_SCHEDULE[0][1]`` and is re-selected from ``EVAL_SCHEDULE``
    after every evaluation based on the current validation RMSE.

    Args:
        model: Module returning ``(target_pred, std_pred)``.
        model_path: File the best ``state_dict`` is saved to.
        train_loader: Yields ``(input_ids, attention_mask, target,
            standard_error)`` training batches.
        val_loader: Loader passed to ``eval_mse`` for validation.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional learning-rate scheduler stepped once per batch.
        num_epochs: Number of passes over |train_loader|.
        alpha: Weight of the standard-error loss term.

    Returns:
        The best validation RMSE of the target, or ``None`` when training was
        too short for any evaluation to run.
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]  # period of the first schedule entry
    squared_error = nn.MSELoss(reduction="sum")

    start = time.time()

    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target,standard_error) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)
            standard_error = standard_error.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse leaves the model in eval mode, so switch back every step.
            model.train()

            target_pred, std_pred = model(input_ids, attention_mask)

            # Sum the squared errors first, then average over the batch.
            target_mse = squared_error(target_pred.flatten(), target)
            std_mse = squared_error(std_pred.flatten(), standard_error)

            sum_mse = (target_mse + alpha*std_mse)/len(input_ids)

            sum_mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # Evaluate once eval_period steps have passed since the last evaluation.
            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_target_mse, val_std_mse = eval_mse(model, val_loader)
                val_rmse = math.sqrt(val_target_mse)

                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}\n std_error_val_mse:{val_std_mse}")

                # Pick the period of the first schedule entry whose threshold
                # the current RMSE still reaches; lower RMSE means more
                # frequent evaluation.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # Compare against None explicitly: a truthiness test would
                # treat a best score of exactly 0.0 as "no score yet".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Restart the timer so evaluation time is not billed to training.
                start = time.time()

            step += 1

    return best_val_rmse


In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with explicit parameter groups for |model|.

    The model's named parameters are split by position: the last two belong to
    ``regressor2``, the two before them to ``regressor1``, the four before
    those to the attention head, and everything else to the encoder. Each
    encoder parameter gets a group of its own; each head gets one group. All
    groups currently share the same learning rate and weight decay.
    """
    lr = 2e-5
    weight_decay = 1e-2

    def make_group(params):
        return {"params": params, "lr": lr, "weight_decay": weight_decay}

    named_parameters = list(model.named_parameters())
    encoder_named = named_parameters[:-8]
    head_slices = (
        named_parameters[-8:-4],  # attention
        named_parameters[-4:-2],  # regressor1
        named_parameters[-2:],    # regressor2
    )

    # One group per encoder parameter, followed by one group per head.
    parameters = [make_group(param) for _name, param in encoder_named]
    parameters.extend(
        make_group([param for _name, param in head]) for head in head_slices
    )

    return AdamW(parameters)

In [None]:
gc.collect()

SEED = 150
# Best validation RMSE of each fold.
list_val_rmse = []

kfold = StratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

# Stratify on the binned target so every fold sees a similar target distribution.
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df,train_df['bins'])):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    # Each fold saves its best weights to its own file.
    model_path = f"model_{fold + 1}.pth"

    set_random_seed(SEED + fold)

    # split() yields positional indices, so select rows by position; this stays
    # correct even if train_df's index is not a clean 0..n-1 range.
    train_dataset = LitDataset(train_df.iloc[train_indices])
    val_dataset = LitDataset(train_df.iloc[val_indices])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)

    model = LitModel().to(DEVICE)

    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)

    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, scheduler=scheduler,alpha=1))

    # The optimizer (and the scheduler through it) keeps references to the
    # model parameters and their Adam state, so all three must be dropped for
    # the GPU memory to be released before the next fold allocates its model.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    # Running estimate over the folds finished so far.
    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())

In [None]:
# Tokenize the test set for inference (no labels are read).
# NOTE(review): the following cell builds `test_dataset` again with the same arguments,
# so this cell appears redundant and tokenizes the test set a second time.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
# Predict the test set with the best checkpoint of every trained fold;
# row i of all_predictions holds the predictions of fold i + 1.
all_predictions = np.zeros((len(list_val_rmse), len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for index in range(len(list_val_rmse)):
    model_path = f"model_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # Deserialize onto the CPU first: checkpoints written on a GPU then also
    # load on a CPU-only runtime, and no second copy of the weights lands on
    # the GPU before the model itself is moved there.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    del model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Ensemble: average the per-fold predictions for every test row.
predictions = all_predictions.mean(axis=0)
# NOTE(review): attribute assignment only overwrites an existing `target` column —
# presumably the sample submission already has one; confirm.
submission_df.target = predictions
#print(submission_df)
# Write the submission file without the DataFrame index.
submission_df.to_csv("submission.csv", index=False)