In [1]:
import numpy as np 
import pandas as pd 
import os
       
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt 

import transformers
import random


import warnings
warnings.simplefilter('ignore')

scaler = torch.cuda.amp.GradScaler() # AMP gradient scaler for speed-up on GPU; not referenced by the inference cells visible in this notebook

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # automatically pick the GPU when available, otherwise the CPU
device

device(type='cuda')

In [3]:
# Directory that is scanned further down for the fold checkpoints (*.bin files).
result_path = "../input/common-model-210731"

In [4]:
SEED = 508

def random_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into a reproducible
    configuration.

    Args:
        SEED: integer seed applied to all generators.
    """
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # With benchmarking enabled cuDNN may auto-select a different kernel on
    # each run, which defeats the deterministic flag above; force it off.
    torch.backends.cudnn.benchmark = False

random_seed(SEED)

In [5]:
# Module-level BERT tokenizer loaded from the local bert-base-uncased directory;
# BERTDataSet.__getitem__ reads this global when encoding each excerpt.
tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")

In [6]:
# Load the competition test set; its `excerpt` column is what gets tokenized below.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test.head(3)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...


In [7]:
class BERTDataSet(Dataset):
    """Torch dataset that tokenizes raw text for BERT on the fly.

    Encoding uses the module-level ``tokenizer`` defined earlier in the
    notebook.

    Args:
        sentences: sequence of texts (list, NumPy array or pandas Series).
        max_length: number of tokens every example is truncated / padded to.
    """

    def __init__(self,sentences,max_length=500):
        self.sentences = sentences
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self,idx):
        """Return the encoded tensors for the item at position ``idx``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` long tensors.
        """
        # Samplers hand out positions 0..len-1. Plain [] on a pandas Series is a
        # label lookup and fails when the index is not a clean 0..n-1 range
        # (e.g. after filtering), so use positional access when it is available.
        sentences = self.sentences
        sentence = sentences.iloc[idx] if hasattr(sentences, "iloc") else sentences[idx]

        # Collapse every run of whitespace (including newlines) to one space.
        sentence = " ".join(str(sentence).split())

        bert_sens = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # [CLS],[SEP]
            max_length=self.max_length,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True)

        return {
            'ids': torch.tensor(bert_sens['input_ids'], dtype=torch.long),
            'mask': torch.tensor(bert_sens['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(bert_sens['token_type_ids'], dtype=torch.long),
        }

In [8]:
# Wrap the excerpt column so each row is tokenized lazily when it is requested.
test_dataset = BERTDataSet(test["excerpt"])

In [9]:
# Batch size passed to the test DataLoader.
test_batch = 2

In [10]:
# Unshuffled loader, so predictions come out in the same row order as `test`.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=test_batch,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

In [11]:
# Build the sequence-classification architecture with a single output (num_labels=1).
# Its weights are replaced by each checkpoint via load_state_dict inside predicting().
model = transformers.BertForSequenceClassification.from_pretrained('../input/bert-base-uncased',num_labels=1)

Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model chec

In [12]:
# Collect the checkpoint files stored in result_path (names ending in ".bin").
pthes = [os.path.join(result_path,s) for s in os.listdir(result_path) if s.endswith(".bin")]
print(pthes)
# map_location remaps tensors saved from a GPU onto the active device, so the
# checkpoints also load when the notebook falls back to CPU.
states = [torch.load(s, map_location=device) for s in pthes]

['../input/common-model-210731/model_BERT_fold_1.bin', '../input/common-model-210731/model_BERT_fold_3.bin', '../input/common-model-210731/model_BERT_fold_0.bin', '../input/common-model-210731/model_BERT_fold_4.bin', '../input/common-model-210731/model_BERT_fold_2.bin']


In [13]:
def predicting(
    test_dataloader,
    model,
    states
):
    """Run inference once per checkpoint and collect the predictions.

    Args:
        test_dataloader: iterable of batches; each batch is a mapping holding
            "ids", "mask" and "token_type_ids" tensors.
        model: module called as ``model(ids, mask, token_type_ids)`` whose
            output exposes a "logits" entry. Its weights are overwritten in
            place by every entry of ``states`` and it is left in eval mode.
        states: iterable of state dicts compatible with ``model``.

    Returns:
        A list with one NumPy array per state dict, holding the logits (last
        dimension squeezed) of all batches concatenated in dataloader order.
        An empty dataloader yields empty arrays.
    """
    allpreds = []

    # Loop-invariant: load_state_dict copies into the parameters that already
    # live on the target device, so the model only has to be moved once.
    model.to(device)

    for state in states:
        model.load_state_dict(state)
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for a in test_dataloader:
                ids = a["ids"].to(device)
                mask = a["mask"].to(device)
                tokentype = a["token_type_ids"].to(device)

                output = model(ids,mask,tokentype)
                fold_preds.append(output["logits"].squeeze(-1).cpu().numpy())

        # np.concatenate rejects an empty list; keep one (empty) entry per
        # checkpoint so the result always has len(states) elements.
        if fold_preds:
            allpreds.append(np.concatenate(fold_preds))
        else:
            allpreds.append(np.empty(0, dtype=np.float32))

    return allpreds

In [14]:
# Run every loaded checkpoint over the test set; one prediction array per checkpoint.
allpreds = predicting(test_dataloader,model,states)

In [15]:
# allpreds holds one array per checkpoint; transpose so that each row is a
# test excerpt and each column is one checkpoint's prediction.
findf = pd.DataFrame(allpreds).transpose()
findf

Unnamed: 0,0,1,2,3,4
0,-0.333861,-0.150501,-0.190157,-0.123478,-0.137475
1,-0.050891,0.003288,0.021352,0.136464,0.070705
2,-0.202172,-0.153717,-0.274389,-0.131783,-0.201915
3,-1.738132,-1.854158,-1.943672,-1.638427,-1.754243
4,-1.315736,-1.185454,-1.336614,-1.168904,-0.920662
5,-0.814535,-0.864818,-0.945195,-0.694724,-0.756116
6,0.112696,0.145249,0.149508,0.14178,0.176225


In [16]:
# Ensemble by averaging across the checkpoint columns (axis=1): one value per test row.
finpred = findf.mean(axis=1)
finpred

0   -0.187094
1    0.036184
2   -0.192795
3   -1.785726
4   -1.185474
5   -0.815077
6    0.145092
dtype: float64

In [17]:
# Load the sample submission; its id/target layout is filled in and written out below.
sample = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
sample

Unnamed: 0,id,target
0,c0f722661,0.0
1,f0953f0a5,0.0
2,0df072751,0.0
3,04caf4e0c,0.0
4,0e63f8bea,0.0
5,12537fe78,0.0
6,965e592c0,0.0


In [18]:
# Attach predictions by id instead of relying on index alignment, so the result
# stays correct even if sample_submission.csv is ordered differently from
# test.csv. finpred follows the row order of `test` (the loader is unshuffled).
sample["target"] = sample["id"].map(dict(zip(test["id"], finpred)))

In [19]:
# Display the submission frame after the target column has been filled in.
sample

Unnamed: 0,id,target
0,c0f722661,-0.187094
1,f0953f0a5,0.036184
2,0df072751,-0.192795
3,04caf4e0c,-1.785726
4,0e63f8bea,-1.185474
5,12537fe78,-0.815077
6,965e592c0,0.145092


In [20]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample.to_csv("submission.csv",index = False)