In [1]:
import numpy as np 
import pandas as pd 
import os
       
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt 

import transformers
import random

import warnings
warnings.simplefilter('ignore')
scaler = torch.cuda.amp.GradScaler() # GPUでの高速化。

from cfg import get_cfg
CFG=get_cfg()

In [3]:
SEED = 508

def random_seed(SEED):
    
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True

random_seed(SEED)

In [4]:
from preprocess import get_data
train=get_data()
sample_submission=pd.read_csv(CFG.path_test)
comments_to_score=pd.read_csv(CFG.path_test2)

In [5]:
print(train.head(),train.shape)
print(sample_submission.head(),sample_submission.shape)
print(comments_to_score.head(),comments_to_score.shape)

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation Why the edits made under my userna...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  " More I can't make any real suggestions on im...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  target  
0             0        0       0       0              0       0  
1             0        0       0       0              0       0  
2             0        0       0       0              0       0  
3             0        0       0       0              0       0  
4             0        0       0       0              0       0   (159571, 9)
   comment_id  score
0      114890    0.5
1      732895    0.5
2     1139051    0.5
3     1434512    0.5


# make fold

In [6]:
from sklearn.model_selection import KFold
skf=KFold(n_splits=5,shuffle=True,random_state=5)
train["fold"]=-1
X=train.sample(frac=1.)
for i,(_,val_idx) in enumerate(skf.split(X)):
    train.loc[val_idx,"fold"]=i

# Tokenizer

In [7]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [8]:
test_s = train["comment_text"].iloc[0]
test_s

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now. "

In [9]:
result1=tokenizer.encode_plus(test_s)
result1

{'input_ids': [101, 7526, 2339, 1996, 10086, 2015, 2081, 2104, 2026, 5310, 18442, 13076, 12392, 2050, 5470, 2020, 16407, 1029, 2027, 4694, 1005, 1056, 3158, 9305, 22556, 1010, 2074, 8503, 2006, 2070, 3806, 2044, 1045, 5444, 2012, 2047, 2259, 14421, 6904, 2278, 1012, 1998, 3531, 2123, 1005, 1056, 6366, 1996, 23561, 2013, 1996, 2831, 3931, 2144, 1045, 1005, 1049, 3394, 2085, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenizer.decode(result1["input_ids"])

"[CLS] explanation why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now. [SEP]"

In [11]:
# sen_length = []

# for sentence in tqdm(train["comment_text"]):

#     token_words = tokenizer.encode_plus(sentence)["input_ids"]
#     sen_length.append(len(token_words))

# print('maxlenth of all sentences are  ', max(sen_length))

In [12]:
test_s

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now. "

In [13]:
len(test_s.split(" "))


44

In [14]:
result2 = tokenizer.encode_plus(
    test_s,
    add_special_tokens = True, # Whether to insert [CLS], [SEP]
    max_length = 2502, # Align the number of words using padding and transcription
    pad_to_max_length = True, # Put [PAD] in the blank area
    
    truncation = True # Cutout function. For example, max_length10 is a function that makes only the first 10 characters. I got an alert if I didn't put it in, so I'll put it in
)

In [15]:
result2

{'input_ids': [101, 7526, 2339, 1996, 10086, 2015, 2081, 2104, 2026, 5310, 18442, 13076, 12392, 2050, 5470, 2020, 16407, 1029, 2027, 4694, 1005, 1056, 3158, 9305, 22556, 1010, 2074, 8503, 2006, 2070, 3806, 2044, 1045, 5444, 2012, 2047, 2259, 14421, 6904, 2278, 1012, 1998, 3531, 2123, 1005, 1056, 6366, 1996, 23561, 2013, 1996, 2831, 3931, 2144, 1045, 1005, 1049, 3394, 2085, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [16]:
tokenizer.decode(result2["input_ids"])

"[CLS] explanation why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [17]:
result3 = tokenizer.encode_plus(
    test_s,
    add_special_tokens = True, # Whether to insert [CLS], [SEP]
    max_length = 10, # Align the number of words using padding and transcription
    pad_to_max_length = True, # Put [PAD] in the blank area
    
    truncation = True # Cutout function. For example, max_length10 is a function that makes only the first 10 characters. I got an alert if I didn't put it in, so I'll put it in
)

In [18]:
result3


{'input_ids': [101, 7526, 2339, 1996, 10086, 2015, 2081, 2104, 2026, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
train = train.sort_values("target").reset_index(drop=True)

In [20]:
p_train = train[train["fold"]!=0].reset_index(drop=True)
p_valid = train[train["fold"]==0].reset_index(drop=True)

In [21]:
class BERTDataSet(Dataset):
    
    def __init__(self,sentences,targets):
        
        self.sentences = sentences
        self.targets = targets
        
    def __len__(self):
        
        return len(self.sentences)
    
    def __getitem__(self,idx):
        
        sentence = self.sentences[idx]
        
        bert_sens = tokenizer.encode_plus(
                                sentence,
                                add_special_tokens = True, 
                                max_length = CFG.max_sens,#2502, # 上で314に設定しています
                                pad_to_max_length = True, 
                                return_attention_mask = True)

        ids = torch.tensor(bert_sens['input_ids'], dtype=torch.long)
        mask = torch.tensor(bert_sens['attention_mask'], dtype=torch.long)
        token_type_ids = torch.tensor(bert_sens['token_type_ids'], dtype=torch.long)
     
            
        target = torch.tensor(self.targets[idx],dtype=torch.float)
        
        return {
                'ids': ids,
                'mask': mask,
                'token_type_ids': token_type_ids,
                'targets': target
            }

In [22]:
train_dataset = BERTDataSet(p_train["comment_text"],p_train["target"])
valid_dataset = BERTDataSet(p_valid["comment_text"],p_valid["target"])

In [23]:
train_dataset[0]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
         18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
          1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
          3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
          1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
          1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [24]:
train_batch = 4
valid_batch = 4

In [25]:
train_dataloader = DataLoader(train_dataset,batch_size=train_batch,shuffle = True,num_workers=0,pin_memory=True)
valid_dataloader = DataLoader(valid_dataset,batch_size=valid_batch,shuffle = False,num_workers=0,pin_memory=True)

In [26]:
for a in train_dataloader:
    print(a)
    break

{'ids': tensor([[ 101, 2115, 2862,  ...,    0,    0,    0],
        [ 101, 2040, 1996,  ...,    0,    0,    0],
        [ 101, 7929, 1011,  ...,    0,    0,    0],
        [ 101, 2059, 2017,  ...,    0,    0,    0]]), 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'targets': tensor([0., 1., 0., 0.])}


In [27]:
model = transformers.BertForSequenceClassification.from_pretrained(CFG.path_ref_model,num_labels=1)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [28]:
model.to(CFG.device)
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [29]:
for a in train_dataloader:
    ids = a["ids"].to(CFG.device)
    mask = a["mask"].to(CFG.device)
    tokentype = a["token_type_ids"].to(CFG.device)
    print(ids.shape,mask.shape)
    output = model(ids,mask)
    break

torch.Size([4, 512]) torch.Size([4, 512])


In [30]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3419],
        [-0.6657],
        [-0.6734],
        [-0.8861]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [31]:
output["logits"]

tensor([[-0.3419],
        [-0.6657],
        [-0.6734],
        [-0.8861]], device='cuda:0', grad_fn=<AddmmBackward>)

In [32]:
output["logits"].shape

torch.Size([4, 1])

In [33]:
output["logits"].squeeze(-1)

tensor([-0.3419, -0.6657, -0.6734, -0.8861], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [34]:
output = output["logits"].squeeze(-1)

In [35]:
output

tensor([-0.3419, -0.6657, -0.6734, -0.8861], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [36]:
from transformers import AdamW
LR=2e-5
optimizer = AdamW(model.parameters(), LR,betas=(0.9, 0.999), weight_decay=1e-2) 

In [37]:
from transformers import get_linear_schedule_with_warmup


epochs = 1

train_steps = int(len(p_train)/train_batch*epochs)
print(train_steps)

num_steps = int(train_steps*0.1)

scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

31914


In [38]:
def loss_fn(output,target):
    return nn.BCEWithLogitsLoss()(output,target)

In [39]:
def training(
    train_dataloader,
    model,
    optimizer,
    scheduler
):
    
    model.train()
    torch.backends.cudnn.benchmark = True

    allpreds = []
    alltargets = []

    for a in train_dataloader:

        losses = []

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            ids = a["ids"].to(CFG.device,non_blocking=True)
            mask = a["mask"].to(CFG.device,non_blocking=True)
            tokentype = a["token_type_ids"].to(CFG.device,non_blocking=True)

            output = model(ids,mask)
            output = output["logits"].squeeze(-1)

            target = a["targets"].to(CFG.device,non_blocking=True)

            loss = loss_fn(output,target)


            # For scoring
            losses.append(loss.item())
            allpreds.append(output.detach().cpu().numpy())
            alltargets.append(target.detach().squeeze(-1).cpu().numpy())

        scaler.scale(loss).backward() # backwards of loss
        scaler.step(optimizer) # Update optimizer
        scaler.update() # scaler update

        scheduler.step() # Update learning rate schedule

        # Combine dataloader minutes

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)

    # I don't use loss, but I collect it

    losses = np.mean(losses)

    # Score with rmse
    train_rme_loss = np.sqrt(mean_squared_error(alltargets,allpreds))

    return losses,train_rme_loss

In [40]:
train_dataloader = DataLoader(train_dataset,batch_size=train_batch,shuffle = True,num_workers=0,pin_memory=True)
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

losses,train_rme_loss = training(train_dataloader,model,optimizer,scheduler)

print(losses,train_rme_loss)

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 8.00 GiB total capacity; 6.10 GiB already allocated; 42.52 MiB free; 6.13 GiB reserved in total by PyTorch)