In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input/')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv
/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv
/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
# Load the labelled comments of the Toxic Comment Classification Challenge
# straight from the zipped CSV.
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
)
# Preview the first five rows.
train_df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Load the comments that have to be scored for the severity-rating competition.
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)
# Show the frame (pandas truncates the display to the first and last rows).
test_df

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...
...,...,...
7532,504235362,"Go away, you annoying vandal."
7533,504235566,This user is a vandal.
7534,504308177,""" \n\nSorry to sound like a pain, but one by f..."
7535,504570375,Well it's pretty fucking irrelevant now I'm un...


In [4]:
# Give the comment column the same name ("text") in both frames. The renamed
# frame is rebound rather than modified with inplace=True, so the frame object
# displayed by the earlier cell is not mutated behind the reader's back.
train_df = train_df.rename(columns={"comment_text": "text"})

# Stack train and test comments into one Series used for language-model
# pretraining; ignore_index=True yields a fresh 0..n-1 index.
text_data = pd.concat([train_df["text"], test_df["text"]], ignore_index=True)
text_data.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: text, dtype: object

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        text: Comments to encode. Either a pandas object (its ``.values`` array
            is used) or any other sequence of strings.
        labels: Optional per-comment targets aligned with ``text`` (pandas
            object or plain sequence). When omitted, items carry no ``"label"``.
        tokenizer: Object exposing an ``encode_plus`` method that returns a
            mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for truncation and padding.

    Raises:
        ValueError: If ``text`` or ``tokenizer`` is missing, or if ``labels``
            does not contain exactly one entry per comment.
    """

    def __init__(self, text=None, labels=None, tokenizer=None, max_length=None):
        if text is None:
            raise ValueError("JigsawDataset requires `text` (a sequence of comments)")
        if tokenizer is None:
            raise ValueError("JigsawDataset requires a `tokenizer`")

        self.max_length = max_length
        self.tokenizer = tokenizer
        self.text = self._as_indexable(text)
        self.labels = labels
        # Materialise the label values once instead of on every __getitem__ call.
        self._label_values = None if labels is None else self._as_indexable(labels)

        if self._label_values is not None and len(self._label_values) != len(self.text):
            raise ValueError(
                "`labels` must have one entry per comment: got "
                f"{len(self._label_values)} labels for {len(self.text)} texts"
            )

    @staticmethod
    def _as_indexable(values):
        """Return the ``.values`` array of a pandas object, or a list of anything else."""
        raw = getattr(values, "values", None)
        # ``callable`` filters out mappings, whose ``values`` is a method, not data.
        if raw is None or callable(raw):
            return list(values)
        return raw

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Encode comment ``idx`` and return a dict of tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (long tensors built
            from the tokenizer output), ``position_ids`` (0..len-1) and, when
            labels were supplied, ``label`` (float tensor of shape ``(1,)``).
        """
        encoded = self.tokenizer.encode_plus(
            self.text[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
        )
        input_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        instance = {
            'input_ids': input_ids,
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            # Derived from the encoded length so it always matches input_ids;
            # with padding='max_length' this equals arange(max_length).
            'position_ids': torch.arange(input_ids.shape[0]),
        }
        if self._label_values is not None:
            instance["label"] = torch.tensor(self._label_values[idx], dtype=torch.float).reshape(1)
        return instance
    
class DataModule(pl.LightningDataModule):
    """Lightning data module that wraps comments in ``JigsawDataset`` objects.

    Args:
        data: Source data. In pretraining mode this is passed to the dataset as
            the text itself; otherwise it is indexed with ``"text"`` and
            ``"labels"``.
        tokenizer: Tokenizer handed to every ``JigsawDataset``.
        random_seed: Seed for the train/validation split.
        max_seq_length: Length every comment is truncated/padded to.
        is_pretrain: ``True`` for unlabelled language-model pretraining,
            ``False`` for supervised fine-tuning.
        batch_size: Batch size of all three dataloaders (default 8).
        num_workers: Worker processes per dataloader (default 0, i.e. load in
            the main process).
    """

    def __init__(self, data, tokenizer, random_seed, max_seq_length, is_pretrain,
                 batch_size=8, num_workers=0):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.random_seed = random_seed
        self.max_seq_length = max_seq_length
        self.is_pretrain = is_pretrain
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train_data = None
        self.val_data = None
        self.test_data = None

    def _make_dataset(self, text, labels=None):
        """Build a JigsawDataset with this module's tokenizer and sequence length."""
        return JigsawDataset(text=text, labels=labels, tokenizer=self.tokenizer,
                             max_length=self.max_seq_length)

    @staticmethod
    def _text_only(data):
        """Return the ``"text"`` column of a frame-like object, or ``data`` unchanged."""
        columns = getattr(data, "columns", None)
        if columns is not None and "text" in columns:
            return data["text"]
        return data

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` ("fit", "predict" or None for both)."""
        if stage == "fit" or stage is None:
            # Hold out 10% of the data for validation; the split is seeded.
            train_df, val_df = train_test_split(self.data, test_size=0.1,
                                                random_state=self.random_seed)
            if self.is_pretrain:
                self.train_data = self._make_dataset(train_df)
                self.val_data = self._make_dataset(val_df)
            else:
                self.train_data = self._make_dataset(train_df["text"], train_df["labels"])
                self.val_data = self._make_dataset(val_df["text"], val_df["labels"])

        if stage == "predict" or stage is None:
            # A frame must be reduced to its text column first; otherwise each
            # dataset item would be a whole row instead of a comment string.
            self.test_data = self._make_dataset(self._text_only(self.data))

    def train_dataloader(self):
        """Training batches, reshuffled every epoch."""
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers, pin_memory=True)

    def val_dataloader(self):
        """Validation batches in fixed order."""
        return DataLoader(self.val_data, batch_size=self.batch_size,
                          num_workers=self.num_workers, pin_memory=True)

    def predict_dataloader(self):
        """Prediction batches in fixed order, so outputs line up with the input rows."""
        return DataLoader(self.test_data, batch_size=self.batch_size,
                          num_workers=self.num_workers)

In [6]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pytorch_lightning.callbacks import ModelCheckpoint
class Pretrain_model(pl.LightningModule):
    """Causal language-model pretraining of a freshly initialised transformer.

    Args:
        config: Hugging Face model config; the network is built from it with
            ``AutoModelForCausalLM.from_config`` (no pretrained weights loaded).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = AutoModelForCausalLM.from_config(self.config)

    def forward(self, x):
        """Run the LM on a batch dict and return the model outputs (loss first).

        ``x`` must provide ``input_ids``, ``attention_mask`` and ``position_ids``.
        """
        input_ids, attention_mask, position_ids = x["input_ids"], x["attention_mask"], x["position_ids"]
        # Every sequence is padded to a fixed length, so feeding input_ids as
        # labels unchanged would make the loss mostly "predict the pad token".
        # Label -100 is ignored by the language-modelling loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,
                             position_ids=position_ids, labels=labels)
        return outputs

    def training_step(self, batch, batch_idx):
        """Compute and log the LM loss for one training batch."""
        loss = self(batch)[0]
        self.log("train_loss", loss, on_step=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with a fixed learning rate of 2e-5."""
        return torch.optim.Adam(self.parameters(), lr=2e-5)

    def configure_callbacks(self):
        """Checkpoint on the ``val_loss`` value logged at the end of validation."""
        checkpoint = ModelCheckpoint(monitor="val_loss")
        return [checkpoint]

    def validation_step(self, batch, batch_idx):
        """Return the batch loss together with the batch size for later averaging."""
        loss = self(batch)[0]
        return loss, len(batch["input_ids"])

    def validation_epoch_end(self, validation_step_outputs):
        """Log ``val_loss`` as the mean of the batch losses weighted by batch size."""
        total_loss = 0
        cnt = 0
        for loss, batch_size in validation_step_outputs:
            # Weight by batch size so a smaller final batch is not over-represented.
            total_loss += loss * batch_size
            cnt += batch_size
        val_loss = total_loss / cnt
        self.log("val_loss", val_loss)
        return val_loss


In [7]:
import os
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
config = AutoConfig.from_pretrained('roberta-base')
# Decoder mode gives the model the left-to-right attention mask needed for
# causal-LM pretraining. Cross-attention is not enabled: no encoder states are
# ever passed to the model, so those layers would only add unused parameters.
config.is_decoder = True
random_seed = 42
# The datasets supply explicit position_ids 0..max_seq_length-1, so the whole
# position-embedding table can be used as the sequence length.
max_seq_length = config.max_position_embeddings
num_epochs = 1

# Seed python, numpy and torch (and dataloader workers) for reproducibility.
pl.seed_everything(random_seed, workers=True)

data = DataModule(text_data, tokenizer, random_seed, max_seq_length, True)
# Use one GPU when available and fall back to CPU otherwise.
trainer = pl.Trainer(max_epochs=num_epochs, gpus=int(torch.cuda.is_available()), deterministic=True)
model = Pretrain_model(config)
trainer.fit(model, data)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [8]:
import os
# Print every file Lightning wrote (hyper-parameters, event logs, checkpoints).
log_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('./lightning_logs/')
    for file_name in file_names
)
for file_path in log_file_paths:
    print(file_path)

./lightning_logs/version_0/hparams.yaml
./lightning_logs/version_0/events.out.tfevents.1639690355.eebaa9fe0f7d.23.0
./lightning_logs/version_0/checkpoints/epoch=0-step=18799.ckpt


In [9]:
# Build a single severity score per comment: the sum of the six flags, with
# "severe_toxic" counted twice. The weighting is applied to a copy, because
# doubling train_df["severe_toxic"] in place would double it again on every
# re-run of this cell and silently change the labels.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
weighted_flags = train_df[label_columns].copy()
weighted_flags["severe_toxic"] = weighted_flags["severe_toxic"] * 2
raw_severity = weighted_flags.sum(axis=1).astype(int)

# Divide by the largest score present so the most severe comment gets label 1.
train_df["labels"] = raw_severity / raw_severity.max()
train_df.head()

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0.0


In [10]:
class JigsawModel(pl.LightningModule):
    """Regression model: a wrapped encoder followed by dropout and a linear layer.

    The second element of the encoder output is passed through dropout
    (p=0.2) and projected to a single value; training minimises the
    mean-squared error against ``batch["label"]``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.drop = nn.Dropout(p=0.2)
        self.dense = nn.Linear(self.model.config.hidden_size, 1)
        self.loss = nn.MSELoss()

    def forward(self, x):
        """Return one prediction per sample for a batch dict ``x``."""
        encoder_outputs = self.model(
            input_ids=x["input_ids"],
            attention_mask=x["attention_mask"],
            position_ids=x["position_ids"],
        )
        # Index 1 of the encoder output feeds the regression head.
        return self.dense(self.drop(encoder_outputs[1]))

    def training_step(self, batch, batch_idx):
        """Compute, log and return the MSE of one training batch."""
        train_loss = self.loss(self(batch), batch["label"])
        self.log("train_loss", train_loss, on_step=True, logger=True)
        return train_loss

    def configure_optimizers(self):
        """Adam over all parameters, learning rate 2e-5."""
        return torch.optim.Adam(self.parameters(), lr=2e-5)

    def configure_callbacks(self):
        """Single checkpoint callback that monitors ``val_loss``."""
        return [ModelCheckpoint(monitor="val_loss")]

    def validation_step(self, batch, batch_idx):
        """Return ``(batch MSE, number of predictions)`` for epoch-level averaging."""
        predictions = self(batch)
        return self.loss(predictions, batch["label"]), len(predictions)

    def validation_epoch_end(self, validation_step_outputs):
        """Log and return the batch-size-weighted mean of the validation losses."""
        step_results = list(validation_step_outputs)
        weighted_loss_sum = sum(step_loss * n_rows for step_loss, n_rows in step_results)
        n_rows_total = sum(n_rows for _, n_rows in step_results)
        val_loss = weighted_loss_sum / n_rows_total
        self.log("val_loss", val_loss)
        return val_loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Predictions for one batch."""
        return self(batch)

    def predict_epoch_end(self, outputs):
        """Hand the collected outputs back unchanged."""
        return outputs

In [11]:
from transformers import AutoModel
# Same seeding call as the pretraining cell.
pl.seed_everything(random_seed, workers=True)
config = AutoConfig.from_pretrained('roberta-base')
pretrained_model = AutoModel.from_config(config)

# Load the pretraining checkpoint onto the CPU: the tensors are only copied into
# the encoder below, so they need not occupy GPU memory, and the load also works
# on a machine without a GPU.
checkpoint_path = "./lightning_logs/version_0/checkpoints/epoch=0-step=18799.ckpt"
state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"]

# Take every encoder weight from the checkpoint (stored under the
# "model.roberta." prefix). Pooler weights are kept at their fresh
# initialisation instead of being read from the checkpoint.
encoder_state = pretrained_model.state_dict()
new_dict = {
    key: value if "pooler" in key else state_dict["model.roberta." + key]
    for key, value in encoder_state.items()
}
pretrained_model.load_state_dict(new_dict)

num_epochs = 1
data = DataModule(train_df, tokenizer, random_seed, max_seq_length, False)
# Use one GPU when available and fall back to CPU otherwise.
trainer = pl.Trainer(max_epochs=num_epochs, gpus=int(torch.cuda.is_available()), deterministic=True)
model = JigsawModel(pretrained_model)
trainer.fit(model, data)

Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]