In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and echo the full path of every file,
# so the available datasets and checkpoints are visible in the cell output.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base/rust_model.ot
/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/README.md
/kaggle/input/roberta-base/tokenizer.json
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/tf_model.h5
/kaggle/input/roberta-base/dict.txt
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/roberta-base/flax_model.msgpack
/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv
/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv
/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv
/kaggle/input/jigsaw-toxic/epoch0-step17951.ckpt


In [2]:
# Load the labelled comments (id, comment_text and six toxicity label columns)
# and preview the first rows. `train_df` is not referenced again in the
# visible notebook; it is loaded here for inspection only.
TRAIN_CSV_PATH = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Load the comments that must be scored (columns: comment_id, text) and
# preview the first rows. The `text` column is what gets fed to the model.
COMMENTS_TO_SCORE_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
test_df = pd.read_csv(COMMENTS_TO_SCORE_PATH)
test_df.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes one raw comment per item, on access.

    Args:
        text: The comments to encode. Either a pandas-like object exposing a
            non-callable ``.values`` array (e.g. a ``Series``) or any other
            iterable of strings, which is materialized into a list.
        tokenizer: Object providing ``encode_plus(text, truncation=...,
            add_special_tokens=..., max_length=..., padding=...)`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Forwarded to the tokenizer as ``max_length``; inputs are
            truncated and padded (``padding='max_length'``) by the tokenizer.

    Raises:
        ValueError: If ``text`` is ``None``.
    """

    def __init__(self, text=None, tokenizer=None, max_length=None):
        if text is None:
            # Fail early with a clear message instead of an AttributeError
            # on `None.values`.
            raise ValueError("JigsawDataset requires `text` (a Series or a sequence of strings)")
        self.max_length = max_length
        self.tokenizer = tokenizer
        # Keep the original behavior for pandas objects (use the backing
        # array); accept plain lists/tuples/iterables as well.
        values = getattr(text, "values", None)
        if values is not None and not callable(values):
            self.text = values
        else:
            self.text = list(text)

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``torch.long`` tensors ``input_ids`` and
            ``attention_mask`` as produced by the tokenizer, plus
            ``position_ids`` = ``0..len(input_ids)-1``.
        """
        encoded = self.tokenizer.encode_plus(
            self.text[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
        )
        input_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # Derive the positions from the encoded length rather than from
            # self.max_length: identical when max_length is set, and still
            # valid (and length-matched) when max_length is None.
            'position_ids': torch.arange(input_ids.size(0)),
        }
    
class DataModule(pl.LightningDataModule):
    """Lightning data module that serves the comments to score for prediction.

    Only a predict dataloader is provided; no train/val/test loaders are
    defined here.

    Args:
        data: The raw comments, passed through to ``JigsawDataset`` as ``text``.
        tokenizer: Tokenizer passed through to ``JigsawDataset``.
        random_seed: Stored on the instance; not used by this class itself.
        max_seq_length: Passed to ``JigsawDataset`` as ``max_length``.
        batch_size: Number of comments per prediction batch (default 8,
            matching the previously hard-coded value).
    """

    def __init__(self, data, tokenizer, random_seed, max_seq_length, batch_size=8):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.random_seed = random_seed
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        # Populated by setup(); None until then.
        self.test_data = None

    def setup(self, stage=None):
        """Build the prediction dataset.

        ``stage`` is accepted for compatibility with the Lightning hook
        signature but is not inspected; the same dataset is built regardless.
        """
        self.test_data = JigsawDataset(
            text=self.data,
            tokenizer=self.tokenizer,
            max_length=self.max_seq_length,
        )

    def predict_dataloader(self):
        """Return a DataLoader over the prediction dataset.

        No ``shuffle`` argument is passed, so the loader's default ordering
        is used; downstream code assigns predictions back by position.
        """
        return DataLoader(self.test_data, batch_size=self.batch_size)

In [5]:
class Jigsaw_model(nn.Module):
    """Encoder backbone followed by dropout and a single-output linear head.

    Args:
        model: Backbone module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids``, ``attention_mask`` and
            ``position_ids`` keywords, return something whose element at
            index 1 has ``hidden_size`` features in its last dimension
            (presumably the pooled sentence representation -- confirm
            against the backbone used).
    """

    def __init__(self, model):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable
        # so saved weights continue to load.
        self.model = model
        self.drop = nn.Dropout(p=0.2)
        self.dense = nn.Linear(
            in_features=self.model.config.hidden_size,
            out_features=1,
        )

    def forward(self, x):
        """Score a batch.

        Args:
            x: Mapping with ``input_ids``, ``attention_mask`` and
               ``position_ids`` entries, forwarded to the backbone.

        Returns:
            The linear head's output for the batch (last dimension of size 1).
        """
        input_ids = x["input_ids"]
        attention_mask = x["attention_mask"]
        position_ids = x["position_ids"]
        backbone_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        pooled = backbone_outputs[1]
        return self.dense(self.drop(pooled))

class PredictModel(pl.LightningModule):
    """Thin Lightning wrapper that runs a wrapped model for prediction.

    Args:
        model: The module to call on each batch; stored as ``self.model``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Delegate directly to the wrapped model and return its output."""
        return self.model(x)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the model output for one batch.

        ``batch_idx`` and ``dataloader_idx`` are accepted but not used.
        """
        return self(batch)

    def predict_epoch_end(self, outputs):
        """Return ``outputs`` unchanged.

        NOTE(review): Lightning's documented prediction hook is named
        ``on_predict_epoch_end``; this method does not appear to be invoked
        by the Trainer -- confirm before relying on it.
        """
        return outputs

In [6]:
from transformers import AutoTokenizer, AutoConfig, AutoModel

random_seed = 42

# Seed every RNG Lightning knows about so the run is repeatable.
pl.seed_everything(random_seed)

# Tokenizer and architecture config come from the local roberta-base copy.
tokenizer = AutoTokenizer.from_pretrained("../input/roberta-base")
config = AutoConfig.from_pretrained('../input/roberta-base')
# Every comment is padded/truncated to the model's position-embedding size.
max_seq_length = config.max_position_embeddings
# The backbone is built from the config only; its weights are supplied by
# the fine-tuned checkpoint loaded below.
language_model = AutoModel.from_config(config)

# Load the checkpoint onto the CPU: the freshly built model lives on the CPU
# at this point, so restoring tensors to the device they were saved from
# would fail on a CPU-only machine and otherwise hold a redundant device copy.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
trained_model = torch.load(
    "/kaggle/input/jigsaw-toxic/epoch0-step17951.ckpt",
    map_location="cpu",
)
model = Jigsaw_model(language_model)
model.load_state_dict(trained_model["state_dict"])

predict_model = PredictModel(model)
# NOTE(review): `gpus=` is the Trainer argument spelling used by the Lightning
# version this notebook was written for; newer releases may expect
# `accelerator`/`devices` instead -- confirm against the installed version.
trainer = pl.Trainer(gpus=1, deterministic=True)
data = DataModule(test_df["text"], tokenizer, random_seed, max_seq_length)
# One output per batch, in dataloader order.
prediction = trainer.predict(predict_model, data)

Predicting: 0it [00:00, ?it/s]

In [7]:
# `prediction` holds one output per batch; `.tolist()` turns each into a list
# of per-comment rows, and each row is itself a list of score(s).
# Flatten everything in a single pass. The earlier `sum(rows, [])` built a new
# list on every addition, which is quadratic in the number of comments.
result = [
    score
    for batch_output in prediction
    for row in batch_output.tolist()
    for score in row
]
result[:10]

[0.053538136184215546,
 0.05476636439561844,
 0.053219035267829895,
 0.060290977358818054,
 0.34828975796699524,
 0.052721939980983734,
 0.053493618965148926,
 0.05407945439219475,
 0.0536242239177227,
 0.3033179044723511]

In [8]:
# Use the sample submission as the template for the output frame.
# NOTE(review): scores are assigned by position, which assumes the sample
# submission rows are in the same order as the comments that were scored --
# confirm the comment_id columns line up.
submission = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
if len(result) != len(submission):
    raise ValueError(
        f"Got {len(result)} predictions for {len(submission)} submission rows"
    )
submission["score"] = result
scores = submission["score"]

# Min-max scale the raw scores to [0, 1]. If every score is identical the
# range is zero and the division would fill the column with NaN, so fall back
# to a constant 0.0 in that case.
score_range = scores.max() - scores.min()
if score_range == 0:
    submission["score"] = 0.0
else:
    submission["score"] = (scores - scores.min()) / score_range
submission.head()

Unnamed: 0,comment_id,score
0,114890,0.004414
1,732895,0.00611
2,1139051,0.003973
3,1434512,0.013738
4,2084821,0.411396


In [9]:
# Write the scored comments to the current working directory, omitting the
# DataFrame index column.
SUBMISSION_PATH = 'submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)