In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Reading data from csv

In [2]:
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
Y_columns = data.columns[2:]
Y = data[Y_columns].values.tolist()
X = data['comment_text'].tolist()

In [4]:
X[0], type(X[0]), type(X)

("Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 str,
 list)

In [5]:
type(Y), type(Y[0][0])

(list, int)

In [6]:
X_train, X_test, Y_train ,Y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
len(X_train), len(X_test)

(127656, 31915)

### Configuring models

In [8]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data.dataset import Dataset

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True, 
                                          strip_accents=True, 
                                          clean_text=True)

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=6
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
if torch.backends.mps.is_available():
    model.to('mps')

In [10]:
train_input_encoded = tokenizer(
    X_train,
    max_length=128,
    pad_to_max_length=True,
    truncation=True,
    return_tensors='pt'
)

test_input_encoded = tokenizer(
    X_test,
    max_length=128,
    pad_to_max_length=True,
    truncation=True,
    return_tensors='pt'
)



In [11]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float, device='mps')
        return item

In [12]:
train_dataset = TextClassifierDataset(train_input_encoded, Y_train)
test_dataset = TextClassifierDataset(test_input_encoded, Y_test)

train_dataset[0]

  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}


{'input_ids': tensor([  101, 26476, 13475,  2008,  2146,  4487,  1008,  1047,  1997,  2216,
          7975, 11865,  2278,  1008,  9413,  2015,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

## Training

In [13]:
training_arguments = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=64,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
  8%|▊         | 500/5985 [14:24<2:04:43,  1.36s/it]

{'loss': 0.0692, 'grad_norm': 0.2606276273727417, 'learning_rate': 4.582289055973267e-05, 'epoch': 0.25}


  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
 17%|█▋        | 1000/5985 [27:02<1:47:20,  1.29s/it]

{'loss': 0.0438, 'grad_norm': 0.17788130044937134, 'learning_rate': 4.164578111946533e-05, 'epoch': 0.5}


  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
 25%|██▌       | 1500/5985 [38:36<1:59:44,  1.60s/it]

{'loss': 0.0423, 'grad_norm': 0.317321240901947, 'learning_rate': 3.7468671679198e-05, 'epoch': 0.75}


  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
                                                     
 33%|███▎      | 1995/5985 [54:53<1:33:36,  1.41s/it]

{'eval_loss': 0.03781583532691002, 'eval_runtime': 229.8025, 'eval_samples_per_second': 138.88, 'eval_steps_per_second': 17.363, 'epoch': 1.0}


 33%|███▎      | 2000/5985 [55:23<23:18:57, 21.06s/it]

{'loss': 0.0411, 'grad_norm': 0.213401660323143, 'learning_rate': 3.329156223893066e-05, 'epoch': 1.0}


  item = {key: torch.tensor(val[idx], dtype=torch.long, device='mps') for key, val in self.encodings.items()}
 42%|████▏     | 2484/5985 [1:33:25<3:54:29,  4.02s/it] 

KeyboardInterrupt: 

In [14]:
torch.save(model.state_dict(), './model/toxic.pt')

In [31]:
model.to('cpu')
input = 'I wanna kill my bad habits'
input_embeddings = tokenizer(
    input,
    max_length=128,
    pad_to_max_length=True,
    truncation=True,
    return_tensors='pt'
)

In [32]:
output = model(**input_embeddings)
torch.sigmoid(output.logits)

tensor([[0.5790, 0.0009, 0.0110, 0.0048, 0.0124, 0.0020]],
       grad_fn=<SigmoidBackward0>)

In [33]:
!pip freeze > requirements.txt

python(9074) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
