In [6]:
import gc
from collections import Counter

import numpy as np
import pandas as pd
import torch
from scipy.stats import rankdata
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModel, AdamW, AutoTokenizer

from config import ConfigSeq2SeqMultiLabel
from models.classifiers import RobertaMultiLabel
from parser.datasets import DataSetSeq2SeqMultiLabel
from torch_utils.eval import eval
from torch_utils.inference import predict
from torch_utils.trainer import train_epoch, fit
from torch_utils.utils import set_seed, get_device, set_cv_dataset_partitions

ImportError: cannot import name 'Counter' from 'itertools' (unknown location)

In [None]:
class Config(ConfigSeq2SeqMultiLabel):
    """Notebook-level settings for the multi-label RoBERTa toxicity classifier.

    Every value is a class attribute, so the class object itself is passed
    around as the configuration (no instance is created in this notebook).
    """

    # ---- runtime ----
    DEVICE = get_device()  # resolved once, when the class body is executed

    # ---- data ----
    LABEL_COLUMNS = [
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    ]
    # Column holding the cross-validation fold id of each row.
    # (sic) the attribute name is misspelled, but other cells read it under this name.
    PARITION_COLUMN = 'kfold'

    # ---- model ----
    MODEL_NAME = '../input/roberta-base'
    MODEL_DROPOUT = 0.2
    MODEL_HIDDEN_STATES = 768
    MODEL_LABELS = len(LABEL_COLUMNS)  # one output per label column

    # ---- tokenizer ----
    MAX_LENGTH = 128

    # ---- training / evaluation ----
    TRAIN_BATCH_SIZE = 64
    EVAL_BATCH_SIZE = 64
    TEST_BATCH_SIZE = 64
    LEARNING_RATE = 3e-5
    EPOCHS = 3
    N_FOLDS = 5

In [None]:
def get_data_loaders(df: pd.DataFrame, df_torch_parser: "type[DataSetSeq2SeqMultiLabel]", config: "Config",
                     tokenizer: "AutoTokenizer", kfold: int, text_column: str):
    """Build the training and held-out data loaders for one cross-validation fold.

    Rows whose ``config.PARITION_COLUMN`` equals ``kfold`` form the held-out
    split; every other row goes to the training split.

    Args:
        df: frame containing the text column, the labels and the fold column.
        df_torch_parser: dataset *class* (not an instance); it is called with
            ``df``, ``tokenizer``, ``config`` and ``text_column`` keyword arguments.
        config: object exposing ``PARITION_COLUMN``, ``TRAIN_BATCH_SIZE`` and
            ``TEST_BATCH_SIZE``.
        tokenizer: tokenizer forwarded unchanged to ``df_torch_parser``.
        kfold: id of the fold to hold out.
        text_column: name of the text column, forwarded to ``df_torch_parser``.

    Returns:
        ``(data_loader_train, data_loader_test)``.

    Raises:
        KeyError: if ``df`` has no ``config.PARITION_COLUMN`` column.
    """
    if config.PARITION_COLUMN not in df.columns:
        raise KeyError(f"partition column '{config.PARITION_COLUMN}' not found in dataframe")

    is_held_out = df[config.PARITION_COLUMN] == kfold

    def _make_loader(frame, batch_size, shuffle):
        # Single place where the dataset and its loader are configured.
        dataset = df_torch_parser(df=frame,
                                  tokenizer=tokenizer,
                                  config=config,
                                  text_column=text_column)
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           num_workers=2,
                                           shuffle=shuffle,
                                           pin_memory=True,
                                           drop_last=False)

    # Held-out split first, then the training split (same construction order as before).
    # The held-out loader is NOT shuffled: evaluation must be deterministic and
    # its outputs must stay in row order.
    data_loader_test = _make_loader(df[is_held_out], config.TEST_BATCH_SIZE, shuffle=False)
    data_loader_train = _make_loader(df[~is_held_out], config.TRAIN_BATCH_SIZE, shuffle=True)

    return data_loader_train, data_loader_test

In [None]:
# Read data
df_multi_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_submmit = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Single source of truth for the label columns (same list the model is sized from).
label_columns = Config.LABEL_COLUMNS

# Assign one integer id per distinct label combination, used as the stratification key.
# (The previous dict(Counter(...)) lookup stored the *frequency* of each combination,
# so different combinations with equal counts were merged into one stratum.)
df_multi_label['tag_idx'] = df_multi_label.groupby(label_columns, sort=False).ngroup()
print(f'dataset size = {len(df_multi_label)}')

# Perform random under sampling: keep every toxic row and an equally sized random
# sample of non-toxic rows (capped at the number of non-toxic rows available).
df_multi_label['toxic_score'] = df_multi_label[label_columns].sum(axis=1)
df_multi_label_toxic = df_multi_label.loc[df_multi_label['toxic_score'] > 0]
df_multi_label_non_toxic = df_multi_label.loc[df_multi_label['toxic_score'] == 0]
n_non_toxic = min(len(df_multi_label_toxic), len(df_multi_label_non_toxic))
df_multi_label_non_toxic = df_multi_label_non_toxic.sample(n=n_non_toxic, random_state=41)
df_multi_label = pd.concat([df_multi_label_non_toxic, df_multi_label_toxic]).reset_index(drop=True)

# Assign the stratified cross-validation fold ids that get_data_loaders filters on.
# Only done when the column is not already present, so a pre-partitioned file is left untouched.
# Very rare label combinations (fewer rows than N_FOLDS) make scikit-learn emit a warning;
# they are still distributed over the folds.
if Config.PARITION_COLUMN not in df_multi_label.columns:
    fold_splitter = StratifiedKFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=41)
    fold_ids = np.full(len(df_multi_label), -1, dtype=int)
    for fold_id, (_, held_out_idx) in enumerate(fold_splitter.split(df_multi_label, df_multi_label['tag_idx'])):
        fold_ids[held_out_idx] = fold_id
    df_multi_label[Config.PARITION_COLUMN] = fold_ids

df_multi_label.toxic_score.value_counts()


In [None]:
# Training components
# -- tokenizer matching the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

# -- classifier, moved to the configured device
model = RobertaMultiLabel(config=Config)
model.to(Config.DEVICE)

# -- multi-label objective: BCEWithLogitsLoss applies the sigmoid itself,
#    so it is fed raw logits; the loss is averaged over all elements
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')

# -- optimiser and cosine learning-rate schedule
# NOTE(review): T_max is a fixed 500 steps; whether the scheduler is stepped per batch
# or per epoch is decided by the training helper — confirm it matches.
optimizer = AdamW(
    model.parameters(),
    lr=Config.LEARNING_RATE,
    weight_decay=1e-6,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)


In [None]:
# Inference loader over the comments to score. It is never shuffled, so the
# predictions stay aligned row-by-row with df_submmit.
df_torch_submmit = DataSetSeq2SeqMultiLabel(df=df_submmit, tokenizer=tokenizer, config=Config, inference_mode=True,
                                            text_column='text')

data_loader_submmit = torch.utils.data.DataLoader(df_torch_submmit, batch_size=Config.TEST_BATCH_SIZE, num_workers=2,
                                                  shuffle=False, pin_memory=True, drop_last=False)

fold_columns = [f'kfold_{fold}' for fold in range(Config.N_FOLDS)]
fold_scores = {}

# Execute training and inference
for kfold in range(Config.N_FOLDS):
    # Get data loaders
    train_loader, val_loader = get_data_loaders(df=df_multi_label, df_torch_parser=DataSetSeq2SeqMultiLabel,
                                                config=Config, tokenizer=tokenizer, kfold=kfold,
                                                text_column='comment_text')

    # Start every fold from a fresh model / optimizer / scheduler. Reusing one model
    # across folds would let fold k+1 begin from weights already trained on its own
    # validation rows (leakage), and the per-fold predictions would not be independent.
    model = RobertaMultiLabel(config=Config)
    model.to(Config.DEVICE)
    optimizer = AdamW(model.parameters(), lr=Config.LEARNING_RATE, weight_decay=1e-6)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)

    # Perform kfold training
    best_model, history_log = fit(model=model, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader,
                                  loss_fn=loss_fn, device=Config.DEVICE, epochs=Config.EPOCHS,
                                  model_path=f'model_multi_{kfold}.bin', scheduler=scheduler)

    # PERFORM MODEL PREDICTION
    # One score per comment: the mean of the per-label outputs of this fold's best model.
    predictions_multilabel = np.array(predict(model=best_model, data_loader=data_loader_submmit, device=Config.DEVICE))
    fold_scores[f'kfold_{kfold}'] = predictions_multilabel.mean(axis=1)

# Final score per comment = average over the folds.
preds_multi_label = pd.DataFrame(fold_scores)[fold_columns].mean(axis=1)


## MODEL EVALUATION

In [None]:
# MODEL EVALUATION "TOXIC CLASSIFICATION" DATASET
# Uses best_model and val_loader as left by the last cross-validation fold.
class_threshold = 0.3  # a label counts as predicted when its output is >= this value
predictions, targets = eval(model=best_model, data_loader=val_loader, device=Config.DEVICE)
outputs = np.array(predictions) >= class_threshold
print(targets[0:10])
print(outputs[0:10])
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score for toxic classification df = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# PERFORM EVALUATION VAL DATA FROM "TOXIC RANKING" DATASET OF LAST CV TRAINED MODEL
gc.collect()
val_predictions = {}
for toxic_column in ['less_toxic', 'more_toxic']:
    # PERFORM MODEL PREDICTION
    df_torch_val = DataSetSeq2SeqMultiLabel(df=df_val, tokenizer=tokenizer, config=Config, inference_mode=True,
                                            text_column=toxic_column)

    # shuffle must stay off: the two columns are scored in separate passes and are
    # compared row-by-row below, so both prediction arrays have to follow df_val's order.
    data_loader_val = torch.utils.data.DataLoader(df_torch_val, batch_size=Config.EVAL_BATCH_SIZE, num_workers=2,
                                                  shuffle=False, pin_memory=True, drop_last=False)

    predictions_multilabel = np.array(predict(model=best_model, data_loader=data_loader_val, device=Config.DEVICE))
    # One score per comment: mean over the per-label outputs.
    val_predictions[toxic_column] = predictions_multilabel.mean(axis=1)

# Fraction of pairs where the "more toxic" comment receives the strictly higher score.
metric = np.mean(val_predictions['less_toxic'] < val_predictions['more_toxic'])
print(f'Accuracy validation data = {metric}')

In [None]:
# Perform submission
# NOTE(review): submission.csv is written with the raw averaged scores; the ordinal
# rank is only assigned afterwards and therefore shows up in the preview, not in the
# file — confirm this is intended.
submission_columns = ['comment_id', 'score']
preview_rows = 100

df_submmit['score'] = preds_multi_label
df_submmit[submission_columns].to_csv("submission.csv", index=False)

# Replace the score by its rank (ties broken by order of appearance) for inspection.
df_submmit['score'] = rankdata(preds_multi_label, method='ordinal')
df_submmit.head(preview_rows)
