In [1]:
!nvidia-smi

Mon Dec  6 21:35:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers
!pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!python -m pip install --user numpy scipy pandas

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 9.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 526 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [3]:
pip install -U scikit-learn



In [4]:
from typing import Tuple

import numpy as np
import pandas as pd
from pandas import Series
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments
from transformers import Trainer
from transformers.file_utils import cached_property
from transformers.trainer_callback import EarlyStoppingCallback
import torch
from torch.utils.data import Dataset
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [5]:
class UnsafeDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [6]:
def get_metrics(preds):
    preds, labels = preds.predictions, preds.label_ids
    # standard round approach
    pred_flat = np.argmax(preds, axis=1).flatten()
    pr, rec, f, _ = precision_recall_fscore_support(labels, pred_flat, average='weighted')

    print("precision", pr)
    print("recall", rec)
    print("fscore_weighted", f)

    # adjust threshold approach
    preds_adj = np.array([[float(el1), float(el2)] for el1, el2 in preds])
    preds_adj = softmax(preds_adj, axis=1)
    roc_auc = roc_auc_score(labels, preds_adj[:, 1])
    print("roc_auc", roc_auc)

    all_metrcis = []
    for threshold in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
        metrcis = []
        pred_labels = (preds_adj[:, 1] >= threshold).astype(int)
        metrcis.append(threshold)
        metrcis.append(round(f1_score(labels, pred_labels, average='weighted'), 2))
        metrcis.append(round(precision_score(labels, pred_labels), 2))
        metrcis.append(round(recall_score(labels, pred_labels), 2))
        metrcis.append(round(accuracy_score(labels, pred_labels), 2))
        all_metrcis.append(metrcis)

    df_metrics = pd.DataFrame(data=all_metrcis, columns=['threshold', 'f1', 'prec', 'rec', 'acc'])
    df_metrics = df_metrics.sort_values(by='f1', ascending=False)

    print(classification_report(labels, pred_flat))

    print(df_metrics.head())

    return f


def compute_metrics(pred_):
    labels = pred_.label_ids
    preds = pred_.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [8]:
model_name = 'DeepPavlov/rubert-base-cased-conversational'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

data = pd.read_csv("train.csv")
data_eval = pd.read_csv("val.csv")
data_test = pd.read_csv("test.csv")

print(data.describe())
# print(data.columns)
# print(data.info)

# приводим датасет в порядок
# фильтрация по ограничениям
label_name = 'inappropriate'
threshold = 0
data = data[(data[label_name] >= 1 - threshold) | (data[label_name] <= threshold)]
data_eval = data_eval[(data_eval[label_name] >= 1 - threshold) | (data_eval[label_name] <= threshold)]
data_test = data_test[(data_test[label_name] >= 1 - threshold) | (data_test[label_name] <= threshold)]

# окргуление до 0 или 1
data[label_name] = data[label_name].apply(round)
data_eval[label_name] = data_eval[label_name].apply(round)
data_test[label_name] = data_test[label_name].apply(round)

print(data.describe())

train_dataset = UnsafeDataset(tokenizer(data.text.tolist(),
                                        max_length=64,
                                        truncation=True,
                                        padding='longest'), data.inappropriate.tolist())

eval_dataset = UnsafeDataset(tokenizer(data_eval.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), data_eval.inappropriate.tolist())


test_dataset = UnsafeDataset(tokenizer(data_test.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), data_test.inappropriate.tolist())

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassi

       inappropriate  offline_crime  ...  social_injustice  human_labeled
count  130665.000000  130665.000000  ...     130665.000000  130665.000000
mean        0.310579       0.048903  ...          0.046057       0.053924
std         0.406632       0.215503  ...          0.209389       0.225869
min         0.000000       0.000000  ...          0.000000       0.000000
25%         0.000000       0.000000  ...          0.000000       0.000000
50%         0.050000       0.000000  ...          0.000000       0.000000
75%         0.760000       0.000000  ...          0.000000       0.000000
max         1.000000       1.000000  ...          1.000000       1.000000

[8 rows x 20 columns]
       inappropriate  offline_crime  ...  social_injustice  human_labeled
count   84903.000000   84903.000000  ...      84903.000000   84903.000000
mean        0.257046       0.046145  ...          0.052667       0.053155
std         0.437008       0.209605  ...          0.223103       0.224343
min         0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
device

device(type='cuda')

In [11]:
class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self) -> Tuple["torch.device", int]:
        return device


model.to(device)

for param in model.bert.parameters():
    param.requires_grad = True

training_args = TrainingArguments(
    output_dir='./unsafe/FINAL_VERS',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=5,  # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=0,  # number of warmup steps for learning rate scheduler
    weight_decay=1e-8,  # strength of weight decay
    learning_rate=2e-5,
    save_total_limit=2,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=2500,
    eval_steps=2500,
    save_steps=2500,
    evaluation_strategy='steps', metric_for_best_model='f1', greater_is_better=True, load_best_model_at_end=True
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=eval_dataset,  # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.add_callback(EarlyStoppingCallback(3))

In [12]:
training_args.device

device(type='cuda', index=0)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 84903
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 13270


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
2500,0.3144,0.265402,0.885704,0.885841,0.885985,0.885704
5000,0.1619,0.341383,0.885421,0.885606,0.885805,0.885421
7500,0.08,0.56177,0.8808,0.881671,0.882834,0.8808
10000,0.0391,0.743581,0.884572,0.883167,0.882473,0.884572


***** Running Evaluation *****
  Num examples = 10604
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-2500
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-2500/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-2500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10604
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-5000
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-5000/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10604
  Batch size = 32
Saving

TrainOutput(global_step=10000, training_loss=0.14885815353393556, metrics={'train_runtime': 2371.06, 'train_samples_per_second': 179.04, 'train_steps_per_second': 5.597, 'total_flos': 1.0521975548256e+16, 'train_loss': 0.14885815353393556, 'epoch': 3.77})

In [14]:
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 10565
  Batch size = 32


In [15]:
get_metrics(pred)

precision 0.8870807480153264
recall 0.8874585896829152
fscore_weighted 0.8872615700183797
roc_auc 0.9391643004010369
              precision    recall  f1-score   support

           0       0.92      0.93      0.92      7839
           1       0.78      0.78      0.78      2726

    accuracy                           0.89     10565
   macro avg       0.85      0.85      0.85     10565
weighted avg       0.89      0.89      0.89     10565

   threshold    f1  prec   rec   acc
4        0.4  0.89  0.75  0.82  0.88
5        0.5  0.89  0.78  0.78  0.89
6        0.6  0.88  0.82  0.72  0.89
7        0.7  0.88  0.85  0.66  0.88
3        0.3  0.87  0.70  0.86  0.87


  _warn_prf(average, modifier, msg_start, len(result))


0.8872615700183797

In [19]:
while 1==1:
  i=0

KeyboardInterrupt: ignored

In [81]:
trained_model = BertForSequenceClassification.from_pretrained("../model/2/10000/", local_files_only=True)

loading configuration file ./unsafe/FINAL_VERS/checkpoint-10000/config.json
Model config BertConfig {
  "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache"

In [82]:
trained_tokenizer = BertTokenizer.from_pretrained("../model/2/10000/", local_files_only=True)

Didn't find file ./unsafe/FINAL_VERS/checkpoint-10000/added_tokens.json. We won't load it.
Didn't find file ./unsafe/FINAL_VERS/checkpoint-10000/tokenizer.json. We won't load it.
loading file ./unsafe/FINAL_VERS/checkpoint-10000/vocab.txt
loading file None
loading file ./unsafe/FINAL_VERS/checkpoint-10000/special_tokens_map.json
loading file ./unsafe/FINAL_VERS/checkpoint-10000/tokenizer_config.json
loading file None


In [95]:
encoded_input = trained_tokenizer("Боже мой, как же неприятно было в Афганистане. Ненавижу жару! Аллах Акбар", return_tensors='pt')
output = trained_model(**encoded_input)

In [96]:
output.get('logits').argmax(axis=1)

tensor([1])

In [97]:
output.get('logits').softmax(1)

tensor([[2.4403e-04, 9.9976e-01]], grad_fn=<SoftmaxBackward0>)

In [98]:
output

SequenceClassifierOutput([('logits',
                           tensor([[-4.3503,  3.9677]], grad_fn=<AddmmBackward0>))])

In [18]:
!zip -r /content/file.zip /content/unsafe/

  adding: content/unsafe/ (stored 0%)
  adding: content/unsafe/FINAL_VERS/ (stored 0%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/ (stored 0%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/special_tokens_map.json (deflated 40%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/optimizer.pt (deflated 29%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/tokenizer_config.json (deflated 38%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/pytorch_model.bin (deflated 8%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/scheduler.pt (deflated 49%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/training_args.bin (deflated 49%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/vocab.txt (deflated 65%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/config.json (deflated 53%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/trainer_state.json (deflated 53%)
  adding: content/unsafe/FINAL_VERS/checkpoint-2500/rng_state.pth (deflated 27%)
  adding: conten