In [1]:
%%capture
!pip install transformers

In [2]:
import transformers
from transformers import BertModel, AutoTokenizer, BertTokenizer, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup, DistilBertTokenizer, DistilBertModel
import torch.nn.functional as F

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader


# Подготовка данных (то, что было по большей части в тетрадке)

In [3]:
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

Downloading...
From: https://drive.google.com/uc?id=1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
To: /content/apps.csv
100% 134k/134k [00:00<00:00, 61.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
To: /content/reviews.csv
100% 7.17M/7.17M [00:00<00:00, 264MB/s]


In [4]:
df = pd.read_csv("reviews.csv")

In [5]:
def to_sentiment(rating):
  rating = int(rating)
  if rating <= 2:
    return 0
  elif rating == 3:
    return 1
  else: 
    return 2

df['sentiment'] = df.score.apply(to_sentiment)

In [6]:
RANDOM_SEED = 1
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

## Предобработка данных



я решила взять модель, основанную на DistilBert и дообученную на задаче классификации эмоций. DistilBert, потому что он быстрее, и на классификации эмоций, потому что задача коррелирует с простым сантимент анализом, может он будет работать лучше.

In [7]:
PRE_TRAINED_MODEL_NAME = 'bhadresh-savani/distilbert-base-uncased-emotion'


Загрузим предобученный [BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer):

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [9]:
train_encodings = tokenizer(df_train.content.tolist()[:5000], truncation=True, padding=True) #НЕ ЗАБЫТЬ УБРАТЬ ОГРАНИЧЕНИЕ
val_encodings = tokenizer(df_val.content.tolist()[:5000], truncation=True, padding=True)
test_encodings = tokenizer(df_test.content.tolist()[:5000], truncation=True, padding=True)

In [10]:
class GPReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return len(self.labels)

train_dataset = GPReviewDataset(train_encodings, df_train.sentiment.tolist()[:5000])
val_dataset = GPReviewDataset(val_encodings, df_val.sentiment.tolist()[:5000])
test_dataset = GPReviewDataset(test_encodings, df_test.sentiment.tolist()[:5000])

## Метрики и параметры для trainer


In [11]:
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

#Модель для задачи 1 - выходы пуллер-слоя.


Я выбрала DistilBert, потому что он быстрый. Однако он не делает пуллинг слой в своем классическом варианте и не возвращает pooled_output. Поэтому я решила сделать типа свой пуллинг слой и конкатенировать выходы с последних 3 слоев (почему-то мне казалось, что это то, что описывают авторы классического берта, но нигде точной инфы не нашла).

In [14]:
class SentimentClassifierPooledOutput(nn.Module):

  def __init__(self, ):
    super().__init__()
    self.distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(3*self.distilbert.config.hidden_size, 3)
  
  def forward(self, input_ids, attention_mask, labels):
    _, hidden_states = self.distilbert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False, 
      output_hidden_states=True)

    pooled_output = torch.stack(hidden_states[-3:], dim=0) #stack last 3 layers, layers*batch*maxlen*embdim
    pooled_output = pooled_output[:, :, 0, :]  #take first token for CLS, layers*batch*emb_dim
    pooled_output = torch.cat(tuple([layer for layer in pooled_output]), dim=-1) #concat by layers batch*emb_dim*3
    
    logits = self.out(self.drop(pooled_output))  #*batch*num_classes(3)

    loss = None
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, 3), labels.view(-1))
    
    return loss, logits
            

In [15]:
model1 = SentimentClassifierPooledOutput()
model1 = model1.to(device)

Some weights of the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion were not used when initializing DistilBertModel: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
trainer1 = Trainer(
    model=model1,                      
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset,           
    compute_metrics = compute_metrics    
)

trainer1.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 626


Step,Training Loss
10,1.1768
20,1.1719
30,1.1337
40,1.0991
50,1.0854
60,1.0719
70,1.0426
80,0.9967
90,0.9901
100,0.9425


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=626, training_loss=0.7977215059268208, metrics={'train_runtime': 451.0348, 'train_samples_per_second': 22.171, 'train_steps_per_second': 1.388, 'total_flos': 0.0, 'train_loss': 0.7977215059268208, 'epoch': 2.0})

In [17]:
trainer1.evaluate()
trainer1.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


{'epoch': 2.0,
 'test_accuracy': 0.6952380952380952,
 'test_f1': 0.6780346694077214,
 'test_loss': 0.6999900937080383,
 'test_precision': 0.6795332176488694,
 'test_recall': 0.6839459547770672,
 'test_runtime': 13.9555,
 'test_samples_per_second': 112.859,
 'test_steps_per_second': 3.583}

# Модель для задачи 2 - выходы с пуллер слоя + CLS токен последнего слоя

In [18]:
class SentimentClassifierPooledOutput_LastCLS(nn.Module):

  def __init__(self, ):
    super().__init__()
    self.distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(4*self.distilbert.config.hidden_size, 3)
  
  def forward(self, input_ids, attention_mask, labels):
    _, hidden_states = self.distilbert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False, 
      output_hidden_states=True)

    pooled_output = torch.stack(hidden_states[-3:], dim=0) #stack last 3 layers, layers*batch*maxlen*embdim
    pooled_output = pooled_output[:, :, 0, :]  #take first token for CLS, layers*batch*emb_dim
    pooled_output = torch.cat(tuple([layer for layer in pooled_output]), dim=-1) #concat by layers batch*3emb_dim

    last_cls = hidden_states[-1][:, 0, :] #batch(*maxlen)*embdim
    concat_output = torch.cat(tuple([pooled_output, last_cls]), dim=-1) #batch*4embdim
    
    logits = self.out(self.drop(concat_output))  #*batch*num_classes(3)

    loss = None
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, 3), labels.view(-1))
    
    return loss, logits
            

In [19]:
model2 = SentimentClassifierPooledOutput_LastCLS()
model2 = model2.to(device)

loading configuration file https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/690674b44bd5b1a7ef81fea02641d3b53827649f92ae54381924832f1edefaac.49a3ba1a12c5b0c12c1f5d39ce0fc262dc3810bdc41be4d875eaf3181375d3f3
Model config DistilBertConfig {
  "_name_or_path": "./",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_e

In [20]:
trainer2 = Trainer(
    model=model2,                      
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset,           
    compute_metrics = compute_metrics    
)

trainer2.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 626


Step,Training Loss
10,1.1381
20,1.1669
30,1.146
40,1.072
50,1.0916
60,1.0483
70,1.0583
80,0.9763
90,0.9602
100,0.9516


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=626, training_loss=0.7981916163295222, metrics={'train_runtime': 457.894, 'train_samples_per_second': 21.839, 'train_steps_per_second': 1.367, 'total_flos': 0.0, 'train_loss': 0.7981916163295222, 'epoch': 2.0})

In [21]:
trainer2.evaluate()
trainer2.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


{'epoch': 2.0,
 'test_accuracy': 0.692063492063492,
 'test_f1': 0.6765370022691427,
 'test_loss': 0.6996183395385742,
 'test_precision': 0.6771994505040393,
 'test_recall': 0.6814415493660778,
 'test_runtime': 14.8169,
 'test_samples_per_second': 106.297,
 'test_steps_per_second': 3.375}

# Модель для задачи 3 - готовая модель

Так как я использовала модель, основанную на DistilBert, то логично будет взять готовую модель DistilBertForSequenceClassification.

In [25]:
from transformers import DistilBertForSequenceClassification, DistilBertConfig

In [None]:
config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = 3


In [31]:
model3 = DistilBertForSequenceClassification(config)
model3 = model3.to(device)

In [28]:
trainer3 = Trainer(
    model=model3,                      
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset,           
    compute_metrics = compute_metrics    
)

trainer3.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 626


Step,Training Loss
10,1.703
20,1.6213
30,1.5475
40,1.357
50,1.2714
60,1.1998
70,1.1609
80,1.1934
90,1.1482
100,1.1405


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=626, training_loss=1.0161658567361558, metrics={'train_runtime': 453.3294, 'train_samples_per_second': 22.059, 'train_steps_per_second': 1.381, 'total_flos': 1252320204960000.0, 'train_loss': 1.0161658567361558, 'epoch': 2.0})

In [29]:
trainer3.evaluate()
trainer3.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


{'epoch': 2.0,
 'test_accuracy': 0.6266666666666667,
 'test_f1': 0.5991607478548776,
 'test_loss': 0.8236094117164612,
 'test_precision': 0.6051334134804541,
 'test_recall': 0.6129874881251777,
 'test_runtime': 14.7417,
 'test_samples_per_second': 106.84,
 'test_steps_per_second': 3.392}

# Модель для задачи 4 - агрегированные cls-токены с нескольких слоев

я делала похожее в 1 модели, но только конкатенировала. попробуем здесь взять сумму по последним трем слоям.

In [32]:
class SentimentClassifierAggregateCLS(nn.Module):

  def __init__(self, ):
    super().__init__()
    self.distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.distilbert.config.hidden_size, 3)
  
  def forward(self, input_ids, attention_mask, labels):
    _, hidden_states = self.distilbert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False, 
      output_hidden_states=True)

    pooled_output = torch.stack(hidden_states[-3:], dim=0) #stack last 3 layers, layers*batch*maxlen*embdim
    pooled_output = pooled_output[:, :, 0, :]  #take first token for CLS, layers*batch*emb_dim
    pooled_output = torch.sum(pooled_output, dim=0) #take sum by layer dim, batch*emb_dim

    logits = self.out(self.drop(pooled_output))  #*batch*num_classes(3)

    loss = None
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, 3), labels.view(-1))
    
    return loss, logits

In [33]:
model4 = SentimentClassifierAggregateCLS()
model4 = model4.to(device)

loading configuration file https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/690674b44bd5b1a7ef81fea02641d3b53827649f92ae54381924832f1edefaac.49a3ba1a12c5b0c12c1f5d39ce0fc262dc3810bdc41be4d875eaf3181375d3f3
Model config DistilBertConfig {
  "_name_or_path": "./",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_e

In [34]:
trainer4 = Trainer(
    model=model4,                      
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset,           
    compute_metrics = compute_metrics    
)

trainer4.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 626


Step,Training Loss
10,1.34
20,1.3983
30,1.142
40,1.1778
50,1.103
60,1.0491
70,1.0913
80,1.0973
90,1.0841
100,0.9444


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=626, training_loss=0.8440329975213486, metrics={'train_runtime': 456.0305, 'train_samples_per_second': 21.928, 'train_steps_per_second': 1.373, 'total_flos': 0.0, 'train_loss': 0.8440329975213486, 'epoch': 2.0})

In [35]:
trainer4.evaluate()
trainer4.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


***** Running Evaluation *****
  Num examples = 1575
  Batch size = 32


{'epoch': 2.0,
 'test_accuracy': 0.7047619047619048,
 'test_f1': 0.6836601009862379,
 'test_loss': 0.7006795406341553,
 'test_precision': 0.690450687642024,
 'test_recall': 0.6922615734043087,
 'test_runtime': 14.7543,
 'test_samples_per_second': 106.748,
 'test_steps_per_second': 3.389}

# Задача 5 - как работают модели на выбранных отзывах

In [52]:
review_neg = 'The absolute worst VPN. Riddled with ads, \
requires money for everything, starts itself on its own,\
the app opens when you turn the VPN off, makes your internet slower, \
sometimes completely kills your connection. Don\'t bother.'

review_pos = 'It\'s very good and useful, I use the VPN, I use the free version, \
it\'s very good and the speed is good and reliable to visit and work on websites, \
YouTube videos, Facebook and many other apps and websites (as long as one doesn\'t \
mess with the app using it for torrenting or gaming). Thank you very mu'

review_neutr = 'Very good application can be used to do little things like \
browsing and yt video (480-720) but there was a problem of connecting \
to indian server if it will connect the Speed may be awesome .'

review_list = [review_neg, review_neutr, review_pos]
label_list = [0, 1, 2]

In [53]:
encodings = tokenizer(review_list, truncation=True, padding=True)
dataset = GPReviewDataset(encodings, label_list)


In [60]:
id2label = {1: 'neutral', 2: 'positive', 0: 'negative'}

In [61]:
def print_pretty(preds, label_list):
    pred_values = [id2label[pred] for pred in preds]
    label_values = [id2label[pred] for pred in label_list]
    print('Predictions:\n1. ', pred_values[0],
          '\n2. ', pred_values[1],
          '\n3. ', pred_values[2],       
          )
    print('Ground True:\n1. ', label_values[0],
          '\n2. ', label_values[1],
          '\n3. ', label_values[2],       
          )

## модель 1

In [64]:
predictions = trainer1.predict(dataset)
preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 3
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
print_pretty(preds, label_list)

Predictions:
1.  negative 
2.  positive 
3.  positive
Ground True:
1.  negative 
2.  neutral 
3.  positive


## модель 2

In [66]:
predictions = trainer2.predict(dataset)
preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 3
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
print_pretty(preds, label_list)

Predictions:
1.  negative 
2.  positive 
3.  positive
Ground True:
1.  negative 
2.  neutral 
3.  positive


## модель 3

In [68]:
predictions = trainer3.predict(dataset)
preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 3
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
print_pretty(preds, label_list)

Predictions:
1.  negative 
2.  positive 
3.  positive
Ground True:
1.  negative 
2.  neutral 
3.  positive


## модель 4

In [70]:
predictions = trainer4.predict(dataset)
preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 3
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
print_pretty(preds, label_list)

Predictions:
1.  negative 
2.  positive 
3.  positive
Ground True:
1.  negative 
2.  neutral 
3.  positive


Ни одна из моделей не справляется с нейтральным отзывом и относит его к положительным. Стоит отметить, что в нем содержится несколько позитивных слов, что, возможно, путает модель