# Load libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Install a global "ignore" filter: every warning raised after this line is suppressed
# (presumably to keep cell outputs free of library notices).
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Import dataset

In [2]:
# SST-2 training file: tab-separated with no header row, so columns get integer names
# (column 0 is tokenized as text later in the notebook, column 1 is used as the label).
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

Для ускорения вычислений возьмём только первые 2000 строк

In [4]:
# Work on the first 2000 rows only (positional slice) to keep the model pass fast.
batch_1 = df.iloc[:2000]

In [5]:
# Class balance check: count how many rows carry each value of the label column (column 1).
label_column = batch_1[1]
label_column.value_counts()

1    1041
0     959
Name: 1, dtype: int64

Классы практически сбалансированы (1041 против 959)

# Loading the Pre-trained BERT model

In [6]:
# Checkpoint name plus the transformers classes used below to load its model and tokenizer.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

In [7]:
# Build the tokenizer and the model from the checkpoint named by `pretrained_weights`.
# The recorded output of this cell shows the checkpoint files being downloaded.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Prepare dataset

## Tokenization

In [10]:
def encode_sentence(sentence):
    """Convert one sentence into a list of token ids, with the tokenizer's special tokens added."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# Column 0 holds the raw sentences; the result is a Series with one token-id list per row.
tokenized = batch_1[0].apply(encode_sentence)

## Padding

Найдём максимальную длину токенизированного списка и дополним нулями более короткие списки до этой длины

In [12]:
# Length of the longest token-id list; `default=0` keeps the empty-input result
# identical to the previous hand-written loop (which started from 0).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every list with zeros up to `max_len` so the rows stack into one
# rectangular array (one row per sentence).
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [16]:
# `padded` is already a NumPy array, so its shape can be read directly: (rows, padded length).
padded.shape

(2000, 59)

## Masking

In [17]:
# 1 where the padded array holds a non-zero id, 0 where it holds the zero used for padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

# Model #1: DistilBERT feature extraction

In [18]:
# Wrap the padded ids and the mask in tensors for the model call.
input_ids = torch.tensor(padded)
# NOTE(review): this rebinds `attention_mask` from the NumPy array built above to a tensor.
attention_mask = torch.tensor(attention_mask)

# Gradient tracking is switched off for this forward pass; all rows are sent through
# the model in a single call and the output is only read afterwards.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [23]:
# Element 0 of the model output is indexed with three axes; keep index 0 along the
# second axis for every row (presumably the leading special token - TODO confirm)
# and hand the result to NumPy for scikit-learn.
hidden_states = last_hidden_states[0]
features = hidden_states[:, 0, :].numpy()

In [24]:
# Column 1 of the sampled frame is the classification target.
labels = batch_1.loc[:, 1]

In [25]:
# Split features and labels into train and test parts using train_test_split's
# default split sizes. The fixed random_state pins the shuffle, so the split and
# every score computed from it are the same on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

# Model #2: logistic regression classifier

In [26]:
# Fit a logistic-regression classifier with default hyperparameters on the training split.
lr_clf = LogisticRegression().fit(train_features, train_labels)
# Display the fitted estimator, as the cell did before.
lr_clf

LogisticRegression()

In [27]:
# Score the fitted classifier on the held-out test split and display the value.
test_accuracy = lr_clf.score(test_features, test_labels)
test_accuracy

0.852

## Evaluating model #2

In [28]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: a DummyClassifier with default settings.
# NOTE(review): the baseline is cross-validated on the training split, whereas lr_clf
# is scored on the held-out test split, so the two numbers are not measured the same way.
clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)

# Report the mean fold score and twice the standard deviation across folds.
mean_score = scores.mean()
score_spread = scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (mean_score, score_spread))

Dummy classifier score: 0.525 (+/- 0.00)


# Conclusion

Наша модель (accuracy ≈ 0.85 на тестовой выборке) работает заметно лучше dummy classifier (≈ 0.53 при кросс-валидации на обучающей выборке)