**BERT**

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers
import warnings
warnings.filterwarnings('ignore')

In [12]:
# DistilBERT: load the tokenizer and the encoder from the same pretrained checkpoint
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = transformers.DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = transformers.DistilBertModel.from_pretrained(MODEL_NAME)

In [32]:
# Sanity check: tokenize one sentence and look at the word pieces it is split into.
text = "Replace NVIDIA DGX A100 by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# Inference only: nothing is back-propagated here, so skip autograd bookkeeping
# for the forward pass instead of building a graph that is never used.
with torch.no_grad():
    output = model(**encoded_input)

# Row 0 of the batch is the only sentence; show it as tokens, then decoded back to text.
first_sentence_ids = encoded_input['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(first_sentence_ids))
tokenizer.decode(first_sentence_ids)

['[CLS]', 'replace', 'n', '##vid', '##ia', 'd', '##g', '##x', 'a1', '##00', 'by', 'any', 'text', 'you', "'", 'd', 'like', '.', '[SEP]']


"[CLS] replace nvidia dgx a100 by any text you'd like. [SEP]"

**DATASET**

In [2]:
# SST-2 training split, tab-separated with no header row:
# column 0 is the sentence that gets tokenized later, column 1 is the 0/1 label.
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, delimiter='\t', header=None)

# Work on the first 2000 rows only, and show how the two classes are balanced.
batch_1 = df.iloc[:2000]
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [4]:
def encode_sentence(sentence):
    """Return the tokenizer's ids for one sentence, special tokens included."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# One list of token ids per sentence (lists have different lengths at this point).
tokenized = batch_1[0].apply(encode_sentence)

In [7]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with 0 so all rows share the same length and stack into a 2-D array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])
padded.shape

(2000, 59)

In [8]:
# 1 where a real token id is present, 0 on the zero padding added above,
# so the padded positions can be masked out in the forward pass.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

**Embeddings**

In [18]:
# Convert the padded ids and the mask from NumPy arrays to torch tensors.
attention_mask = torch.tensor(attention_mask)
input_ids = torch.tensor(padded)

# Single forward pass over the whole batch; gradients are not needed for feature extraction.
with torch.no_grad():
    embeddings = model(input_ids=input_ids, attention_mask=attention_mask)

In [19]:
# Keep only position 0 of every sequence (the [CLS] slot) from the final hidden layer,
# then hand it to scikit-learn as a NumPy array of shape (n_sentences, hidden_size).
cls_hidden_states = embeddings.last_hidden_state[:, 0, :]
features = cls_hidden_states.numpy()

In [20]:
# Column 1 of the batch holds the 0/1 target, one entry per sentence in `features`.
labels = batch_1.loc[:, 1]

**Classifier**

In [21]:
# Hold out part of the data for the final evaluation (scikit-learn's default split sizes).
# A fixed random_state pins the shuffle, so the split -- and every score computed
# from it -- is the same on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [23]:
# Search 50 evenly spaced values of C (inverse regularisation strength) between 1e-4 and 100,
# scoring each with GridSearchCV's default cross-validation on the training split only.
# NOTE(review): warnings are silenced globally at the top of the notebook, so any solver
# convergence warning from the default LogisticRegression settings would be hidden -- confirm.
parameters = {'C': np.linspace(0.0001, 100, 50)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

# best_score_ is the cross-validated score of the winning C, not the test-set score.
print('best parameters: ', grid_search.best_params_)
print('best score: ', grid_search.best_score_)

best parameters:  {'C': 2.040914285714286}
best scrores:  0.828


In [24]:
# Train the final classifier on the whole training split with the C chosen by the grid search.
best_c = grid_search.best_params_['C']
clf = LogisticRegression(C=best_c)
clf.fit(train_features, train_labels)

In [25]:
# Mean accuracy of the tuned classifier on the held-out test split.
test_accuracy = clf.score(test_features, test_labels)
test_accuracy

0.846