In [0]:
!pip install transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [0]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
df = df[:3000]

In [0]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
tokenized = df[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [0]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [0]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3000, 66)

In [0]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [0]:
features = last_hidden_states[0].numpy()[:,0,:]

In [0]:
labels = np.array(df[1])


In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [0]:
type(train_labels)

numpy.ndarray

In [0]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.824888888888889


In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
lr_clf.score(test_features, test_labels)

0.828

In [0]:
features[1]

In [0]:
lr_clf.predict([features[0]])

array([1])

In [0]:
def sentences_to_features(x): # x is type of pandas series and sentences are string
  tokens = x.apply(lambda y: tokenizer.encode(y, add_special_tokens=True))

  max_len = 0
  for i in tokens.values:
    if len(i) > max_len:
      max_len = len(i)

  with_pad = np.array([i + [0]*(max_len-len(i)) for i in tokens.values])

  mask = np.where(with_pad != 0, 1, 0)

  with_pad = torch.tensor(with_pad)  
  mask = torch.tensor(mask)

  with torch.no_grad():
    full_bert_output = model(with_pad, attention_mask=mask)  
    return full_bert_output[0].numpy()[:,0,:]
  

In [0]:
sentences = ['hello how are you?', 'that would be really great', 'I\'m so sad', 'it is snowing, that\'s fucking great']
f = sentences_to_features(pd.Series(sentences))
lr_clf.predict(f)

array([1, 1, 0, 1])

In [0]:
########################## NN fine-tuning ########################

In [0]:
from keras import models
from keras import layers
from keras.utils import to_categorical
from keras.layers import Dense,Activation,Dropout

In [0]:
np.shape(train_features)

(2250, 768)

In [0]:
network = models.Sequential()
network.add(layers.Dense(1000, activation='tanh', input_shape=(768,)))
network.add(Dropout(0.20))
network.add(layers.Dense(1000, activation='tanh', input_shape=(1000,)))
network.add(Dropout(0.30))
network.add(layers.Dense(500, activation='tanh', input_shape=(1000,)))
network.add(Dropout(0.20))
network.add(layers.Dense(100, activation='tanh', input_shape=(500,)))
network.add(layers.Dense(10, activation='tanh', input_shape=(100,)))
network.add(layers.Dense(1, activation='sigmoid'))
network.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
warnings.filterwarnings('ignore')


In [0]:
network.fit(train_features, train_labels, epochs=1, batch_size=10)
warnings.filterwarnings('ignore')

Epoch 1/1


In [0]:
test_loss, test_acc = network.evaluate(test_features, test_labels)
print('test_acc:', test_acc, 'test_loss', test_loss)

test_acc: 0.8453333336512248 test_loss 0.3817604646682739
