In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
from sklearn.model_selection import train_test_split
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers import InputExample, InputFeatures
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [2]:
# Load the tab-separated review files. They have no header row, so pandas
# labels the columns 0 and 1.
# `on_bad_lines='warn'` (pandas >= 1.3) skips malformed rows and reports them.
# It replaces `error_bad_lines=False`, which was deprecated in pandas 1.3 and
# removed in pandas 2.0 (where it raises a TypeError).
train = pd.read_csv('../input/train-example/movie_review.tsv', delimiter='\t', header=None, on_bad_lines='warn')
test = pd.read_csv('../input/testexample/movie_review_test.tsv', delimiter='\t', header=None, on_bad_lines='warn')

In [3]:
# Preview the first rows of the training frame: column 0 is used as the
# review text and column 1 as the label later in the notebook.
train.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [4]:
# Preview the first rows of the test frame (same two-column layout as `train`).
test.head()

Unnamed: 0,0,1
0,"no movement , no yuks , not much of anything",0
1,"a gob of drivel so sickly sweet , even the eag...",0
2,"gangs of new york is an unapologetic mess , wh...",0
3,"we never really feel involved with the story ,...",0
4,this is one of polanski 's best films,1


In [5]:
# Split each frame into model inputs (column 0) and targets (column 1).
X_train, y_train = train[0], train[1]
X_test, y_test = test[0], test[1]

In [6]:
# Show the test labels (pandas abbreviates the middle of a long Series).
y_test

0       0
1       0
2       0
3       0
4       1
       ..
1816    0
1817    0
1818    0
1819    0
1820    0
Name: 1, Length: 1821, dtype: int64

In [7]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the classification model.
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'

In [8]:
# Load the tokenizer and the TensorFlow sequence-classification model from the
# same checkpoint so that the vocabulary matches the model weights.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [9]:
# Print the layer-by-layer summary and parameter counts of the loaded model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Passed to the tokenizer as `max_length`; with `truncation=True`, longer
# inputs are cut to this many tokens.
max_len = 150

In [11]:
def _encode_and_slice(texts, labels):
    """Tokenize ``texts`` and pair the encodings with ``labels``.

    Returns a ``(encodings, dataset)`` tuple where ``dataset`` is an
    unbatched ``tf.data.Dataset`` of ``(feature_dict, label)`` elements.
    """
    encodings = tokenizer(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding=True,
    )
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), list(labels)))
    return encodings, dataset


train_encodings, train_dataset = _encode_and_slice(X_train.values, y_train.values)
test_encodings, test_dataset = _encode_and_slice(X_test.values, y_test.values)

# Only the training set is shuffled and batched here; the test set stays
# unbatched and is batched where it is evaluated.
train_dataset = train_dataset.shuffle(len(X_train)).batch(200)

In [12]:
# Fine-tune the pretrained classifier on the training set.
# The loss is configured with `from_logits=True` because the model's raw
# outputs are logits (softmax is applied separately at prediction time), and
# the sparse loss/metric variants are used because labels are class indices.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)

# `train_dataset` is already batched, and Keras documents that `batch_size`
# must not be specified for `tf.data.Dataset` inputs, so it is omitted here.
model.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8969be3850>

In [13]:
# Evaluate on the held-out set. The dataset is batched here because
# `test_dataset` is built unbatched. Shuffling is omitted because aggregate
# loss/accuracy do not depend on sample order, and `batch_size` is not passed
# because Keras documents it must not be specified for `tf.data.Dataset` inputs.
model.evaluate(test_dataset.batch(200), return_dict=True)



{'loss': 0.5277782082557678, 'accuracy': 0.901702344417572}

In [14]:
# Three hand-written reviews used as a quick sanity check of the model.
sample_reviews = [
    "One of the worst movies I've ever seen",
    "An absolute delight for the eyes",
    "Mmm I'm neutral about this movie",
]

to_predict_encoding = tokenizer(
    sample_reviews,
    max_length=max_len,
    truncation=True,
    padding=True,
)

# Despite its name, `to_predict_tensor` is a tf.data.Dataset yielding one
# encoded review per batch.
to_predict_tensor = tf.data.Dataset.from_tensor_slices(dict(to_predict_encoding)).batch(1)

preds = model.predict(to_predict_tensor)

In [15]:
# Turn the raw logits into per-class probabilities.
# NOTE: this rebinds `preds` from the model output to a NumPy array, so the
# cell cannot be re-run on its own; re-run the prediction cell first.
preds = tf.nn.softmax(tf.convert_to_tensor(preds.logits), axis=-1).numpy()

# Column 0 holds the negative-class probability, column 1 the positive one.
negative_preds = list(preds[:, 0])
positive_preds = list(preds[:, 1])

Predicted probability of the negative class (index 0) for each example sentence:

In [16]:
# One negative-class probability per example sentence, in input order.
negative_preds

[0.9998956, 3.806e-05, 0.99996173]

Predicted probability of the positive class (index 1) for each example sentence:

In [17]:
# One positive-class probability per example sentence, in input order.
positive_preds

[0.0001043832, 0.999962, 3.8304726e-05]