In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/readme.txt
/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/train.csv
/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/test.csv
/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/yelp_review_polarity_csv/readme.txt
/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/yelp_review_polarity_csv/train.csv
/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/yelp_review_polarity_csv/test.csv


In [2]:
# importing the necessary libraries
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses

import transformers
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

In [3]:
# loading the dataset
column_names = ['Sentiment', 'Reviews']
df_train = pd.read_csv('/kaggle/input/yelp-review-polarity/yelp_review_polarity_csv/train.csv', names=column_names)
df_train['Sentiment'].replace({1: 0, 2: 1}, inplace=True)
print(df_train.head())

   Sentiment                                            Reviews
0          0  Unfortunately, the frustration of being Dr. Go...
1          1  Been going to Dr. Goldberg for over 10 years. ...
2          0  I don't know what Dr. Goldberg was like before...
3          0  I'm writing this review to give you a heads up...
4          1  All the food is great here. But the best thing...


In [4]:
# downsampling the dataset due to computational constraints
df = df_train[:10000]
x = list(df['Reviews'])
y = list(df['Sentiment'])

In [5]:
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 20

# initializing tokenizer
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# displaying an example of the tokenized data
review = x[0]
inputs = tkzr(review, max_length=MAX_LEN, truncation=True, padding=True)
print(f'review: \'{review}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

review: 'Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.'
input ids: [101, 6854, 1010, 1996, 9135, 1997, 2108, 2852, 1012, 18522, 1005, 1055, 5776, 2003, 1037, 9377, 1997, 1996, 3325, 102]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [6]:
# encoding and tokenizing the reviews
def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
    return tkzr(x, max_length=max_len, truncation=trucation, padding=padding)
    
encodings = construct_encodings(x, tkzr, max_len=MAX_LEN)   

In [7]:
# converting those tokenized encoding into dataset objects
def construct_tfdataset(encodings, y=None):
    if y:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples after training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))
    
tfdataset = construct_tfdataset(encodings, y)

2022-01-06 21:31:56.811691: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-06 21:31:56.812720: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-06 21:31:56.813382: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-06 21:31:56.815367: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [8]:
# splitting the dataset into train and test sets
TEST_SPLIT = 0.2
BATCH_SIZE = 500

train_size = int(len(x) * (1-TEST_SPLIT))

tfdataset = tfdataset.shuffle(len(x))
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

tfdataset_train = tfdataset_train.batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

In [9]:
# training the dataset
N_EPOCHS = 5

model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=3e-5)
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

model.fit(tfdataset_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-01-06 21:32:14.603296: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

Epoch 1/5


2022-01-06 21:32:25.318726: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5c3562b750>

In [10]:
# evaluating
benchmarks = model.evaluate(tfdataset_test, return_dict=True, batch_size=BATCH_SIZE)
print(benchmarks)

{'loss': 0.2845988869667053, 'accuracy': 0.8815000057220459}
