In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 3.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 71.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.2


In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer


In [None]:
# Read the annotated documents from the Excel sheet and preview the first rows.
annotation_path = '/content/text_classification_annotation.xlsx'
df = pd.read_excel(annotation_path)
df.head()

Unnamed: 0,document,score
0,This appeal is directed against the final judg...,0
1,Respondent No. 1-the Contractee Company was aw...,0
2,"On 21.09.2012, the Contractee Company submitte...",0
3,"Subsequently, on 24.10.2012, the Contractee Co...",0
4,"The Contractee-Company, vide letter dated 12.0...",0


In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313 entries, 0 to 312
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  313 non-null    object
 1   score     313 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.0+ KB


In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Make sure the score column holds plain integers; it is used as an array index further down.
df = df.astype({'score': int})

In [None]:
# Class balance check: number of rows for each distinct score value.
df['score'].value_counts()

 3    151
 2     70
 0     56
 1     34
-1      2
Name: score, dtype: int64

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint (downloaded on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenise the first document on its own to inspect what the tokenizer produces
# with the padding/truncation settings used for the whole corpus.
first_document = df['document'].iloc[0]
token = tokenizer.encode_plus(
    first_document,
    add_special_tokens=True,
    max_length=500,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

In [None]:
# Display the token ids of the sample document, padded with zeros up to max_length.
token.input_ids

<tf.Tensor: shape=(1, 500), dtype=int32, numpy=
array([[  101,  1188,  5767,  1110,  2002,  1222,  1103,  1509,  9228,
         1105,  1546,  5422,  1367,   119,  5187,   119,  1410,  2085,
         1118,  1103,  1693,  2031,  1104, 12247,  1120,  7756, 12328,
         1107,   140,   119,   150,   119,   153,   119,  1302,   119,
         2588,  1104,  1387, 13949,  3560,  1423,  5274,  1104,  1103,
         1693,  2031,  2148,  1103, 10077,  5770,  1118,  1103,  6297,
         3452,  1302,   119,   122,   118,  1881,  1111,  5516,  1104,
         1126,   170, 26281,  2875, 11412,  1111,  6021,  1104,  1103,
         7287,  1206,  1103, 12647,  7772,  2227,   118,  1881, 20979,
        17337,  1753,   159,  9866,  8971, 15278, 11612,   156, 27370,
        17308,  2240,  3663, 12507, 14265,   131,  1857,   119,  5507,
          119,  5004,  1105,  6297,  3452,  1302,   119,   122,   118,
         1881,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [None]:
# Pre-allocate one row per document and one column per token position
# (500 matches the max_length passed to the tokenizer).
# np.zeros defaults to float64, which is the dtype the tf.data pipeline below carries.
n_documents = len(df)
X_input_ids = np.zeros((n_documents, 500))
X_attn_masks = np.zeros((n_documents, 500))

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenise every document in ``df`` and write the result into ``ids`` and ``masks``.

    Args:
        df: DataFrame with a ``document`` column holding the raw text.
        ids: Pre-allocated 2-D array with one row per document; filled in place
            with the token ids.
        masks: Pre-allocated 2-D array with the same shape as ``ids``; filled in
            place with the attention mask.
        tokenizer: Tokenizer object exposing ``encode_plus``.

    Returns:
        The same ``ids`` and ``masks`` arrays, now populated.
    """
    # Pad/truncate to the width of the pre-allocated arrays so the two can never disagree.
    max_length = ids.shape[1]
    documents = df['document']
    # enumerate() hides the length from tqdm, so pass it explicitly to get a real progress bar.
    for i, text in tqdm(enumerate(documents), total=len(documents)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # encode_plus returns a batch of one; assigning it to a row broadcasts it.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# Tokenise the whole corpus; the pre-allocated arrays are filled in place and returned.
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [None]:
# One column per score value. The scores run from -1 to 3 (see the value_counts output above),
# i.e. five classes; with only four columns the -1 rows would land on the same column as score 3.
num_classes = int(df['score'].max() - df['score'].min() + 1)
labels = np.zeros((len(df), num_classes))
labels.shape

(313, 4)

In [None]:
# One-hot encoded target tensor.
# Shift the scores so the smallest one maps to column 0: indexing with the raw values lets
# -1 wrap around to the last column and silently merge that class with another one.
score_offset = df['score'].min()
num_classes = int(df['score'].max() - score_offset + 1)
labels = np.zeros((len(df), num_classes))
labels[np.arange(len(df)), df['score'].values - score_offset] = 1

In [None]:
# Wrap the three arrays in a tf.data pipeline; each element is one
# (input_ids, attention_mask, label) triple, which makes batching straightforward.
feature_and_label_arrays = (X_input_ids, X_attn_masks, labels)
dataset = tf.data.Dataset.from_tensor_slices(feature_and_label_arrays)
dataset.take(1)  # element spec of a single sample

<TakeDataset element_spec=(TensorSpec(shape=(500,), dtype=tf.float64, name=None), TensorSpec(shape=(500,), dtype=tf.float64, name=None), TensorSpec(shape=(4,), dtype=tf.float64, name=None))>

In [None]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one dataset element as a ``(features, labels)`` pair.

    The features are returned as a dict keyed by ``'input_ids'`` and
    ``'attention_mask'``; ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# Convert each element to the ({'input_ids', 'attention_mask'}, labels) layout produced by the map function.
dataset = dataset.map(SentimentDatasetMapFunction)

In [None]:
# Inspect the element spec after mapping.
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(500,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(500,), dtype=tf.float64, name=None)}, TensorSpec(shape=(4,), dtype=tf.float64, name=None))>

In [None]:
# Shuffle once and keep that order fixed: the train/validation split further down is done with
# take()/skip(), so reshuffling on every epoch would move samples between the two sets.
# drop_remainder=True discards the final partial batch so every batch has exactly 16 rows.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(16, drop_remainder=True)

In [None]:
# Inspect the element spec after batching.
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 500), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 500), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 4), dtype=tf.float64, name=None))>

In [None]:
# Split at batch level: the data yields len(df) // 16 complete batches of 16 rows,
# and a fraction p of those batches is used for training.
p = 0.8
n_batches = len(df) // 16
train_size = int(n_batches * p)

In [None]:
# Number of batches reserved for training.
train_size

15

In [None]:
# The first train_size batches are used for training; every batch after them is held out for validation.
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

In [None]:
from transformers import TFBertModel

In [None]:
# BERT base (cased) encoder initialised from the pretrained checkpoint; the classification
# layers are added on top of it in the next cell.
model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Take the input width and the number of output units from the prepared arrays, so the model
# always matches the data it is trained on (hard-coded sizes here led to a shape mismatch in fit).
seq_len = X_input_ids.shape[1]
num_classes = labels.shape[1]

# Two input layers, one for the token ids and one for the attention mask; the layer names
# must match the dict keys produced by the dataset map function.
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# Output index 0 is the per-token activation (3D), index 1 the pooled output (2D) used here.
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
# Softmax turns the final activations into one probability per class.
output_layer = tf.keras.layers.Dense(num_classes, activation='softmax', name='output_layer')(intermediate_layer)

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [None]:
# Training configuration: categorical cross-entropy against the one-hot targets, categorical
# accuracy reported under the name 'accuracy', and Adam with a small learning rate.
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)

In [None]:
# Attach the optimizer, loss and metric defined above to the model.
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
# Fine-tune for two epochs, evaluating on the held-out batches after each epoch;
# the returned History object is kept in `hist`.
hist = sentiment_model.fit(train_dataset, validation_data=val_dataset, epochs=2)

Epoch 1/2


ValueError: ignored

In [None]:
# Persist the model under the relative path 'sentiment_model' so it can be reloaded for inference.
sentiment_model.save('sentiment_model')

In [None]:
# Reload the saved model from disk, replacing the in-memory one.
sentiment_model = tf.keras.models.load_model('sentiment_model')

# Same pretrained tokenizer as used when preparing the training data.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer, max_length=500):
    """Tokenise a single text into the feature dict the model is called with.

    Args:
        input_text: Raw text to classify.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_length: Length the text is padded/truncated to. Defaults to 500, the
            length used when the training data was tokenised; it has to equal the
            input width of the model that will consume the result.

    Returns:
        Dict with keys ``'input_ids'`` and ``'attention_mask'``, each a float64
        tensor holding a batch of one sequence.
    """
    token = tokenizer.encode_plus(
        input_text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    # NOTE(review): the float64 cast mirrors the float64 arrays the training pipeline feeds
    # the model — confirm the reloaded model still expects that dtype before changing it.
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=('Negative', 'A bit negative', 'Neutral', 'A bit positive', 'Positive')):
    """Return the class name with the highest predicted probability.

    Args:
        model: Object exposing ``predict``; called once with ``processed_data``.
        processed_data: Model input for a single example (e.g. the dict returned
            by ``prepare_data``).
        classes: Sequence of class names indexed by output position. The default
            is an immutable tuple so it cannot be altered between calls.

    Returns:
        The entry of ``classes`` at the position of the largest probability.
    """
    # predict() returns one row per input example; only the first row is used.
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [None]:
# Read a text from the user, turn it into model features and print the predicted class.
input_text = input('Enter movie review here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data)
print(f"Predicted Sentiment: {result}")