In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer

2022-12-02 22:24:36.852109: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-12-02 22:24:36.852253: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# USING TPU

# Detect the available hardware and build the matching distribution strategy.
# `strategy` is read later by the model-building and configuration cells.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be located.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is deprecated in favour of the
    # stable `tf.distribute.TPUStrategy`; fall back to the experimental alias
    # only on TensorFlow builds that predate the stable name.
    tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
    if tpu_strategy_cls is None:
        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470


2022-12-02 22:24:42.633112: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-12-02 22:24:42.637015: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-12-02 22:24:42.637055: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-02 22:24:42.637083: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (13a7ba5d3af9): /proc/driver/nvidia/version does not exist
2022-12-02 22:24:42.640800: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

REPLICAS:  8


In [3]:
def BERT_MODEL(CSV,text_col,target_col,EPOCHS, BATCH_SIZE, MAX_LEN, NUM_TARGET_VALUES):
    """Fine-tune multilingual DistilBERT as a text classifier on a CSV file.

    The CSV is read with pandas, split 75/25 into train/validation sets
    (``random_state=0``), tokenized to fixed-length id sequences and fed to a
    DistilBERT encoder topped with a softmax layer.

    Parameters
    ----------
    CSV : str
        Path of the CSV file to load.
    text_col : str
        Name of the column holding the input text.
    target_col : str
        Name of the column holding the class label. Columns whose dtype is not
        ``int64`` are label-encoded to ``0..n_classes-1``.
    EPOCHS : int
        Number of training epochs.
    BATCH_SIZE : int
        Batch size used for both the training and validation datasets.
    MAX_LEN : int
        Token length every example is truncated / padded to.
    NUM_TARGET_VALUES : int
        Width of the softmax output layer, i.e. the number of classes.

    Returns
    -------
    The object returned by ``model.fit`` (the Keras training history).

    Raises
    ------
    ValueError
        If a label id falls outside ``[0, NUM_TARGET_VALUES)``.

    Notes
    -----
    * Relies on the module-level ``strategy`` created by the hardware
      detection cell; the model is built inside ``strategy.scope()``.
    * Side effects: downloads the pretrained tokenizer and weights, writes the
      tokenizer files (including ``vocab.txt``) to the current directory and
      prints the model summary.
    """

    data = pd.read_csv(CSV)

    # sparse_categorical_crossentropy expects integer class ids, so any label
    # column that is not already int64 is mapped to consecutive integers.
    if data[target_col].dtypes != 'int64':
        le = preprocessing.LabelEncoder()
        data[target_col] = le.fit_transform(data[target_col])

    X = data[text_col].values
    y = data[target_col].values

    # Fail early (before downloading weights) when the labels cannot be
    # represented by a softmax of width NUM_TARGET_VALUES.
    if y.size and (y.min() < 0 or y.max() >= NUM_TARGET_VALUES):
        raise ValueError(
            "Labels in column %r span [%s, %s] but NUM_TARGET_VALUES=%s; "
            "label ids must lie in [0, NUM_TARGET_VALUES)."
            % (target_col, y.min(), y.max(), NUM_TARGET_VALUES)
        )

    X_train, X_test, y_train, y_valid = train_test_split(
        X, y,
        test_size=0.25,
        random_state=0,
    )

    def fast_encode(texts, tokenizer, chunk_size=256, maxlen=MAX_LEN):
        """Tokenize `texts` in chunks and return a 2-D array of token ids.

        Every sequence is truncated and padded to `maxlen`, so all rows of the
        returned array have the same length.
        """
        tokenizer.enable_truncation(max_length=maxlen)
        tokenizer.enable_padding(length=maxlen)
        all_ids = []

        # Encode chunk by chunk so progress can be reported by tqdm.
        for i in tqdm(range(0, len(texts), chunk_size)):
            text_chunk = texts[i:i+chunk_size].tolist()
            encs = tokenizer.encode_batch(text_chunk)
            all_ids.extend([enc.ids for enc in encs])

        return np.array(all_ids)

    # IMP DATA FOR CONFIG
    AUTO = tf.data.experimental.AUTOTUNE

    # First load the real tokenizer
    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    # Save the loaded tokenizer locally (this writes vocab.txt to the cwd)
    tokenizer.save_pretrained('.')
    # Reload it with the huggingface tokenizers library
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

    # astype(str) turns non-string cells (e.g. NaN) into text the tokenizer accepts.
    x_train = fast_encode(X_train.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    x_valid = fast_encode(X_test.astype(str), fast_tokenizer, maxlen=MAX_LEN)

    # The training set repeats indefinitely; the epoch length is fixed by
    # `steps_per_epoch` in the fit() call below.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

    def build_model(transformer, max_len, num_outputs):
        """
        Build and compile the classifier: the transformer encoder followed by
        a softmax layer applied to the first token's output vector.
        """
        input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
        sequence_output = transformer(input_word_ids)[0]
        # Position 0 of the sequence output is used as the sentence representation.
        cls_token = sequence_output[:, 0, :]
        out = Dense(num_outputs, activation='softmax')(cls_token)

        model = Model(inputs=input_word_ids, outputs=out)
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model

    with strategy.scope(): #Remove strategy.scope() if not using the TPU and leave the 'transformer_layer' and 'model' with the main function.
        transformer_layer = (
            transformers.TFDistilBertModel
            .from_pretrained('distilbert-base-multilingual-cased')
        )
        model = build_model(transformer_layer, max_len=MAX_LEN, num_outputs=NUM_TARGET_VALUES)
        model.summary()

    # Guard against a training split smaller than one batch, which would
    # otherwise yield steps_per_epoch == 0; the dataset repeats, so a single
    # full batch is always available.
    n_steps = max(1, x_train.shape[0] // BATCH_SIZE)
    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        validation_data=valid_dataset,
        epochs=EPOCHS
    )
    return train_history

In [4]:
# Replace the configurations below before running the code.

NUM_TARGET_VALUES = 3  # Number of unique values in the target column (3 for this dataset)
EPOCHS = 10
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # Scales with the number of replicas reported by the strategy
MAX_LEN = 200  # Token length each text is truncated/padded to; choose it from the longest inputs
CSV = "/kaggle/input/preprocessed-dataset-sentiment-analysis/EcoPreprocessed.csv"  # CSV location of your data
text_col = "review"  # Column name where text is stored
target_col = "division"  # Column name where target is stored

# Keep the returned training history in a variable so it can be inspected
# (e.g. plotted) in later cells instead of only being echoed once.
train_history = BERT_MODEL(CSV, text_col, target_col, EPOCHS, BATCH_SIZE, MAX_LEN, NUM_TARGET_VALUES)
train_history

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

100%|██████████| 12/12 [00:00<00:00, 89.63it/s]
100%|██████████| 4/4 [00:00<00:00, 116.03it/s]


Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/911M [00:00<?, ?B/s]

2022-12-02 22:25:22.378751: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the 

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 200)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 134734080 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 2307      
Total params: 134,736,387
Trainable params: 134,736,387
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f815e3dbe50>