# Imports and Configuration

In [1]:
from IPython.display import clear_output
!pip install transformers
clear_output()

import numpy as np
import pandas as pd
import random,os
import warnings
warnings.filterwarnings('ignore')

import transformers
from transformers import BertTokenizer
from transformers import TFBertModel

import tensorflow as tf 
from tensorflow.keras.optimizers import Adam

# Input CSV locations, given relative to the notebook's working directory.
TRAIN_PATH = "../input/nlp-getting-started/train.csv"
TEST_PATH = "../input/nlp-getting-started/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/nlp-getting-started/sample_submission.csv"
# Output file written by the prediction cell.
SUBMISSION_PATH = "submission.csv"

# Column names. TARGET is the label column read from train and written to the submission.
# NOTE(review): ID is not referenced by any cell visible in this notebook.
ID = "id"
TARGET = "target"

# Default seed used by seed_everything().
SEED = 2022
def seed_everything(seed=SEED):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator so that weight initialisation, dropout
    and any NumPy/Python sampling start from the same state on every run.

    Args:
        seed: Integer seed. Defaults to the module-level ``SEED``.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the already-running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # TensorFlow keeps its own global seed; without this call the Dense layer
    # initialisers and Dropout masks differ between runs.
    tf.random.set_seed(seed)
    
# Apply the default SEED before any data or model work happens.
seed_everything()


# Pretrained checkpoint name passed to both the tokenizer and TFBertModel.
MODEL_NAME = "bert-large-uncased"
# Fixed token length per example; also the width of the two Keras inputs.
MODEL_MAX_LENGTH = 60
# NOTE(review): the two *_COL names below are not referenced by any visible cell.
MODEL_INPUT_IDS_COL = "input_ids"
MODEL_ATTENTION_MASK_COL = "attention_mask"

# dtype of the two Keras input tensors.
MODEL_DATATYPE = "int32"
# Classification head: Dense(MODEL_DENSE) -> Dropout(MODEL_DROPOUT) -> Dense(1).
MODEL_DENSE = 32
MODEL_DROPOUT = 0.2
MODEL_ACTIVATION = "relu"
MODEL_LAST_ACTIVATION = "sigmoid"
# Arguments for model.compile(): Adam learning rate, loss and metrics.
MODEL_LR = 6e-6
MODEL_LOSS = "binary_crossentropy"
MODEL_METRICS = ['accuracy']
# Arguments for model.fit(): epochs, batch size and validation_split fraction.
MODEL_EPOCH = 2
MODEL_BATCH_SIZE = 10
MODEL_VAL_SIZE = 0.2

2022-08-16 05:11:57.222239: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 05:11:57.223866: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 05:11:57.224890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 05:11:57.227746: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

# Preprocess Data

In [2]:
# Load the training and test CSVs into DataFrames.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Tokenizer for the MODEL_NAME checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

def bert_encode(data, maximum_length):
    """Tokenize every entry of ``data.text`` into fixed-length BERT inputs.

    Relies on the module-level ``tokenizer``. Each text is encoded with
    special tokens added, truncated to ``maximum_length`` tokens and padded
    up to ``maximum_length``, so every row has the same length.

    Args:
        data: Object exposing a ``text`` collection of strings (the train or
            test DataFrame in this notebook).
        maximum_length: Number of token ids produced per text.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per text.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values instead of ``data.text[i]``: the positional-looking
    # lookup is label based and raises KeyError on a filtered or shuffled frame
    # whose index is no longer 0..n-1.
    for text in data.text:
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding="max_length",
            # Explicit truncation; the tokenizer otherwise falls back to the
            # same strategy but emits a warning for every run.
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

# Encode both splits with the same fixed sequence length the model inputs expect.
train_input_ids,train_attention_masks = bert_encode(train,MODEL_MAX_LENGTH)
test_input_ids,test_attention_masks = bert_encode(test,MODEL_MAX_LENGTH)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Define Model

In [3]:
def create_model(bert_model):
    """Build and compile a binary classifier on top of a BERT encoder.

    The network takes two integer inputs of width ``MODEL_MAX_LENGTH``
    (token ids and attention mask), feeds them through ``bert_model`` and
    passes the second element of its output through
    Dense(``MODEL_DENSE``) -> Dropout(``MODEL_DROPOUT``) -> Dense(1).

    Args:
        bert_model: Callable Keras-compatible encoder accepting
            ``[input_ids, attention_mask]`` (a ``TFBertModel`` here).

    Returns:
        The compiled ``tf.keras`` model, configured with Adam at ``MODEL_LR``,
        ``MODEL_LOSS`` and ``MODEL_METRICS``.
    """
    input_ids = tf.keras.Input(shape=(MODEL_MAX_LENGTH,), dtype=MODEL_DATATYPE)
    attention_masks = tf.keras.Input(shape=(MODEL_MAX_LENGTH,), dtype=MODEL_DATATYPE)

    output = bert_model([input_ids, attention_masks])
    # Element 1 of the encoder output is used as the sentence representation
    # (the pooled output according to the Hugging Face BERT documentation).
    output = output[1]
    output = tf.keras.layers.Dense(MODEL_DENSE, activation=MODEL_ACTIVATION)(output)
    output = tf.keras.layers.Dropout(MODEL_DROPOUT)(output)

    output = tf.keras.layers.Dense(1, activation=MODEL_LAST_ACTIVATION)(output)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    # ``learning_rate`` is the supported keyword; the old ``lr`` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        optimizer=Adam(learning_rate=MODEL_LR),
        loss=MODEL_LOSS,
        metrics=MODEL_METRICS,
    )
    return model


# Load the pretrained encoder for the MODEL_NAME checkpoint.
bert_model = TFBertModel.from_pretrained(MODEL_NAME)

# Attach the classification head, compile, and print the layer summary.
model = create_model(bert_model)
model.summary()

Downloading:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 335141888   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 32)           32800       tf_bert_model[0][1]          

# Train Model

In [4]:
# Train on the encoded training texts; the validation set is split off the
# same arrays by Keras using MODEL_VAL_SIZE. The returned History object is
# kept in `history`.
# NOTE(review): Keras documents validation_split as taking the trailing
# fraction of the arrays before shuffling — confirm train.csv has no ordering
# bias in its last rows.
history = model.fit(
    [train_input_ids,train_attention_masks],
    train[TARGET],
    validation_split=MODEL_VAL_SIZE,
    epochs=MODEL_EPOCH,
    batch_size=MODEL_BATCH_SIZE)

Epoch 1/2


2022-08-16 05:12:54.900216: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/2


# Predict Data

In [5]:
# Sigmoid outputs for the test set, one row per example.
pred_test = model.predict([test_input_ids,test_attention_masks])

# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm before relying on the ids.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# Flatten the (n, 1) prediction array to 1-D so the column receives exactly one
# label per row, and threshold explicitly at 0.5 (the same labels np.round
# yields for values in [0, 1], since 0.5 rounds down to 0).
sub[TARGET] = (np.ravel(pred_test) > 0.5).astype(int)
sub.to_csv(SUBMISSION_PATH,index=False)
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
