In [10]:
import os
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense,Dropout, Input
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import regularizers
from transformers import TFDistilBertModel,DistilBertTokenizer,DistilBertConfig
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from text_cleaning import preprocess_sentence

max_len, num_classes = 32, 2

dbert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dbert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")


def bert_preproc (sentences):
    # Prepare the model input
    input_ids = []
    attention_masks = []

    for sent in sentences:
        dbert_inps = dbert_tokenizer.encode_plus(sent, add_special_tokens=True, max_length=max_len,
                                                 pad_to_max_length=True, return_attention_mask=True, truncation=True)
        input_ids.append(dbert_inps['input_ids'])
        attention_masks.append(dbert_inps['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    return input_ids, attention_masks


def bert_model ():
    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:, 0, :]
    dense = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    print(model.summary())
    return model



class BERT_Classification:
    def __init__(self):

        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
        self.model = bert_model ()


    def bert_train (self, workspace, train_inp, train_mask, train_label, val_inp, val_mask, val_label,num_epochs):
        log_dir = workspace + os.sep + 'log'
    
        if os.path.exists(log_dir) and os.path.isdir(log_dir):
            shutil.rmtree(log_dir)

        os.makedirs(log_dir, exist_ok=True)
        model_save_path = workspace + os.sep +  'dbert_model.h5'
        chkpt = ModelCheckpoint(filepath = model_save_path, save_weights_only=True, monitor='val_loss',
                                               mode='min', save_best_only=True)
        tboard = TensorBoard(log_dir=log_dir)
        callbacks = [chkpt, tboard]

        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=[self.metric])
        history = self.model.fit([train_inp, train_mask], train_label, batch_size=16, epochs=num_epochs,
                            validation_data=([val_inp, val_mask], val_label), callbacks=callbacks)
        return history


    def bert_predict (self, workspace, val_inp, val_mask):
        model_save_path = workspace + os.sep + 'dbert_model.h5'
        trained_model = bert_model()
        trained_model.compile(loss=self.loss, optimizer=self.optimizer, metrics=[self.metric])
        trained_model.load_weights(model_save_path)
        preds = trained_model.predict([val_inp, val_mask], batch_size=16)
        pred_labels = preds.argmax(axis=1)
        return pred_labels, preds.max(axis=1)



def get_data ():
   
    file_path = r"C:\Users\jayanti.prasad\Data\NLP_DATA\train\IMDB_reviews.csv" 

    df = pd.read_csv(file_path, encoding='utf-8',nrows=1000)
    return df  


def plot_history(history):
    fig, axs = plt.subplots(2, 1, figsize=(12, 12))

    axs[0].set_ylabel("Loss")
    axs[1].set_ylabel("Accuracy")

    axs[1].plot(history.history['accuracy'], '-o', label="Training")
    axs[1].plot(history.history['val_accuracy'], '-o', label='Validation')
    axs[0].plot(history.history['loss'], '-o', label='Training')
    axs[0].plot(history.history['val_loss'], '-o', label='Validation')

    axs[0].legend()
    axs[1].legend()
    plt.legend()
    plt.show()


if __name__ == "__main__":
    max_len = 32
    num_epochs = 10 

    df = get_data() 
    
    print(df.columns, df.shape)
    # prepare the data

    sentences = df['text'].to_list()
    labels = df['label'].to_list()
    labels = np.array(labels)
    print(len(sentences), len(labels))
      
    input_ids, attention_masks  = bert_preproc (sentences)

    label_class_dict = {0: 'n', 1: 'y'}
    
    target_names = label_class_dict.values()

    train_inp, val_inp, train_label, val_label, train_mask, val_mask \
        = train_test_split(input_ids, labels,attention_masks, test_size=0.2)

    M = BERT_Classification()
    history = M.bert_train(workspace, train_inp, train_mask, train_label, val_inp, val_mask, val_label,num_epochs)
    plot_history(history)


    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input-file',help='Input File')
    parser.add_argument('-c','--column-name',help='Column Name')
    parser.add_argument('-o','--output-dir',help='Output dir')
    parser.add_argument('-n','--num_epochs',type=int,help='Number of epochs')

    args = parser.parse_args()

    data = get_data(args.input_file, args.column_name)

    data.rename(columns={'COMPLEATION_NOTES': 'text'}, inplace=True)

    workspace = args.output_dir

    os.makedirs (workspace, exist_ok=True)

    #data['gt'] = data['label'].map({'n': 0, 'y': 1})
    #print('Available labels: ', data.label.unique())
    data['text'] = data['text'].map(preprocess_sentence)
    num_classes = len(data.label.unique())

    # prepare the data
    max_len = 32
    sentences = data['text']
    labels = data['label'].to_list()
    labels = np.array(labels)
    print(len(sentences), len(labels))
    print("labels=",labels)
    #sys.exit()

    input_ids, attention_masks  = bert_preproc (sentences)

    label_class_dict = {0: 'n', 1: 'y'}
    target_names = label_class_dict.values()

    train_inp, val_inp, train_label, val_label, train_mask, val_mask \
        = train_test_split(input_ids, labels,attention_masks, test_size=0.2)

    M = BERT_Classification()
    history = M.bert_train(workspace, train_inp, train_mask, train_label, val_inp, val_mask, val_label,args.num_epochs)
    plot_history(history)

    lab_p, prob_p = M.bert_predict(workspace, val_inp, val_mask)

    print(lab_p)
    print(prob_p)
    """

loading configuration file config.json from cache at C:\Users\jayanti.prasad/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\jayanti.prasad/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb

Index(['Unnamed: 0', 'text', 'label'], dtype='object') (1000, 3)
1000 1000


AttributeError: 'KerasTensor' object has no attribute 'size'