In [1]:
import json
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import TFAutoModel
  


In [2]:
def set_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Args:
        SEED: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global generator and TensorFlow's global generator. It is
            also exported as the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # Python's own generator was previously left unseeded.
    random.seed(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    
# Fix the seed once, before any data or model code runs.
SEED = 42
set_seed(SEED)

In [3]:
# Load the raw training CSV and preview its first rows.
df = pd.read_csv('../data/raw/sofmattress_train.csv')
df.head()

Unnamed: 0,sentence,label
0,You guys provide EMI option?,EMI
1,Do you offer Zero Percent EMI payment options?,EMI
2,0% EMI.,EMI
3,EMI,EMI
4,I want in installment,EMI


In [4]:

# Load the label mapping used below to translate the `label` column of `folds`.
# JSON text is UTF-8, so read it as such instead of relying on the platform's
# locale-dependent default encoding.
with open('../config/labels.json', encoding='utf-8') as json_file:
    label_mapping = json.load(json_file)
  

In [5]:
# Read the fold table; each row carries a sentence, its label and a `fold` id.
folds = pd.read_csv('../data/folds.csv')

In [6]:
# Translate each label through `label_mapping`.
# `Series.map` turns every value that is not a key of the mapping into NaN, so a
# typo in the data -- or simply running this cell a second time, when the column
# already holds mapped values -- would silently wipe labels. Validate first.
_mapped_labels = folds['label'].map(label_mapping)
if _mapped_labels.isna().any():
    _unmapped = sorted(folds.loc[_mapped_labels.isna(), 'label'].astype(str).unique())
    raise ValueError(
        f"labels not found in label_mapping (was this cell already run?): {_unmapped}"
    )
folds['label'] = _mapped_labels

In [7]:
# Display the fold table after the label column has been mapped.
folds

Unnamed: 0,index,sentence,label,fold
0,0,May I please know about the offers,20,4
1,1,Current state of my order,16,2
2,2,I want refund,17,0
3,3,Can pay later on delivery,1,1
4,4,Can I cancel my order here,18,4
...,...,...,...,...
323,323,Product Variants,13,3
324,324,Is COD option available,1,2
325,325,Almost 1 month over,15,3
326,326,When will the order be delivered to me?,16,4


In [8]:
# Fold id held out for validation; also used to name the checkpoint, log and
# result directories further down.
valid_fold=0

In [9]:
# Split on the held-out fold; every other fold is used for training.
train_data = folds[folds.fold != valid_fold].reset_index(drop=True)
valid_data = folds[folds.fold == valid_fold].reset_index(drop=True)

# One-hot encode against the full set of labels present in `folds`, not the
# labels that happen to occur in each split. Encoding the splits independently
# yields a different number (and meaning) of label columns whenever a class is
# missing from one split.
label_dtype = pd.CategoricalDtype(categories=sorted(folds['label'].dropna().unique()))
# dtype is pinned to uint8 so the label tensors keep the same dtype regardless
# of the pandas version's get_dummies default.
train_data = pd.get_dummies(train_data.astype({'label': label_dtype}),
                            columns=['label'], dtype=np.uint8)
valid_data = pd.get_dummies(valid_data.astype({'label': label_dtype}),
                            columns=['label'], dtype=np.uint8)

# Lower-case the text; the tokenizer loaded later is an uncased checkpoint.
train_data['sentence'] = train_data['sentence'].str.lower()
valid_data['sentence'] = valid_data['sentence'].str.lower()

In [10]:
# Pick the one-hot label columns by their `label_` prefix rather than by
# position, so adding or dropping a leading column (e.g. `fold`) cannot shift
# the slice onto the wrong columns.
label_train = train_data.filter(regex=r'^label_')
label_valid = valid_data.filter(regex=r'^label_')

In [11]:
# Inspect the one-hot training labels (one column per class).
label_train

Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,...,label_11,label_12,label_13,label_14,label_15,label_16,label_17,label_18,label_19,label_20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
258,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
259,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [12]:
# Fixed token length every sentence is padded/truncated to; it also sets the
# shape of the model's Input layers.
# NOTE(review): the sentences previewed above are only a few words long, so 512
# is presumably far larger than needed and makes every step more expensive --
# confirm before changing, since saved checkpoints depend on this shape.
seq_len = 512
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [13]:
def get_id_masks(data):
    """Tokenize every phrase in ``data`` to fixed-length ids and attention masks.

    Each phrase is encoded on its own with the module-level ``tokenizer``,
    padded or truncated to ``seq_len`` tokens.

    Args:
        data: Iterable of strings (for example a DataFrame text column).

    Returns:
        A tuple ``(ids, masks)`` of two lists with one entry per phrase, in
        input order. Each entry is the first row of the tokenizer's
        ``input_ids`` / ``attention_mask`` output for that phrase.
    """
    ids = []
    masks = []
    for phrase in tqdm(data):
        tokens = tokenizer.encode_plus(phrase,
                                       max_length=seq_len,
                                       truncation=True,
                                       padding='max_length',
                                       add_special_tokens=True,
                                       return_tensors='tf')
        # The tokenizer returns a batch of one; keep only that single row.
        ids.append(tokens['input_ids'][0])
        masks.append(tokens['attention_mask'][0])
    return ids, masks
    
# Tokenize the (already lower-cased) sentences of both splits.
X_train_ids, X_train_masks = get_id_masks(train_data['sentence'])
X_valid_ids, X_valid_masks = get_id_masks(valid_data['sentence'])


  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

In [14]:
# Build one tf.data element per example: (token ids, attention mask, one-hot label).
dataset_train = tf.data.Dataset.from_tensor_slices((X_train_ids, X_train_masks, label_train))
dataset_valid = tf.data.Dataset.from_tensor_slices((X_valid_ids, X_valid_masks, label_valid))
# Show the element spec (shapes and dtypes) of the training dataset.
dataset_train.take(1)


<TakeDataset shapes: ((512,), (512,), (21,)), types: (tf.int32, tf.int32, tf.uint8)>

In [15]:
def map_func(X_ids, X_masks, label):
    """Repackage one dataset element as ``(features, label)``.

    The features dict is keyed by ``'input_ids'`` and ``'attention_mask'``,
    the same names given to the model's two Input layers.
    """
    features = {
        'input_ids': X_ids,
        'attention_mask': X_masks,
    }
    return features, label

# Training: shuffle, then batch. Incomplete final batches are dropped so every
# training step sees exactly 8 examples.
dataset_train = dataset_train.map(map_func).shuffle(1000).batch(8, drop_remainder=True)
# Validation: keep a fixed order and keep the last partial batch. Shuffling
# together with drop_remainder=True discarded a different handful of examples
# on every pass, so the validation metrics watched by the checkpoint /
# early-stopping callbacks were computed on a changing subset.
dataset_valid = dataset_valid.map(map_func).batch(8)
# Show the batched element spec of the training dataset.
dataset_train.take(1)

<TakeDataset shapes: ({input_ids: (8, 512), attention_mask: (8, 512)}, (8, 21)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.uint8)>

In [16]:
# Load the pretrained encoder from the same checkpoint name as the tokenizer,
# then print its layer/parameter summary.
bert = TFAutoModel.from_pretrained('bert-base-uncased')
bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Two integer inputs of length seq_len; their names match the keys produced by
# map_func / ret_token so dict-shaped batches are routed by name.
_input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
_input_masks = tf.keras.layers.Input(shape=(seq_len, ), name='attention_mask', dtype='int32')

# Run the BERT main layer and keep element [1] of its output.
# NOTE(review): index 1 is presumably the pooled sentence-level vector -- confirm
# against the installed transformers version.
embeddings = bert.bert(_input_ids, _input_masks)[1]

# Classification head: hidden ReLU layer -> dropout -> softmax over the classes.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
drop = tf.keras.layers.Dropout(0.5)(x)
# Output width is the number of distinct labels in the raw training CSV; it must
# equal the number of one-hot label columns fed to the model.
y = tf.keras.layers.Dense(df['label'].nunique(), activation='softmax')(drop)


model = tf.keras.Model(inputs= [_input_ids, _input_masks], outputs=y)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]            

In [18]:
# Create the per-fold checkpoint and log directories (no error if they exist).
os.makedirs('../checkpoints/fold_'+str(valid_fold), exist_ok=True)
os.makedirs('../logs/fold_'+str(valid_fold), exist_ok=True)

In [19]:
# mc keeps only the best model seen so far; lm overwrites its file every epoch.
# No `monitor` is passed to any callback, so each uses the Keras default
# (presumably validation loss -- confirm for the installed TF version).
mc = tf.keras.callbacks.ModelCheckpoint('../checkpoints/fold_'+str(valid_fold)+'/best_model.h5',verbose=1, save_best_only=True)
lm = tf.keras.callbacks.ModelCheckpoint('../checkpoints/fold_'+str(valid_fold)+'/last_model.h5',verbose=1, save_best_only=False)

# Learning-rate reduction uses patience=2, early stopping uses patience=5.
plat = tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1)
es = tf.keras.callbacks.EarlyStopping(verbose=1, patience=5)
# TensorBoard logs go to the per-fold log directory.
tb = tf.keras.callbacks.TensorBoard(log_dir = '../logs/fold_'+str(valid_fold))



In [None]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras optimizers warn about or reject.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# Categorical (not sparse) loss and accuracy, because the labels are one-hot rows.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer, loss=loss, metrics=[acc])

In [None]:
# True when TensorFlow can see at least one GPU. Replaces the deprecated
# tf.test.is_gpu_available() with the device-listing API.
len(tf.config.list_physical_devices('GPU')) > 0

In [None]:
# List the GPU devices visible to TensorFlow (an empty list means CPU only).
tf.config.list_physical_devices('GPU')

In [None]:
# Train for up to 25 epochs; the callbacks handle checkpointing, learning-rate
# reduction, early stopping and TensorBoard logging.
model.fit(dataset_train, epochs=25, validation_data=dataset_valid, callbacks=[mc, plat, es, lm, tb])

In [20]:
# Reload the best checkpoint from disk for inference. This reads whatever file
# is already saved for this fold, so it reflects the most recent training run
# that wrote it -- not necessarily a run from this kernel session.
my_model = tf.keras.models.load_model('../checkpoints/fold_'+str(valid_fold)+'/best_model.h5')

In [21]:
def ret_token(phrase):
    """Tokenize a single phrase into the feature dict the classifier expects.

    The phrase is padded or truncated to ``seq_len`` tokens with the
    module-level ``tokenizer``.

    Args:
        phrase: Text to encode.

    Returns:
        Dict with keys ``'input_ids'`` and ``'attention_mask'``, each holding
        the tokenizer's batch-of-one output cast to ``tf.int32``.
    """
    tokens = tokenizer.encode_plus(phrase,
                                   max_length=seq_len,
                                   truncation=True,
                                   padding='max_length',
                                   add_special_tokens=True,
                                   return_tensors='tf',
                                   return_token_type_ids=False)

    # Token ids and mask flags are integers and the model's Input layers are
    # declared with dtype='int32' (as were the training tensors), so feed int32
    # instead of converting them to float64.
    return {
        'input_ids': tf.cast(tokens['input_ids'], tf.int32),
        'attention_mask': tf.cast(tokens['attention_mask'], tf.int32),
    }

In [22]:
def get_prediction(data):
    """Predict class probabilities for every sentence in ``data``.

    Each sentence is lower-cased, tokenized with ``ret_token`` and passed to
    ``my_model.predict`` on its own.

    Args:
        data: Mapping/DataFrame with a ``'sentence'`` entry of strings.

    Returns:
        List with one ``my_model.predict`` result per sentence, in input order.
    """
    return [
        my_model.predict(ret_token(sentence.lower()))
        for sentence in tqdm(data['sentence'])
    ]

In [23]:
def get_full_data_preds(data):
    """Return ``data`` with one extra column per predicted class probability.

    Args:
        data: DataFrame with a ``'sentence'`` column.

    Returns:
        A new DataFrame: the columns of ``data`` followed by the probability
        columns (named 0, 1, 2, ...), one row per input row.
    """
    _preds = get_prediction(data)
    # Each prediction is a batch of one; keep just its single row.
    _preds = [item[0] for item in _preds]
    # Build the frame on data's own index. concat(axis=1) aligns on index, so a
    # default RangeIndex here would produce NaN-padded, misaligned rows whenever
    # `data` is not indexed 0..n-1 (e.g. a filtered frame that was not reset).
    _preds_df = pd.DataFrame(_preds, index=data.index)
    return pd.concat([data, _preds_df], axis=1)

In [24]:
# Score the training split with the reloaded model.
train_preds = get_full_data_preds(train_data)

  0%|          | 0/262 [00:00<?, ?it/s]

In [25]:
# Number of examples in the held-out fold.
len(valid_data)

66

In [26]:
# Score the held-out fold with the reloaded model.
valid_preds= get_full_data_preds(valid_data)

  0%|          | 0/66 [00:00<?, ?it/s]

In [27]:
# Create the per-fold results directory (no error if it already exists).
os.makedirs('../results/fold_'+str(valid_fold), exist_ok=True)

In [28]:
# Write both prediction tables to the fold's results directory, without the
# DataFrame index column.
train_preds.to_csv('../results/fold_'+str(valid_fold)+'/train_preds.csv', index=False)
valid_preds.to_csv('../results/fold_'+str(valid_fold)+'/valid_preds.csv', index=False)
