In [10]:
import json
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import TFAutoModel
  


In [4]:
def set_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Seeds the ``PYTHONHASHSEED`` environment variable, the standard-library
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    Args:
        SEED: Integer seed applied to all generators.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences processes started after this call.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    random.seed(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    
# Fix the seed once, before any data shuffling or model construction below.
SEED = 42
set_seed(SEED)

In [6]:
# Load the raw training sentences and preview the first rows.
df = pd.read_csv('../data/raw/sofmattress_train.csv')
df.head()

Unnamed: 0,sentence,label
0,You guys provide EMI option?,EMI
1,Do you offer Zero Percent EMI payment options?,EMI
2,0% EMI.,EMI
3,EMI,EMI
4,I want in installment,EMI


In [13]:

# Read the label lookup that is later applied to the `label` column of the
# folds table.
with open('../config/labels.json') as labels_fp:
    label_mapping = json.load(labels_fp)
  

In [53]:
# Load the sentences together with their cross-validation fold assignment.
folds = pd.read_csv('../data/folds.csv')

In [54]:
# Encode the labels through `label_mapping`.  Series.map() silently yields NaN
# for any value missing from the mapping (which is also what happens when this
# cell is re-run on already-encoded labels), so fail loudly instead of letting
# NaN targets flow into the one-hot encoding further down.
encoded_label = folds['label'].map(label_mapping)
if encoded_label.isna().any():
    unknown_labels = sorted(
        folds.loc[encoded_label.isna(), 'label'].astype(str).unique()
    )
    raise ValueError(
        'labels missing from label_mapping: ' + ', '.join(unknown_labels)
    )
folds['label'] = encoded_label

In [55]:
# Inspect the folds table after the label column has been mapped.
folds

Unnamed: 0,index,sentence,label,fold
0,0,May I please know about the offers,20,4
1,1,Current state of my order,16,2
2,2,I want refund,17,0
3,3,Can pay later on delivery,1,1
4,4,Can I cancel my order here,18,4
...,...,...,...,...
323,323,Product Variants,13,3
324,324,Is COD option available,1,2
325,325,Almost 1 month over,15,3
326,326,When will the order be delivered to me?,16,4


In [57]:
# Fold held out for validation; every other fold is used for training.
# NOTE(review): `valid_fold` was not assigned anywhere in the notebook, so this
# cell failed with a NameError on a fresh kernel.  0 is a placeholder value --
# confirm which fold was intended (the `fold` column holds small integers).
valid_fold = 0

# One-hot encode the labels and lower-case the sentences BEFORE splitting, so
# the train and validation frames always share identical label columns even
# when a class is absent from one of the two splits.
folds_encoded = pd.get_dummies(folds, columns=['label'])
folds_encoded['sentence'] = folds_encoded['sentence'].str.lower()

is_valid = folds_encoded['fold'] == valid_fold
train_data = folds_encoded[~is_valid].copy()
valid_data = folds_encoded[is_valid].copy()

In [59]:
# Select the one-hot target columns by name rather than by position, so the
# targets stay correct if a column is added or dropped upstream (for example
# if the `fold` column is removed before this cell runs).
label_train = train_data.filter(regex=r'^label_')
label_valid = valid_data.filter(regex=r'^label_')

In [60]:
# Inspect the one-hot training targets.
label_train

Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,...,label_11,label_12,label_13,label_14,label_15,label_16,label_17,label_18,label_19,label_20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
323,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
324,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Every sentence is padded/truncated to `seq_len` tokens by the tokenizer
# calls further down.
# NOTE(review): 512 is presumably the model's maximum length; the sentences
# are short, so a much smaller value would likely save memory -- confirm.
seq_len = 512
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [38]:
def get_id_masks(data):
    """Tokenize each phrase and collect its input ids and attention mask.

    Each phrase is encoded with the module-level ``tokenizer`` and
    padded/truncated to the module-level ``seq_len``.

    Args:
        data: Iterable of strings (e.g. a DataFrame column of sentences).

    Returns:
        A ``(ids, masks)`` tuple of lists holding, per phrase, the first row
        of the tokenizer's ``input_ids`` and ``attention_mask`` outputs.
        Both lists are empty when ``data`` is empty.
    """
    ids = []
    masks = []
    for phrase in data:
        encoded = tokenizer.encode_plus(phrase,
                                        max_length=seq_len,
                                        truncation=True,
                                        padding='max_length',
                                        add_special_tokens=True,
                                        return_tensors='tf')
        # Each call encodes a single phrase; keep only its (first) row.
        ids.append(encoded['input_ids'][0])
        masks.append(encoded['attention_mask'][0])
    return ids, masks
    
# Tokenize the training and validation sentences.
X_train_ids, X_train_masks = get_id_masks(train_data['sentence'])
X_valid_ids, X_valid_masks = get_id_masks(valid_data['sentence'])


NameError: name 'tokenizer' is not defined

In [None]:
# Build tf.data pipelines whose elements are (input ids, attention mask,
# one-hot label) triples, one per sentence.
dataset_train = tf.data.Dataset.from_tensor_slices((X_train_ids, X_train_masks, label_train))
dataset_valid = tf.data.Dataset.from_tensor_slices((X_valid_ids, X_valid_masks, label_valid))
dataset_train.take(1)


In [48]:
def map_func(X_ids, X_masks, label):
    """Repackage a flat (ids, masks, label) element as ``(features, label)``.

    The feature dict uses the keys ``input_ids`` and ``attention_mask``, which
    are the names given to the model's Keras Input layers.
    """
    features = dict(input_ids=X_ids, attention_mask=X_masks)
    return features, label

# Training pipeline: shuffle and use fixed-size batches of 8.
dataset_train = dataset_train.map(map_func).shuffle(1000).batch(8, drop_remainder=True)
# Validation pipeline: keep the original order and keep the final partial
# batch, so every validation sentence contributes to the validation metrics
# and predictions stay aligned with `valid_data`.
dataset_valid = dataset_valid.map(map_func).batch(8)
dataset_train.take(1)

In [None]:
# Load the pretrained BERT encoder that the classifier below is built on.
bert = TFAutoModel.from_pretrained('bert-base-uncased')
bert.summary()

In [None]:
# Symbolic inputs; their names match the feature-dict keys yielded by the
# tf.data pipelines.
_input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
_input_masks = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# Output index 1 of the BERT main layer is used as the sentence embedding
# (presumably the pooled representation -- confirm against transformers docs).
# The mask is passed by keyword so it cannot be bound to the wrong parameter.
embeddings = bert.bert(_input_ids, attention_mask=_input_masks)[1]

x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
drop = tf.keras.layers.Dropout(0.5)(x)

# Size the softmax from the one-hot training targets so the output width
# always matches the labels the model is actually trained on.
n_classes = label_train.shape[1]
y = tf.keras.layers.Dense(n_classes, activation='softmax')(drop)


model = tf.keras.Model(inputs=[_input_ids, _input_masks], outputs=y)
model.summary()

In [None]:
# Create the per-fold output directories.  os.makedirs creates missing parent
# directories too, and exist_ok=True keeps the cell safe to re-run.
os.makedirs('../checkpoints/fold_' + str(valid_fold), exist_ok=True)
os.makedirs('../logs/fold_' + str(valid_fold), exist_ok=True)

In [49]:
# Checkpoints: `mc` only writes when the monitored metric improves
# (save_best_only=True), `lm` writes after every epoch.
mc = tf.keras.callbacks.ModelCheckpoint('../checkpoints/fold_'+str(valid_fold)+'/best_model.h5',verbose=1, save_best_only=True)
lm = tf.keras.callbacks.ModelCheckpoint('../checkpoints/fold_'+str(valid_fold)+'/last_model.h5',verbose=1, save_best_only=False)

# Learning-rate reduction with patience 2 and early stopping with patience 5.
plat = tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1)
es = tf.keras.callbacks.EarlyStopping(verbose=1, patience=5)
# TensorBoard logs go to the per-fold log directory.
tb = tf.keras.callbacks.TensorBoard(log_dir = '../logs/fold_'+str(valid_fold))

# Full callback list intended for training.
callbacks = [mc, plat, es, lm, tb]

SyntaxError: EOL while scanning string literal (<ipython-input-49-87b76904aade>, line 1)

In [None]:
# `learning_rate` is the supported keyword; the legacy `lr` alias is
# deprecated and rejected by recent Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# One-hot targets with a softmax output: categorical cross-entropy/accuracy.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer, loss=loss, metrics=[acc])

In [None]:
# Train with the complete `callbacks` list so that the last-epoch checkpoint
# and the TensorBoard logger are active as well.
model.fit(dataset_train, epochs=25, validation_data=dataset_valid, callbacks=callbacks)

In [None]:
# Reload the best checkpoint written for this fold.
# NOTE(review): the saved model wraps a transformers BERT layer; load_model
# may need `custom_objects` to deserialize it -- confirm.
my_model = tf.keras.models.load_model('../checkpoints/fold_'+str(valid_fold)+'/best_model.h5')

In [None]:
def ret_token(phrase):
    """Tokenize one phrase into the feature dict consumed by the model.

    The phrase is encoded with the module-level ``tokenizer`` and
    padded/truncated to the module-level ``seq_len``.

    Args:
        phrase: Sentence to encode.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` tensors cast to int32,
        the dtype declared for the model's Input layers.
    """
    tokens = tokenizer.encode_plus(phrase,
                                   max_length=seq_len,
                                   truncation=True,
                                   padding='max_length',
                                   add_special_tokens=True,
                                   return_tensors='tf',
                                   return_token_type_ids=False)

    # Token ids and masks are integers; keep them integral instead of
    # converting them to float64.
    return {
        'input_ids': tf.cast(tokens['input_ids'], tf.int32),
        'attention_mask': tf.cast(tokens['attention_mask'], tf.int32),
    }

In [None]:
def get_prediction(data):
    """Run the reloaded model on every sentence in ``data``.

    Args:
        data: Table-like object whose ``'sentence'`` entry is an iterable of
            strings (e.g. a DataFrame with a ``sentence`` column).

    Returns:
        List with one ``my_model.predict`` result per sentence, in input
        order; empty when there are no sentences.
    """
    _predicted_probs = []
    # Iterate over the argument itself, not the global training frame.
    for item in tqdm(data['sentence']):
        features = ret_token(item)
        _predicted_probs.append(my_model.predict(features))
    return _predicted_probs

In [None]:

# Collect the reloaded model's predictions, passing the training frame.
get_prediction(train_data)


In [None]:
# predict() requires input data; calling it with no arguments raises.
# NOTE(review): the validation pipeline is assumed to be the intended input --
# confirm.
model.predict(dataset_valid)