In [77]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import tensorflow as tf
from transformers import TFAutoModel
from tqdm import tqdm
from sklearn.metrics import classification_report
import os
import json
import sys
from sklearn.metrics import f1_score

In [2]:
# Fixed sequence length: ret_token passes this as max_length with truncation and
# padding='max_length', so every phrase is encoded to exactly this many tokens.
seq_len = 128
# Tokenizer loaded from the pretrained 'bert-base-uncased' vocabulary.
# NOTE(review): presumably the same tokenizer/length used at training time - confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def set_seed(SEED):
    """Seed the random sources this notebook relies on.

    Writes ``str(SEED)`` to the ``PYTHONHASHSEED`` environment variable, then
    seeds NumPy's global RNG and TensorFlow's global random seed with ``SEED``.

    Args:
        SEED: seed value accepted by ``np.random.seed`` and ``tf.random.set_seed``.
    """
    hash_seed = str(SEED)
    os.environ.update(PYTHONHASHSEED=hash_seed)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

In [10]:
def ret_token(phrase):
    """Encode one phrase into the input dict fed to the model.

    The phrase is tokenized with the module-level ``tokenizer``, truncated or
    padded to ``seq_len`` tokens, with special tokens added and no token type ids.

    Args:
        phrase: text to encode.

    Returns:
        dict with keys ``'input_ids'`` and ``'attention_mask'``, each a
        TensorFlow tensor cast to ``tf.float64``.
    """
    encoded = tokenizer.encode_plus(
        phrase,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
        return_token_type_ids=False,
    )
    # NOTE(review): token ids are integers, yet both tensors are cast to float64;
    # presumably the saved model's inputs were declared with this dtype - confirm
    # before changing.
    return {
        key: tf.cast(encoded[key], tf.float64)
        for key in ('input_ids', 'attention_mask')
    }

def get_prediction(my_model, data):
    """Predict every sentence in ``data['sentence']``, one sentence per call.

    Each sentence is lower-cased, encoded with ``ret_token`` and passed to
    ``my_model.predict``; a tqdm progress bar tracks the iteration.

    Args:
        my_model: object exposing ``predict``.
        data: mapping/DataFrame with a ``'sentence'`` column of strings.

    Returns:
        list of the raw ``predict`` outputs, in the order of ``data['sentence']``.
    """
    return [
        my_model.predict(ret_token(sentence.lower()))
        for sentence in tqdm(data['sentence'])
    ]

def get_full_data_preds(my_model, data):
    """Return ``data`` with one extra column per model output value.

    Predictions are obtained with ``get_prediction``; the first element of each
    prediction (``item[0]``) becomes one row of the appended columns.

    Args:
        my_model: model handed through to ``get_prediction``.
        data: DataFrame with a ``'sentence'`` column.

    Returns:
        DataFrame made of ``data``'s columns followed by the prediction columns,
        carrying ``data``'s index.
    """
    _preds = [item[0] for item in get_prediction(my_model, data)]
    # pd.concat(axis=1) aligns on index labels, not on position. Predictions are
    # produced in row order, so give them data's own index; otherwise a filtered
    # or shuffled `data` would get mismatched / NaN-padded rows.
    _preds_df = pd.DataFrame(_preds, index=data.index)
    return pd.concat([data, _preds_df], axis=1)
    
def execute_inference(data, valid_fold):
    """Score ``data`` with one fold's checkpoint and save the result as CSV.

    Loads ``../checkpoints/fold_<valid_fold>/best_model.h5``, appends its
    predictions to ``data`` via ``get_full_data_preds`` and writes the frame to
    ``../results/fold_<valid_fold>/test_preds.csv`` (without the index),
    creating the results directory when it is missing.

    Args:
        data: DataFrame with a ``'sentence'`` column.
        valid_fold: fold identifier used to build both paths.
    """
    fold = str(valid_fold)
    checkpoint_path = '../checkpoints/fold_' + fold + '/best_model.h5'
    results_dir = '../results/fold_' + fold

    my_model = tf.keras.models.load_model(checkpoint_path)
    _preds = get_full_data_preds(my_model, data)

    os.makedirs(results_dir, exist_ok=True)
    _preds.to_csv(results_dir + '/test_preds.csv', index=False)


In [6]:
# Load the test split that every fold's model is scored on.
df_test = pd.read_csv('../data/raw/sofmattress_test.csv')

In [7]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,sentence,label
0,There are only 2 models,NO_NODES_DETECTED
1,Single,NO_NODES_DETECTED
2,What's difference between ergo and ortho,COMPARISON
3,Return order,RETURN_EXCHANGE
4,Hai not recieved my product,DELAY_IN_DELIVERY


In [11]:
# Score the test split with the checkpoint of each fold 0..4; every call writes
# ../results/fold_<i>/test_preds.csv (see execute_inference).
for i in range(0,5):
    execute_inference(df_test,i)

100%|██████████| 397/397 [00:27<00:00, 14.30it/s]
100%|██████████| 397/397 [00:27<00:00, 14.36it/s]
100%|██████████| 397/397 [00:27<00:00, 14.48it/s]
100%|██████████| 397/397 [00:27<00:00, 14.38it/s]
100%|██████████| 397/397 [00:27<00:00, 14.34it/s]


## Test statistics

In [12]:
# Load the label mapping; it is applied to the 'label' column with Series.map
# further down. NOTE(review): presumably maps label names to integer class ids - confirm.
with open('../config/labels.json') as json_file:
    label_mapping = json.load(json_file)

In [21]:

# Stack the saved test predictions of folds 0..4 into one frame. Read every file
# first and concatenate once: growing a frame with concat inside a loop copies
# all accumulated rows on each iteration, and concatenating onto an empty
# DataFrame is deprecated in recent pandas.
fold_frames = [
    pd.read_csv('../results/fold_' + str(i) + '/test_preds.csv')
    for i in range(0, 5)
]
df = pd.concat(fold_frames)

In [22]:
# Expose each row's position within its fold file as an 'index' column so the
# folds can be grouped row by row. The guard keeps the cell re-runnable: a second
# reset_index() would otherwise add a stray 'level_0' column.
if 'index' not in df.columns:
    df = df.reset_index()

# Average the prediction columns across folds for every test row.
# numeric_only=True leaves the text columns ('sentence', 'label') out; without it
# pandas >= 2.0 raises TypeError when asked to average strings.
df_mean = df.groupby('index').mean(numeric_only=True)

In [28]:
# Preview the stacked per-fold predictions.
df.head()

Unnamed: 0,index,sentence,label,0,1,2,3,4,5,6,...,11,12,13,14,15,16,17,18,19,20
0,0,There are only 2 models,NO_NODES_DETECTED,0.000375,6.8e-05,0.000338,0.000329,0.003039,0.001263,0.000326,...,0.001333,0.00194,0.94571,0.035868,0.000174,0.000614,0.000439,0.000309,0.003437,0.000725
1,1,Single,NO_NODES_DETECTED,0.011035,0.006604,0.000569,0.000412,0.000286,0.006642,0.000893,...,0.000693,0.000338,0.007013,0.00073,0.119074,0.008688,0.004157,0.00051,0.819387,0.001122
2,2,What's difference between ergo and ortho,COMPARISON,1.3e-05,5.4e-05,0.000104,0.001262,0.993293,0.000183,0.000355,...,1.9e-05,0.000152,0.001308,0.001283,6.1e-05,3.2e-05,0.000637,9.6e-05,6.8e-05,8.5e-05
3,3,Return order,RETURN_EXCHANGE,0.000443,0.009688,0.003035,0.000981,0.000256,0.002209,0.001804,...,0.000139,3.6e-05,0.000279,4.9e-05,0.031462,0.134524,0.79519,0.009349,0.002357,0.000187
4,4,Hai not recieved my product,DELAY_IN_DELIVERY,0.000139,0.007549,0.000729,0.000635,0.0002,0.002177,0.001184,...,1.5e-05,1e-05,0.000215,3.7e-05,0.067952,0.001047,0.910713,0.000852,0.003484,0.000104


In [24]:
# Take the sentence/label columns from one fold's output file and expose the row
# position as an 'index' column.
# NOTE(review): fold_1 looks like an arbitrary choice - every fold scored the same
# test frame, so these columns should be identical across folds; confirm.
_df_fold = (
    pd.read_csv('../results/fold_1/test_preds.csv')[['sentence', 'label']]
    .reset_index()
)

In [30]:
# Place the fold-averaged prediction columns next to sentence/label; concat with
# axis=1 pairs rows by index label.
final_df_test = pd.concat([_df_fold, df_mean], axis=1)

In [31]:
# Replace each value in 'label' by its entry in label_mapping.
# NOTE(review): not idempotent - running this cell twice would look up the
# already-mapped values and presumably yield NaN; confirm before re-running.
final_df_test['label'] = final_df_test['label'].map(label_mapping)

In [42]:
# Names of the prediction columns: the strings '0'..'20' (21 columns), one per
# model output value as read back from the results CSVs.
pred_cols = [str(i) for i in range(0,21)]

In [44]:
def get_predicted_label(row):
    """Fill ``pred_label`` and ``confidence`` for one row of predictions.

    Reads the columns listed in ``pred_cols``, picks the position of the largest
    value as the predicted label and stores that value as the confidence.

    Args:
        row: pandas Series holding every column named in ``pred_cols``.

    Returns:
        The same ``row`` with ``'confidence'`` (largest value, float) and
        ``'pred_label'`` (its position within ``pred_cols``, int) set.
    """
    # Work on a plain float ndarray: np.argmax on an object-dtype Series followed
    # by integer lookup on a string-labelled index relies on positional fallback
    # that pandas has deprecated.
    probs = row[pred_cols].to_numpy(dtype=float)
    indx = int(probs.argmax())
    row['confidence'] = probs[indx]
    row['pred_label'] = indx
    return row

# Seed both result columns with a -1 placeholder, then let get_predicted_label
# fill them row by row.
final_df_test = (
    final_df_test
    .assign(pred_label=-1, confidence=-1)
    .apply(get_predicted_label, axis=1)
)

In [47]:
def get_testing_accuracy(row):
    """Flag whether a row's prediction matches its true label.

    Args:
        row: mapping/Series with ``'label'`` and ``'pred_label'`` entries.

    Returns:
        ``'yes'`` when the two entries compare equal, otherwise ``'no'``.
    """
    return 'yes' if row['label'] == row['pred_label'] else 'no'

In [48]:
# Mark every row 'yes'/'no' depending on whether pred_label equals label.
final_df_test['correct'] = final_df_test.apply(get_testing_accuracy, axis=1)

In [50]:
# Share of correct ('yes') vs incorrect ('no') rows over the whole test set.
final_df_test['correct'].value_counts(normalize=True)

no     0.561713
yes    0.438287
Name: correct, dtype: float64

## In-scope accuracy (rows whose true label is not 21)

In [63]:
# Same breakdown, in percent, restricted to rows whose true label is not 21.
# NOTE(review): 21 appears to be the out-of-scope class id - confirm against labels.json.
final_df_test[final_df_test['label']!=21].correct.value_counts(normalize=True) * 100

yes    75.324675
no     24.675325
Name: correct, dtype: float64

## Weighted F1

In [98]:

# Weighted F1 over the rows whose true label is not 21; filter once and reuse.
_inscope_rows = final_df_test[final_df_test['label'] != 21]
f1_score(_inscope_rows['label'], _inscope_rows['pred_label'], average='weighted')

0.753275866374932

## In-scope accuracy is about 75.3% (weighted F1 ≈ 0.753) before any confidence threshold

In [102]:
# Sweep a confidence cut-off: any prediction whose confidence falls below the
# threshold is relabelled as class 21, then overall accuracy, accuracy on rows
# whose true label is not 21, and weighted F1 on those rows are recorded.
# NOTE(review): 21 appears to be the out-of-scope class id - confirm against labels.json.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
accuracy = []
inscope_accuracy = []
f1 = []


def _percent_correct(frame):
    """Percentage of rows in ``frame`` whose 'correct' flag is 'yes'."""
    hits = len(frame[frame['correct'] == 'yes'])
    misses = len(frame[frame['correct'] == 'no'])
    return hits / (hits + misses) * 100


for threshold in thresholds:
    print("#"*30)
    print("Threshold: ", threshold)

    # Work on a copy so the unthresholded predictions stay untouched.
    local_df = final_df_test.copy()
    low_confidence = local_df['confidence'] < threshold
    local_df.loc[low_confidence, 'pred_label'] = 21
    local_df['correct'] = local_df.apply(get_testing_accuracy, axis=1)

    # Accuracy over every test row.
    acc = _percent_correct(local_df)

    # From here on only rows whose true label is not 21 are considered.
    local_df = local_df[local_df['label'] != 21]

    f1.append(f1_score(list(local_df['label']), list(local_df['pred_label']),  average='weighted'))

    in_acc = _percent_correct(local_df)
    accuracy.append(acc)
    inscope_accuracy.append(in_acc)
    

##############################
Threshold:  0.1
##############################
Threshold:  0.2
##############################
Threshold:  0.3
##############################
Threshold:  0.4
##############################
Threshold:  0.5
##############################
Threshold:  0.6
##############################
Threshold:  0.7
##############################
Threshold:  0.8
##############################
Threshold:  0.9


In [103]:
# Summary table: one row per threshold with the three metrics collected above
# (both accuracies are percentages, F1 is on a 0-1 scale).
pd.DataFrame({'Threshold': thresholds, 'Accuracy':accuracy, 'Inscope Accuracy':inscope_accuracy, 'F1 Scores':f1})

Unnamed: 0,Threshold,Accuracy,Inscope Accuracy,F1 Scores
0,0.1,43.828715,75.324675,0.753276
1,0.2,47.103275,75.324675,0.755909
2,0.3,57.43073,70.995671,0.731247
3,0.4,63.979849,67.099567,0.709103
4,0.5,67.758186,64.502165,0.69473
5,0.6,69.269521,58.874459,0.667155
6,0.7,67.254408,50.21645,0.590784
7,0.8,62.720403,39.393939,0.484181
8,0.9,59.193955,32.034632,0.401859
