In [1]:
from collections import defaultdict
from allennlp.predictors.predictor import Predictor
import allennlp_models.pair_classification
from tqdm import tqdm
import pandas as pd
import json
import requests

In [2]:
bot_name = 'curekart'

In [3]:
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/decomposable-attention-elmo-2020.04.09.tar.gz", "textual_entailment")

Did not use initialization regex that was passed: .*token_embedder_tokens\._projection.*weight


In [4]:
def get_entail(premise, hypothesis):
    premise = premise.lower()
    hypothesis = hypothesis.lower()
    res = predictor.predict(hypothesis=hypothesis, premise=premise)
    return res

In [5]:
df_train = pd.read_csv(f'train/{bot_name}_train.csv')
df_train.head()

Unnamed: 0,label,sentence
0,CALL_CENTER,What time is your call centre operational duri...
1,CALL_CENTER,is the call center still functioning during lo...
2,CALL_CENTER,what are the working hours of your call center...
3,CALL_CENTER,does covid affext your call center time
4,CALL_CENTER,is your call center working during covid?


In [6]:
all_labels = list(set(df_train['label']))
correct_order_sens = []
correct_order_labels = []
for label in all_labels:
    sens = list(df_train[df_train['label']==label]['sentence'])
    sens = sorted(sens, key = lambda x: len(str(x).split()), reverse=True)
    correct_order_sens.extend(sens)
    correct_order_labels.extend([label]*len(sens))
df_correct_order = pd.DataFrame({'label': correct_order_labels, 'sentence': correct_order_sens})
df_correct_order.head(10)

Unnamed: 0,label,sentence
0,RECOMMEND_PRODUCT,I am confused about what to buy since there ar...
1,RECOMMEND_PRODUCT,I have been trying to maintain a healthy lifes...
2,RECOMMEND_PRODUCT,Can you help me with building an athletic body...
3,RECOMMEND_PRODUCT,I need some hair care products since I have be...
4,RECOMMEND_PRODUCT,I'm here to browse some products because my fr...
5,RECOMMEND_PRODUCT,"I want help to reach my weight goal, can you s..."
6,RECOMMEND_PRODUCT,Can you please suggest me some products specif...
7,RECOMMEND_PRODUCT,give me some other products regarding building...
8,RECOMMEND_PRODUCT,"I am aiming to get a lean body, can you help me?"
9,RECOMMEND_PRODUCT,What is the best pre workout protein to boost ...


In [7]:
# dict of {node_name: list of parent sens}
parent_sens = defaultdict(list)
# dict of {node_name: {parent: list of child}}
child_sens = defaultdict(lambda: defaultdict(list))
# raw output of entail
entail_output = {}
for index, row in tqdm(df_correct_order.iterrows(), total=df_correct_order.shape[0]):
    is_parent = True
    for psen in parent_sens[row['label']]:
        entail_forward = get_entail(psen, row['sentence'])
        entail_backward = get_entail(row['sentence'], psen)
        entail_output[(psen, row['sentence'])] = entail_forward
        entail_output[(row['sentence'], psen)] = entail_backward
        if entail_forward['label_probs'][0] > 0.6 and entail_backward['label_probs'][0] > 0.6:
            is_parent = False
            child_sens[row['label']][psen].append(row['sentence'])
            break
    if is_parent:
        parent_sens[row['label']].append(row['sentence'])

100%|██████████| 600/600 [1:14:42<00:00,  7.47s/it]


In [8]:
len(parent_sens)

28

In [9]:
all_nodes = []
all_sens = []
for node, sens in parent_sens.items():
    sens = [str(s) for s in sens]
    all_sens.extend(sens)
    all_nodes.extend([node]*len(sens))
assert(len(all_nodes) == len(all_sens))
df_subset_train = pd.DataFrame({'label': all_nodes, 'sentence': all_sens})
df_subset_train.head()

Unnamed: 0,label,sentence
0,RECOMMEND_PRODUCT,I am confused about what to buy since there ar...
1,RECOMMEND_PRODUCT,I have been trying to maintain a healthy lifes...
2,RECOMMEND_PRODUCT,Can you help me with building an athletic body...
3,RECOMMEND_PRODUCT,I need some hair care products since I have be...
4,RECOMMEND_PRODUCT,I'm here to browse some products because my fr...


In [10]:
df_subset_train.shape

(413, 2)

In [11]:
# all children
node_names = []
parents = []
childs = []
for node, pc in child_sens.items():
    for par, children in pc.items():
        node_names.extend([node]*len(children))
        parents.extend([par]*len(children))
        childs.extend(children)
    assert len(node_names) == len(parents) == len(childs)
df_node_parent_child = pd.DataFrame({'label': node_names, 'parents': parents, 'child': childs})
df_node_parent_child.head()

Unnamed: 0,label,parents,child
0,RECOMMEND_PRODUCT,Which products can you help me?,Can you suggest me some products?
1,RECOMMEND_PRODUCT,I want get muscle mass and lean mass,I want to gain more muscles
2,RECOMMEND_PRODUCT,what products would you suggest to lose weight?,I want to gain my weight
3,RECOMMEND_PRODUCT,i mean i want protein supplement,Wanted to get Protein powder
4,RECOMMEND_PRODUCT,I want to get the product,I want to buy


In [12]:
df_subset_train.to_csv(f'train/{bot_name}_subset_train.csv', index=False)

In [13]:
df_node_parent_child.to_csv(f'assets/{bot_name}_allennlp_node_parent_child')

In [14]:
import pickle
with open(f'assets/{bot_name}_allennlp_raw_output.pkl', 'wb') as f:
    pickle.dump(entail_output, f)