In [None]:
# This is the preprocessing that prepares the train/dev/test files:
# Removing duplicates and analysis

In [1]:
import time, os
import torch
import numpy as np
import torch.nn as nn

import pandas as pd
from tqdm import tqdm

import json
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_json(filepath):
    """Open `filepath` for reading and return its parsed JSON content."""
    with open(filepath, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def get_stats_df(dataframe):
    """Count rows per (prop, valid) combination.

    Returns a list of 11 rows (prop values 0..10); each row holds the number
    of rows of `dataframe` whose `valid` column equals 1, 0, -1 and -2, in
    that order.
    """
    valid_codes = (1, 0, -1, -2)
    prop_column = dataframe['prop']
    valid_column = dataframe['valid']
    return [
        [
            len(dataframe[(prop_column == prop_id) & (valid_column == code)])
            for code in valid_codes
        ]
        for prop_id in range(11)
    ]

    

# Columns that identify a triplet regardless of the tree / property it came from.
_TRIPLET_KEY_COLUMNS = ['ent_1', 'ent_2', 'ent_3', 'def_1', 'def_2', 'def_3']


def _read_property_frames(datapath, propfile, total_props, only_positives=False):
    """Read the `total_props` property files and stack them into one DataFrame.

    `propfile` is a template containing `{prop}`; it is formatted with
    0..total_props-1 and joined to `datapath`. Each frame gets a `prop` column
    holding the 1-based property number. With `only_positives=True` only rows
    with `valid == 1` are kept from each file.
    """
    frames = []
    for prop in range(total_props):
        prop_path = os.path.join(datapath, propfile.format(prop=str(prop)))
        # The value returned by load_json is handed to pd.read_json unchanged.
        # NOTE(review): presumably the file holds a JSON-encoded string that is
        # itself JSON text -- confirm against the files on disk.
        prop_df = pd.read_json(load_json(prop_path), orient="index")
        prop_df['prop'] = prop + 1
        if only_positives:
            prop_df = prop_df[prop_df['valid'] == 1]
        frames.append(prop_df)
    return pd.concat(frames)


def _drop_soft_negatives_and_duplicates(bigdf):
    """Remove soft negatives and identical triplets, printing stats after each step.

    Returns a new DataFrame; the input frame is not modified.
    """
    # Valid
    # 1 is positive
    # 0 father of A
    # -1 uncles
    # -2 children of uncle
    # -3 soft negative (outside subtree)

    # Remove -3 (soft negatives) because
    # 1. Confusion: the same term with another concept can be found (at which point is this useful?)
    # 2. Hard negatives should be enough to train the model.
    # In any case, we are trying first with only hard negatives.

    # Totals: all, without soft negs, only valids, (5778398, 876097, 491507)
    bigdf = bigdf[bigdf['valid'] != -3]

    print("# of tripets per property after removing soft-negatives")
    display(get_stats_df(bigdf))

    # One pass is enough: rows that are duplicates on (tree_id + triplet key)
    # are also duplicates on the triplet key alone, and keep='first' retains
    # the same first occurrence either way.
    # NOTE(review): the key contains neither `valid` nor `prop`, so a triplet
    # that appears under several properties is kept only once -- confirm this
    # is intended.
    bigdf = bigdf.drop_duplicates(subset=_TRIPLET_KEY_COLUMNS)

    # Final number of triplets for each property
    print("# of tripets per property after removing identical triplets")
    display(get_stats_df(bigdf))
    return bigdf


def _cap_positives_per_tree(bigdf, total_props, max_triplets, rand_seed):
    """Return the positive rows with at most `max_triplets` rows per (prop, tree_id).

    Also prints the tree-id range and displays the average number of positive
    triplets per tree for each property. The first property (prop == 1) is
    kept in full; for the others, any tree holding `max_triplets` rows or more
    is sampled down to exactly `max_triplets` rows using `rand_seed`.
    """
    print("# valid of triplets per property")
    begin_tid, end_tid = bigdf['tree_id'].min(), bigdf['tree_id'].max()
    total_tid = end_tid - begin_tid + 1
    print("Tree ids min - max - total", begin_tid, end_tid, total_tid)

    positives = bigdf[bigdf['valid'] == 1]
    # avg # of valid triplets per property
    display(positives.groupby(['prop']).count()['valid'] / total_tid)

    kept = []
    for prop in range(1, total_props + 1):
        prop_positives = positives[positives['prop'] == prop]
        if prop == 1:
            # The first property is never down-sampled.
            kept.append(prop_positives)
            continue
        # groupby visits tree ids in ascending order and preserves row order
        # within each group, matching a scan over range(begin_tid, end_tid + 1).
        for _, tree_rows in prop_positives.groupby('tree_id'):
            if len(tree_rows) >= max_triplets:
                # A fresh seed per group keeps every tree's sample reproducible
                # independently of the other trees.
                tree_rows = tree_rows.sample(n=max_triplets, random_state=rand_seed)
            kept.append(tree_rows)
    return pd.concat(kept)


def _report_final(finaldf):
    """Print / display the summary of the final frame."""
    print("Final total triplets per property")
    display(get_stats_df(finaldf))

    print("Total positive triplets", len(finaldf[finaldf['valid']==1]), "Total neg triplets",len(finaldf[finaldf['valid']!=1]))
    print("Total rows", len(finaldf))
    display(finaldf.head())


def load_dataset(datapath="/users/cost/jlovonme/data", propfile="hierarchy_props/bansal_with_defs_trainprop_{prop}.json",
                 total_props=9, max_triplets=25, rand_seed=42):
    """Build a split containing hard negatives plus down-sampled positives.

    Parameters
    ----------
    datapath : str
        Directory that `propfile` is relative to.
    propfile : str
        File-name template with a `{prop}` placeholder (0-based property index).
    total_props : int
        Number of property files to read.
    max_triplets : int
        Maximum number of positive triplets kept per (property, tree), except
        for the first property, which is kept in full.
    rand_seed : int
        Seed passed to `DataFrame.sample` for each tree.

    Returns
    -------
    pandas.DataFrame
        All rows with `valid != 1` (soft negatives and duplicates removed)
        followed by the capped positive rows.
    """
    bigdf = _read_property_frames(datapath, propfile, total_props)
    bigdf = _drop_soft_negatives_and_duplicates(bigdf)

    newvaliddf = _cap_positives_per_tree(bigdf, total_props, max_triplets, rand_seed)
    finaldf = pd.concat([bigdf[bigdf['valid'] != 1], newvaliddf])

    _report_final(finaldf)
    return finaldf


def load_only_positives(datapath="/users/cost/jlovonme/data", propfile="hierarchy_props/bansal_with_defs_trainprop_{prop}.json", sample_prop=False,
                        total_props=9, max_triplets=25, rand_seed=42):
    """Build a split containing only positive triplets (`valid == 1`).

    Parameters
    ----------
    datapath, propfile, total_props, max_triplets, rand_seed
        Same meaning as in `load_dataset`.
    sample_prop : bool
        When false, return the de-duplicated positives as they are. When true,
        additionally cap the number of positives per (property, tree) at
        `max_triplets`.

    Returns
    -------
    pandas.DataFrame
        The positive rows, de-duplicated and optionally capped per tree.
    """
    bigdf = _read_property_frames(datapath, propfile, total_props, only_positives=True)
    bigdf = _drop_soft_negatives_and_duplicates(bigdf)

    if not sample_prop:
        return bigdf

    finaldf = _cap_positives_per_tree(bigdf, total_props, max_triplets, rand_seed)

    _report_final(finaldf)
    return finaldf


# Prepare data: train/dev/test files

### Train

In [15]:
# Build the training frame with load_dataset()'s default path and file template;
# the tables shown below this cell are printed/displayed by that call.
bigdf =load_dataset() # train

# of tripets per property after removing soft-negatives


[[0, 0, 0, 0],
 [6397, 1714, 9451, 9307],
 [31116, 3770, 15648, 19468],
 [39505, 8177, 35507, 38153],
 [28912, 2638, 9322, 12206],
 [31116, 3770, 15648, 19468],
 [39505, 8177, 35507, 38153],
 [28912, 2638, 9322, 12206],
 [159848, 5909, 24077, 27107],
 [126196, 1974, 6416, 8857],
 [0, 0, 0, 0]]

# of tripets per property after removing identical triplets


[[0, 0, 0, 0],
 [6383, 1714, 9451, 9307],
 [30808, 3770, 15648, 19468],
 [39388, 8177, 35507, 38153],
 [28894, 2638, 9322, 12206],
 [31094, 0, 0, 0],
 [39394, 0, 0, 0],
 [28894, 0, 0, 0],
 [159636, 0, 0, 0],
 [126176, 0, 0, 0],
 [0, 0, 0, 0]]

# valid of triplets per property
Tree ids min - max - total 0 532 533


prop
1     11.975610
2     57.801126
3     73.898687
4     54.210131
5     58.337711
6     73.909944
7     54.210131
8    299.504690
9    236.727955
Name: valid, dtype: float64

Final total triplets per property


[[0, 0, 0, 0],
 [6383, 1714, 9451, 9307],
 [9670, 3770, 15648, 19468],
 [12146, 8177, 35507, 38153],
 [8666, 2638, 9322, 12206],
 [9771, 0, 0, 0],
 [12146, 0, 0, 0],
 [8666, 0, 0, 0],
 [12216, 0, 0, 0],
 [8865, 0, 0, 0],
 [0, 0, 0, 0]]

Total positive triplets 88529 Total neg triplets 165361
Total rows 253890


Unnamed: 0,tree_id,ent_1,ent_2,ent_3,def_1,def_2,def_3,valid,prop
30,0,miniature pinscher,working dog,watchdog,small German version of a Doberman pinscher,any of several breeds of usually large powerfu...,a dog trained to guard property,0,1
31,0,miniature pinscher,seizure-alert dog,watchdog,small German version of a Doberman pinscher,a dog that can alert or assist people with sei...,a dog trained to guard property,-1,1
32,0,miniature pinscher,Sennenhunde,watchdog,small German version of a Doberman pinscher,any of four Swiss breeds,a dog trained to guard property,-1,1
33,0,miniature pinscher,Appenzeller,watchdog,small German version of a Doberman pinscher,a smaller of the four Swiss breeds,a dog trained to guard property,-2,1
34,0,miniature pinscher,Greater Swiss Mountain dog,watchdog,small German version of a Doberman pinscher,the largest of the four Swiss breeds,a dog trained to guard property,-2,1


In [13]:
# Write the training frame to disk as JSON (orient="index"). reset_index() is applied
# first, so the previous index is stored as a regular column in the file.
bigdf.reset_index().to_json(path_or_buf='/users/cost/jlovonme/data/hierarchy_props/train_hardneg_sample_25.json',orient="index")

### Dev

In [17]:
#bigdf_dev.reset_index().to_json(path_or_buf='/users/cost/jlovonme/data/hierarchy_props/dev_hardneg_sample_25.json',orient="index")

In [3]:
# Dev split: positives only, read from the devprop files, with per-tree sampling enabled (sample_prop=True).
bigdf_dev2 = load_only_positives(propfile="hierarchy_props/bansal_with_defs_devprop_{prop}.json",sample_prop=True)

# of tripets per property after removing soft-negatives


[[0, 0, 0, 0],
 [1443, 0, 0, 0],
 [5744, 0, 0, 0],
 [9347, 0, 0, 0],
 [8540, 0, 0, 0],
 [5744, 0, 0, 0],
 [9347, 0, 0, 0],
 [8540, 0, 0, 0],
 [37624, 0, 0, 0],
 [34066, 0, 0, 0],
 [0, 0, 0, 0]]

# of tripets per property after removing identical triplets


[[0, 0, 0, 0],
 [1439, 0, 0, 0],
 [5712, 0, 0, 0],
 [9327, 0, 0, 0],
 [8530, 0, 0, 0],
 [5742, 0, 0, 0],
 [9327, 0, 0, 0],
 [8530, 0, 0, 0],
 [37614, 0, 0, 0],
 [34062, 0, 0, 0],
 [0, 0, 0, 0]]

# valid of triplets per property
Tree ids min - max - total 533 646 114


prop
1     12.622807
2     50.105263
3     81.815789
4     74.824561
5     50.368421
6     81.815789
7     74.824561
8    329.947368
9    298.789474
Name: valid, dtype: float64

Final total triplets per property


[[0, 0, 0, 0],
 [1439, 0, 0, 0],
 [2232, 0, 0, 0],
 [2611, 0, 0, 0],
 [2018, 0, 0, 0],
 [2257, 0, 0, 0],
 [2611, 0, 0, 0],
 [2018, 0, 0, 0],
 [2607, 0, 0, 0],
 [2042, 0, 0, 0],
 [0, 0, 0, 0]]

Total positive triplets 19835 Total neg triplets 0
Total rows 19835


Unnamed: 0,tree_id,ent_1,ent_2,ent_3,def_1,def_2,def_3,valid,prop
0,533,enuresis,incontinence,elimination,inability to control the flow of urine and inv...,involuntary urination or defecation,the bodily process of discharging waste matter,1,1
1,533,urochesia,defecation,elimination,passage of urine from the anus,the elimination of fecal waste through the anus,the bodily process of discharging waste matter,1,1
2,533,hematochezia,defecation,elimination,passage of stools containing blood (as from di...,the elimination of fecal waste through the anus,the bodily process of discharging waste matter,1,1
3,533,bowel movement,defecation,elimination,a euphemism for defecation,the elimination of fecal waste through the anus,the bodily process of discharging waste matter,1,1
4,533,shit,defecation,elimination,a coarse term for defecation,the elimination of fecal waste through the anus,the bodily process of discharging waste matter,1,1


In [4]:
# Write the dev frame to disk as JSON (orient="index"); reset_index() keeps the previous index as a column.
bigdf_dev2.reset_index().to_json(path_or_buf='/users/cost/jlovonme/data/hierarchy_props/dev_onlypos_sample_25.json',orient="index")

In [6]:
# Test split: positives only, read from the testprop files, without per-tree sampling (sample_prop=False).
bigdf_test2 = load_only_positives(propfile="hierarchy_props/bansal_with_defs_testprop_{prop}.json",sample_prop=False) # This split does NOT change after checking for duplicates

# of tripets per property after removing soft-negatives


[[0, 0, 0, 0],
 [1435, 0, 0, 0],
 [7658, 0, 0, 0],
 [8273, 0, 0, 0],
 [6522, 0, 0, 0],
 [7658, 0, 0, 0],
 [8273, 0, 0, 0],
 [6522, 0, 0, 0],
 [33160, 0, 0, 0],
 [31746, 0, 0, 0],
 [0, 0, 0, 0]]

# of tripets per property after removing identical triplets


[[0, 0, 0, 0],
 [1435, 0, 0, 0],
 [7658, 0, 0, 0],
 [8273, 0, 0, 0],
 [6522, 0, 0, 0],
 [7658, 0, 0, 0],
 [8273, 0, 0, 0],
 [6522, 0, 0, 0],
 [33160, 0, 0, 0],
 [31746, 0, 0, 0],
 [0, 0, 0, 0]]

In [7]:
# Write the test frame to disk as JSON (orient="index"); reset_index() keeps the previous index as a column.
bigdf_test2.reset_index().to_json(path_or_buf='/users/cost/jlovonme/data/hierarchy_props/test_onlypos.json',orient="index")

In [14]:
# Training of the triplet network starts here (check that the loss is correct and that validation selects the best model)

# Evaluators
We consider two types of evaluation to determine whether a model is good:
1. Triplet Evaluator -> built into the SentenceTransformer library
2. Taxonomy Reconstruction Evaluator -> needs to be written

In [4]:
def taxonomy_evaluator():
    """Placeholder for the taxonomy-reconstruction evaluator; does nothing and returns None."""
    return None

In [10]:
# NOTE(review): despite its name, this variable points at the *dev* file here; `testfile` is
# assigned a different path (test_onlypos.json) in a later cell.
testfile = '/users/cost/jlovonme/data/hierarchy_props/bansal_with_defs_dev.json'

In [11]:
# Parse the file named by `testfile` with load_json(), pass the result to pd.read_json
# (orient='index'), and preview the first rows.
df = pd.read_json(load_json(testfile),orient='index')
df.head()

Unnamed: 0,father,child,treeid,split,father_definition,child_definition
10364,elimination,incontinence,533,dev,the bodily process of discharging waste matter,involuntary urination or defecation
10365,elimination,defecation,533,dev,the bodily process of discharging waste matter,the elimination of fecal waste through the anus
10366,elimination,micturition,533,dev,the bodily process of discharging waste matter,the discharge of urine
10367,incontinence,enuresis,533,dev,involuntary urination or defecation,inability to control the flow of urine and inv...
10368,enuresis,bed-wetting,533,dev,inability to control the flow of urine and inv...,enuresis during sleep; especially common in ch...


In [7]:
# Distinct values of the `treeid` column.
tree_ids = set(df["treeid"])
# NOTE(review): `model` and `tokenizer` are not defined in the cells visible here; this line only
# displays them and relies on state created elsewhere -- confirm where they are loaded.
model, tokenizer

In [9]:
# Draft of the taxonomy-reconstruction evaluation: for each tree, build the
# "<term> is defined as <definition>" phrases and collect model representations.
# NOTE(review): `model`, `dataloader`, `batch_to_device` and `device` are not defined
# in the cells visible here -- confirm where they come from before running.
for tree_id in tree_ids:
    subtree = df[df['treeid'] == tree_id]  # rows belonging to this tree

    # Map every term (father or child) to its "term is defined as definition" phrase.
    map_elements = {}
    for idx, row in subtree.iterrows():
        map_elements[row['father']] = row['father'] + " is defined as " + row['father_definition']
        map_elements[row['child']] = row['child'] + " is defined as " + row['child_definition']
    # NOTE(review): `map_elements` is not consumed below; `dataloader` is iterated
    # instead -- presumably it should be built from these phrases (TODO confirm).

    # Obtain representations
    model.eval()  # set evaluation mode
    # One [cls_rep, avg_all_rep] pair per sequence. This has to be a list:
    # the original initialised a dict and then called .append on it, which
    # raises AttributeError.
    model_representations = []

    for batch_num, batch in enumerate(tqdm(dataloader)):
        batch = batch_to_device(batch, device)  # send to device
        b_input_mask = batch['attention_mask']  # to identify non-padding tokens

        with torch.no_grad():
            out_features = model(**batch, output_hidden_states=True)

        # Only the last hidden-state entry is used.
        my_layers = [out_features['hidden_states'][-1]]

        for layer in my_layers:  # each layer is iterated per sequence below
            for phrase, input_mask in zip(layer, b_input_mask):  # iterate batch
                # Keep only the positions whose mask value is non-zero.
                # assumes input_mask is a 1-D tensor aligned with phrase's first dim -- TODO confirm
                sequence_representation = phrase[input_mask.bool()]

                cls_rep = sequence_representation[0, :].detach().cpu()  # first kept token
                # Average of the kept tokens, excluding the first and the last one.
                avg_all_rep = sequence_representation[1:-1, :].mean(dim=0).detach().cpu()
                #avg_word = to implement
                model_representations.append([cls_rep, avg_all_rep])

    # Compute full graph

    # Create golden tree and predicted tree

    # Compute metrics
    print(subtree)
    break  # draft: only the first tree is processed

           father              child  treeid split  \
12646   explosion           airburst     647  test   
12647   explosion           blowback     647  test   
12648   explosion           big bang     647  test   
12649   explosion          inflation     647  test   
12650   explosion              blast     647  test   
12651   explosion           backfire     647  test   
12652   explosion      fragmentation     647  test   
12653    blowback          backblast     647  test   
12654       blast         bomb blast     647  test   
12655  bomb blast  nuclear explosion     647  test   

                                       father_definition  \
12646  a violent release of energy caused by a chemic...   
12647  a violent release of energy caused by a chemic...   
12648  a violent release of energy caused by a chemic...   
12649  a violent release of energy caused by a chemic...   
12650  a violent release of energy caused by a chemic...   
12651  a violent release of energy caused by 

In [5]:
# Paths of the split files; these are the same paths the to_json cells earlier in the notebook write to.
trainfile = '/users/cost/jlovonme/data/hierarchy_props/train_hardneg_sample_25.json'
devfile = '/users/cost/jlovonme/data/hierarchy_props/dev_onlypos_sample_25.json'
testfile = '/users/cost/jlovonme/data/hierarchy_props/test_onlypos.json'

In [11]:
# Load the training file and keep only the rows with valid == 1.
# Note that `df` is rebound here; it no longer refers to the frame loaded in the evaluator cells.
df = pd.read_json(trainfile, orient='index')
df = df[df['valid']==1] # We take only positive, do not know how to handle negatives yet

In [14]:
set(df['prop']) # Check all properties are present

{1, 2, 3, 4, 5, 6, 7, 8, 9}

In [12]:
#df = pd.read_json(trainfile, orient='index')
train_dataset = Dataset.from_pandas(df)
df_dev = pd.read_json(devfile, orient='index')
dev_dataset = Dataset.from_pandas(df_dev)
df_test = pd.read_json(testfile, orient='index')
test_dataset = Dataset.from_pandas(df_test)

In [13]:
train_dataset, dev_dataset, test_dataset

Dataset({
    features: ['index', 'tree_id', 'ent_1', 'ent_2', 'ent_3', 'def_1', 'def_2', 'def_3', 'valid', 'prop', '__index_level_0__'],
    num_rows: 88473
})

In [18]:
# Evaluate intersection vocabulary
train_words = set(df['ent_1']).union(set(df['ent_2'])).union(set(df['ent_3']))
dev_words = set(df_dev['ent_1']).union(set(df_dev['ent_2'])).union(set(df_dev['ent_3']))
test_words = set(df_test['ent_1']).union(set(df_test['ent_2'])).union(set(df_test['ent_3']))

### Words intersection in different splits

In [19]:
len(train_words), len(dev_words),len(train_words.intersection(dev_words)) # Vocabulary (entities) intersection

(10349, 2363, 227)

In [20]:
len(test_words), len(train_words.intersection(test_words)) , len(dev_words.intersection(test_words))

(2309, 201, 52)