In [2]:
import pandas as pd
import os
import networkx as nx
import random
import json

input_ctpfile = "datasets/definitions_bansal.json"
input_bansalfile = "datasets/bansal14_trees.csv"

output_mixfiles = [
    "properties/bansal_with_defs_train.json",
    "properties/bansal_with_defs_dev.json",
    "properties/bansal_with_defs_test.json",
]


# Handlers JSON
def load_json(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)
    return data

def save_json(myjson, filepath):
    jstring = json.dumps(myjson)
    jfile = open(filepath, "w")
    jfile.write(jstring)
    jfile.close()


# Align subtaxonomies with definition, to extend initial file
def create_mix_bansal_ctp(bansalfile, ctpfile, outputfiles, splits=[], datapath_base='.'):
    """
    Align samples subtaxonomies from Bansal dataset with the oracle definitions given in the CTP paper.
    Input: Bansal file, CTP file (bansal with definitions), splits to use: normally train/dev/test, datapath_base: common data directory
    Output: None. Save json (from dataframes) files for each split in the outputfiles paths.
    """
    bansal_df = pd.read_csv(bansalfile, names=["father",'child', 'treeid', 'split'])
    definitions_json = load_json(ctpfile)

    dictionary_words = {}
    for treeid,defs in definitions_json.items():
        for term,defi in defs.items():
            tmp_key = (term,int(treeid))
            defs = dictionary_words.get(tmp_key,set())
            defs.add(defi)
            dictionary_words[tmp_key] = defs

    print("Check if multiple definitions are present.")
    cnt = 0
    for w,defs in dictionary_words.items():
        if len(defs) > 1:
            print(w, defs)
            cnt += 1
    print("Total words", len(dictionary_words), ". Words with more than 1 definition ", cnt)

    # Creates separate files for train/dev/test with definitions
    for split, outputfile in zip(splits, outputfiles):
        filtered_bansal_df = bansal_df[bansal_df["split"]==split] # Filter bansal split
        # Some stats about the json file
        print("########   ", split, " ###########")
        # use term definition as oracle glosses (from wordnet)
        print("Number of entries: ", len(filtered_bansal_df) )

        filtered_bansal_df["father"] = filtered_bansal_df["father"].apply(lambda x: x.replace("_$_", " "))
        filtered_bansal_df["child"] = filtered_bansal_df["child"].apply(lambda x: x.replace("_$_", " "))

        #print(display(filtered_bansal_df.head()))
        father_defs = []
        child_defs = []
        # Mix datasets
        for index, row in filtered_bansal_df.iterrows():
            key = (row["father"], int(row["treeid"]) )
            defs = dictionary_words.get(key, None)
            if defs is None:
                print("Not found", key)
            else:
                father_defs.append(list(defs)[0])
            key = (row["child"], int(row["treeid"]))
            defs = dictionary_words.get(key, None)
            if defs is None:
                print("Not found", key)
            else:
                child_defs.append(list(defs)[0])
        # Add new columns
        filtered_bansal_df["father_definition"] = father_defs
        filtered_bansal_df["child_definition"] = child_defs
        print(filtered_bansal_df.head()) # Print a bit so we now it is ok :)
        # Save json
        filtered_bansal_df.to_json(outputfile,orient="index")



# Only run once, aligns words with definitions from subtrees
# 3 datasets on English (train,dev,test)
#_context are extracted from Wikipedia and others. And reorder according to relevance based on Glove embeddings similarity on CTP
#_definition are the ones provided in WordNet
#create_mix_bansal_ctp(input_bansalfile, input_ctpfile, output_mixfiles, splits=["train", "dev", "test"])

Check if multiple definitions are present.
Total words 15634 . Words with more than 1 definition  0
########    train  ###########
Number of entries:  10364


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bansal_df["father"] = filtered_bansal_df["father"].apply(lambda x: x.replace("_$_", " "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bansal_df["child"] = filtered_bansal_df["child"].apply(lambda x: x.replace("_$_", " "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_

        father              child  treeid  split  \
0  working dog           watchdog       0  train   
1  working dog  seizure-alert dog       0  train   
2  working dog        Sennenhunde       0  train   
3  working dog         Eskimo dog       0  train   
4  working dog         Great Dane       0  train   

                                   father_definition  \
0  any of several breeds of usually large powerfu...   
1  any of several breeds of usually large powerfu...   
2  any of several breeds of usually large powerfu...   
3  any of several breeds of usually large powerfu...   
4  any of several breeds of usually large powerfu...   

                                    child_definition  
0                    a dog trained to guard property  
1  a dog that can alert or assist people with sei...  
2                           any of four Swiss breeds  
3              breed of heavy-coated Arctic sled dog  
4     very large powerful smooth-coated breed of dog  
########    dev  ###

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bansal_df["father"] = filtered_bansal_df["father"].apply(lambda x: x.replace("_$_", " "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bansal_df["child"] = filtered_bansal_df["child"].apply(lambda x: x.replace("_$_", " "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_

# Generate Properties
With the generated files construct trees based on Bansal subtaxonomies, and sample all the possible patterns found in a full tree of height=3.

Adapted for the WordNet dataset

In [19]:
import sys
import os
from multiprocessing import Pool
import math
import subprocess

import pandas as pd
import networkx as nx
import random
import json

def get_trees_from_jsondf(inputfile, ):
    df = pd.read_json(inputfile, orient="index")
    # Iterate by subtrees, create directed graph + collect definitions
    subtrees = {} # key:value -> (id_tree): {definitions:{}, graph:}
    for tid in range(min(df["treeid"]), max(df["treeid"])+1): # For each subtree
        filtered_df = df[df["treeid"]==tid]
        # Retrieve definitions in this subtree
        dictionary_defs, G = {}, nx.DiGraph()
        for idx,row in filtered_df.iterrows():
            dictionary_defs[row['father']] = row['father_definition']
            dictionary_defs[row['child']] = row['child_definition']
            G.add_edge(row['father'], row['child'])

        subtrees[tid] = {'definitions':dictionary_defs, 'graph':G}
        #print("Root", [n for n,d in G.in_degree() if d==0] )
    return subtrees



def generate_hard_negatives(node_left, node_right, rel_root, G):
    # Negative triplet (change intermediate (child_node) entity)
    negative_triplets = []
    for node_x in G.predecessors(rel_root): # fathers of A
        negative_triplets.append((node_left, node_x, node_right,-1))
        for node_y in G.successors(node_x): 
            if node_y == rel_root:
                continue
            # all descendants not in subtree
            negative_triplets.append((node_left, node_y, node_right,-2))
            # This last level could be removed (are more difficult?)
            for node_z in nx.descendants(G,node_y):
                negative_triplets.append((node_left,node_z,node_right,-3))
    return negative_triplets



def generate_soft_negatives(node_left, node_right,tree_id, subtrees):
    # Return negatives out of the current subtree
    all_tids = subtrees.keys()
    list_names = []
    negative_triplets = []
    random_tids = random.sample(list(all_tids),15)
    for r_tid in random_tids:
        if r_tid==tree_id:
            continue
        node_names = [(k,v) for k,v in subtrees[r_tid]['definitions'].items()]
        list_names.extend(random.sample(list(node_names),5))
    #if len(list_names)==0:
    #    print(node_left, node_right, random_tids)
    final_list = random.sample(list(list_names),10) # Generate max 10 randoms
    for (name,defs) in final_list:
        negative_triplets.append((node_left,name,node_right,-4,defs))
    return negative_triplets


def generate_soft_positives(node_left, node_right,tree_id, subtrees, minsample=5):
    all_tids = subtrees.keys()
    list_names = []
    negative_triplets = []
    random_tids = random.sample(list(all_tids),min(minsample,len(all_tids))) # before , 1st run was minsample=15
    for r_tid in random_tids:
        if r_tid==tree_id:
            continue
        node_names = [(k,v) for k,v in subtrees[r_tid]['definitions'].items()]
        list_names.extend(random.sample(list(node_names),min(minsample,len(node_names))))
    #if len(list_names)==0:
    #    print(node_left, node_right, random_tids)
    final_list = random.sample(list(list_names),min(minsample,len(list_names))) # Generate max 10 randoms
    for (name,defs) in final_list:
        negative_triplets.append((node_left,node_right,name,0,defs))
    return negative_triplets


def generate_triplets(property_generator):
    
    def wrapper(subtrees):
        only_soff_pos = True
        # generate properties from the graph
        all_triplets = [] # treeid, e1,e2,e3, def_1,def_2,def_3,valid,property
        for tree_id, info_tree in subtrees.items(): # for each subtree
            G = info_tree['graph']
            definitions = info_tree['definitions']
            positive_triplets = list()
            negative_triplets = list()
            soft_negatives_triplets = list()
            soft_positive_triplets = list()
            #print(G.nodes())
            # Iterator of triplets, it returns 3 entities + 1 the grand ancestor
            # We use the grand ancestor (node_anc) to draw entities out of the mini subtree
            iter_prop = property_generator(G)
            for (node_d,node_b,node_a, node_anc) in iter_prop:
                positive_triplets.append((node_d, node_b, node_a,1)) # Tag 1 for hard negative
                if not only_soff_pos:
                    hard_negatives = generate_hard_negatives(node_d,node_a, node_anc, G)
                    negative_triplets.extend(hard_negatives) # tags -1,-2,-3 for different positions from negatives
                    soft_negatives = generate_soft_negatives(node_d, node_a,tree_id, subtrees) #tag -4
                    soft_negatives_triplets.extend(soft_negatives)

                soft_positives = generate_soft_positives(node_d, node_b,tree_id, subtrees) #tag 0, positive triplets with 3rd entity easier
                soft_positive_triplets.extend(soft_positives)
                soft_positives = generate_soft_positives(node_d, node_a,tree_id, subtrees) #tag 0, positive triplets with 3rd entity easier
                soft_positive_triplets.extend(soft_positives)

            # Format for dataframe
            for triplet in positive_triplets:
                all_triplets.append([tree_id, triplet[0],triplet[1],triplet[2],
                                    definitions[triplet[0]],
                                    definitions[triplet[1]],
                                    definitions[triplet[2]],
                                    triplet[3]])
            if not only_soff_pos:
                for triplet in negative_triplets:
                    all_triplets.append([tree_id, triplet[0],triplet[1],triplet[2],
                                        definitions[triplet[0]],
                                        definitions[triplet[1]],
                                        definitions[triplet[2]],
                                        triplet[3]])
                for triplet in soft_negatives_triplets:
                    all_triplets.append([tree_id, triplet[0],triplet[1],triplet[2],
                                        definitions[triplet[0]],
                                        triplet[4],
                                        definitions[triplet[2]],
                                        triplet[3]])

            for triplet in soft_positive_triplets:
                all_triplets.append([tree_id, triplet[0],triplet[1],triplet[2],
                                    definitions[triplet[0]],
                                    definitions[triplet[1]],
                                    triplet[4],
                                    triplet[3]])

        dataframe_prop = pd.DataFrame(all_triplets,columns=['tree_id', 'ent_1','ent_2','ent_3','def_1','def_2','def_3','valid'])
        dataframe_prop.drop_duplicates(inplace=True)
        for i in [1,0,-1,-2,-3,-4]:
            print(i, len(dataframe_prop[dataframe_prop['valid']==i]))
            #(6397, 20472)
        return dataframe_prop
    return wrapper



@generate_triplets
def it_property1(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                yield (node_d,node_b,node_a, node_a)
                
@generate_triplets
def it_property2(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_e in G.successors(node_b):
                    if node_e == node_d:
                        continue
                    yield (node_d,node_b,node_e, node_a)
                    
@generate_triplets
def it_property3(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_c in G.successors(node_a):
                    if node_b==node_c:
                        continue
                    yield (node_d, node_b, node_c,node_a)
                    

@generate_triplets
def it_property4(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_c in G.successors(node_a):
                    if node_b==node_c:
                        continue
                    for node_f in G.successors(node_c):
                        yield (node_d, node_b, node_f,node_a)
                    

@generate_triplets
def it_property5(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_e in G.successors(node_b):
                    if node_d == node_e:
                        continue
                    yield (node_d, node_a, node_e,node_a)
                    
@generate_triplets
def it_property6(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_c in G.successors(node_a):
                if node_b == node_c:
                    continue
                for node_d in G.successors(node_b):
                    yield (node_d, node_a, node_c,node_a)
                    

@generate_triplets
def it_property7(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_c in G.successors(node_a):
                    if node_b == node_c:
                        continue
                    for node_f in G.successors(node_c):
                        yield (node_d, node_a, node_f,node_a)
                    
@generate_triplets
def it_property8(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_e in G.successors(node_b):
                    if node_d == node_e:
                        continue
                    for node_c in G.successors(node_a):
                        if node_c == node_b:
                            continue
                        
                        yield (node_d, node_e, node_c,node_a)

@generate_triplets
def it_property9(G):
    for node_a in G.nodes():
        for node_b in G.successors(node_a):
            for node_d in G.successors(node_b):
                for node_e in G.successors(node_b):
                    if node_d == node_e:
                        continue
                    for node_c in G.successors(node_a):
                        if node_c == node_b:
                            continue
                        for node_f in G.successors(node_c):
                            yield (node_d, node_e, node_f,node_a)



In [26]:
# Parallel generation of properties

output_folder='properties'
output_mixfiles = [
    "properties/bansal_with_defs_train.json",
    "properties/bansal_with_defs_dev.json",
    "properties/bansal_with_defs_test.json",
]

if not os.path.exists(output_folder):
    os.makedirs(output_folder)



def run_proc(idx, ifile,_mysubtrees):
    all_properties = [it_property1,it_property2,it_property3,it_property4,it_property5,it_property6,it_property7,it_property8,it_property9]
    all_idxs = list(range(9))
    #all_properties = [it_property1,it_property2,it_property3,it_property5,it_property6,it_property8,it_property9]
    #all_idxs = [0,1,2,4,5,7,8]
    prop=all_properties[idx]
    myidx = all_idxs[idx]
    ofilename = ifile.replace(".json", "_prop_"+str(myidx)+".json")
    newdf = prop(_mysubtrees)
    newdf.to_json(ofilename, orient="index")


def main_properties_generator():
    for ifile in output_mixfiles:
        print(ifile)
        _mysubtrees = get_trees_from_jsondf(ifile)
        #run_proc(0,ifile,_mysubtrees)
        n = 9 # 9 properties
        p = Pool(n)
        for i in range(n):
            p.apply_async(run_proc, args=(i, ifile,_mysubtrees))
        p.close()
        p.join()
        


main_properties_generator()


properties/bansal_with_defs_train.json
1 6397
0 63970
-1 0
-2 0
-3 0
-4 0
1 28912
0 288751
-1 0
-2 0
-3 0
-4 0
1 28912
0 288724
-1 0
-2 0
-3 0
-4 0
1 31116
0 310707
-1 0
-2 0
-3 0
-4 0
1 31116
0 310714
-1 0
-2 0
-3 0
-4 0
1 39505
0 394507
-1 0
-2 0
-3 0
-4 0
1 39505
0 394465
-1 0
-2 0
-3 0
-4 0
1 126196
0 1258866
-1 0
-2 0
-3 0
-4 0
1 159848
0 1594615
-1 0
-2 0
-3 0
-4 0
properties/bansal_with_defs_dev.json
1 1443
0 14430
-1 0
-2 0
-3 0
-4 0
1 5744
0 57216
-1 0
-2 0
-3 0
-4 0
1 5744
0 57214
-1 0
-2 0
-3 0
-4 0
1 8540
0 84785
-1 0
-2 0
-3 0
-4 0
1 8540
0 84762
1-1  93470

-2 0
-3 0
-4 00
 92941
-1 0
-2 0
-3 0
-4 0
1 9347
0 92933
-1 0
-2 0
-3 0
-4 0
1 34066
0 337169
-1 0
-2 0
-3 0
-4 0
1 37624
0 372397
-1 0
-2 0
-3 0
-4 0
properties/bansal_with_defs_test.json
1 1435
0 14350
-1 0
-2 0
-3 0
-4 0
1 6522
0 64826
-1 0
-2 0
-3 0
-4 0
1 6522
0 64869
-1 0
-2 0
-3 0
-4 0
1 7658
0 76108
-1 0
-2 0
-3 0
-4 0
1 8273
0 82248
-1 0
-2 0
-3 0
-4 0
1 7658
0 76072
-1 0
-2 0
-3 0
-4 0
1 8273
0 82275
-1 0
-2

In [27]:
# To sample up to 8x negatives
# mydf = sample_mydf_v2("bansal_with_defs_trainprop_{prop}.json", frac1=6, frac2=0.65, frac3=1,frac_pos=5, soft=True)
def sample_mydf_v2(filestr, frac1=5, frac2=0.65, frac3=1, frac_pos=30,soft=False):
    sampled_df = []
    for prop in [0,1,2,4,5,7,8]:
        print("Property", prop+1)
        _propfile = filestr.format(prop=str(prop))
        df = pd.read_json(_propfile, orient="index") # complete dataframe

        hard_pos_df = df[df['valid']==1] # Filter hard positives

        if prop ==0: # If P1 (0) use everything, it's the smallest!
            hard_pos_df['prop'] = prop+1
            sampled_df.append(hard_pos_df)
        else:
            # Random sample fixing 2 entities, the if is because there are more triplets for prop 8,9
            if prop<7:
                sdf = hard_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac1,random_state=42, replace=True)
            else:
                sdf = hard_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac3,random_state=42, replace=True)
                sdf = sdf.sample(frac=frac2, random_state=42)
            sdf['prop'] = prop+1 # Add new column to identify property
            sampled_df.append(sdf)
        print("Total hard triplets",len(sampled_df[-1]))
        del hard_pos_df
        if soft: # sample soft positives too
            soft_pos_df = df[df['valid']==0]# Filter soft positives
            soft_pos_df['prop'] = prop+1 # Add new column to identify property
            new_soft = soft_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac_pos,random_state=42, replace=True)

            sample_size = 1
            if prop==0:
                sample_size = 10 #80 for x30
            if prop==2 or prop==5:
                sample_size = 1#5 for x30
            new_soft = new_soft.sample(n=min(len(new_soft),len(sampled_df[-1])*sample_size),random_state=42)
            new_soft['prop'] = prop+1
            sampled_df.append(new_soft)
            print("Total soft triplets",len(sampled_df[-1]))
            del new_soft
        del df
    final_df = pd.concat(sampled_df)
    print(len(final_df))
    final_df.drop_duplicates(['tree_id','ent_1','ent_2','ent_3'], inplace=True)
    print("Final ",len(final_df))
    return final_df

mydf = sample_mydf_v2("properties/bansal_with_defs_train_prop_{prop}.json", frac1=6, frac2=0.65, frac3=1,frac_pos=5, soft=True)
for prop in range(9):
    print(prop+1, len(mydf[mydf['prop']==(prop+1)]))

Property 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total hard triplets 6397
Total soft triplets 63970
Property 2
Total hard triplets 32688


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 32688
Property 3
Total hard triplets 36156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 36156
Property 5
Total hard triplets 32688


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 32688
Property 6
Total hard triplets 36156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 36156
Property 8
Total hard triplets 17909


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 17909
Property 9
Total hard triplets 13124


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soft_pos_df['prop'] = prop+1 # Add new column to identify property


Total soft triplets 13124
407809
Final  315920
1 49623
2 47411
3 54858
4 0
5 47431
6 54838
7 0
8 35638
9 26121


In [28]:
# Remove prop 5 since we are not sure
new_df = mydf[mydf['prop']!=5]
new_df.reset_index().to_json( "properties/train_hard_soft_neg_sample_5x_p123689.json", orient='index')

In [29]:
# Train
# Consider only uncle (too many cases if we add nephew too!, that's why 4 and 7 are 0)
ndf = new_df[new_df['valid']==0]
for i in range(10):
    print(i, len(ndf[ndf['prop']==i]))

0 0
1 43226
2 30618
3 34148
4 0
5 0
6 34131
7 0
8 17729
9 12998


In [30]:
# Make only hard positive test file
test_df = []
_filestr = "properties/bansal_with_defs_test_prop_{prop}.json"
for prop in range(9):
    _propfile = _filestr.format(prop=str(prop))
    df = pd.read_json(_propfile, orient="index") # complete dataframe
    hard_pos = df[df['valid']==1] # Filter hard positives
    hard_pos['prop'] = prop+1 # Add new column to identify property
    test_df.append(hard_pos)
final_df = pd.concat(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos['prop'] = prop+1 # Add new column to identify property
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos['prop'] = prop+1 # Add new column to identify property
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos['prop'] = prop+1 # Add new column to identify property
A value is tr

In [31]:
for prop in range(9):
    print(prop+1, len(final_df[final_df['prop']==(prop+1)]))
final_df.reset_index().to_json("properties/test_hard_neg.json", orient='index')

1 1435
2 7658
3 8273
4 6522
5 7658
6 8273
7 6522
8 33160
9 31746


In [37]:
# Dev

def sample_mydf(filestr, frac1=5, frac2=0.65, frac3=1, soft=False):
    sampled_df = []
    for prop in range(9):
        print("Property", prop+1)
        _propfile = filestr.format(prop=str(prop))
        df = pd.read_json(_propfile, orient="index") # complete dataframe

        hard_pos_df = df[df['valid']==1] # Filter hard positives
        hard_pos_df['prop'] = prop+1 # Add new column to identify property

        if prop ==0: # If P1 (0) use everything, it's the smallest!
            sampled_df.append(hard_pos_df)
        else:
            # Random sample fixing 2 entities, the if is because there are more triplets for prop 8,9
            if prop<7:
                sdf = hard_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac1,random_state=42, replace=True)
            else:
                sdf = hard_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac3,random_state=42, replace=True)
                sdf = sdf.sample(frac=frac2, random_state=42)
            sampled_df.append(sdf)
        if soft: # sample soft positives too
            soft_pos_df = df[df['valid']==0]# Filter soft positives
            soft_pos_df['prop'] = prop+1 # Add new column to identify property
            new_soft = soft_pos_df.groupby(['tree_id','ent_1','ent_2']).sample(n=frac1,random_state=42, replace=True)
            sample_size = 4
            if prop==0:
                sample_size = 6#10
            new_soft = new_soft.sample(n=len(sampled_df[-1])*sample_size,random_state=42)
            sampled_df.append(new_soft)
    final_df = pd.concat(sampled_df)
    print(len(final_df))
    final_df.drop_duplicates(['tree_id','ent_1','ent_2','ent_3'], inplace=True)
    print(len(final_df))
    return final_df

# Dev
mydf = sample_mydf("properties/bansal_with_defs_dev_prop_{prop}.json", frac1=2, frac2=0.4)
for prop in range(9):
    print(prop+1, len(mydf[mydf['prop']==(prop+1)]))


    
mydf.reset_index().to_json( "properties/dev_hard_neg.json", orient='index')


Property 1
Property 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


Property 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_pos_df['prop'] = prop+1 # Add new column to identify property


20579
18291
1 1443
2 2016
3 2401
4 1951
5 2016
6 2401
7 1951
8 2208
9 1904


In [38]:
#Stats
for prop in range(9):
    print(prop+1, len(mydf[mydf['prop']==(prop+1)]))
#final_df.reset_index().to_json("properties/dev_hard_neg.json", orient='index')

1 1443
2 2016
3 2401
4 1951
5 2016
6 2401
7 1951
8 2208
9 1904


In [2]:
# Files to use
# Train: hierarchy_evaluation2023/data_generator/properties/train_hard_soft_neg_sample_5x_p123689.json -> train_hard_soft_neg.json
# Dev: hierarchy_evaluation2023/data_generator/properties/dev_hard_neg.json
# Test: hierarchy_evaluation2023/data_generator/properties/test_hard_neg.json
!mv ~/hierarchy_evaluation2023/data_generator/properties/train_hard_soft_neg_sample_5x_p123689.json ~/hierarchy_evaluation2023/data_generator/properties/train_hard_soft_neg.json