In [2]:
import pandas as pd

In [3]:
def remove_symmetric_triples(original):
  """
  Remove every triple whose mirrored counterpart is also in the dataframe.

  A triple (h, r, t) is dropped when (t, r, h) occurs as well, so both
  members of a symmetric pair disappear. A self-loop (h == t) is its own
  mirror and is therefore removed too.

  Input:
    dataframe with columns head, relation, tail

  Output:
    dataframe with columns head, relation, tail holding the remaining
    triples in their original order. The index is the row position of each
    triple in the input (the input's own index labels are not carried over).

  Example:
    df_some_sym = pd.DataFrame({'head' : ['e1', 'e3', 'e1', 'e2'],
                                'relation':['r1','r2', 'r3', 'r1'],
                                'tail': ['e2', 'e4', 'e5', 'e1']})

    df_cleaned = remove_symmetric_triples(df_some_sym)

    print(df_some_sym)
    print('\\n')
    print(df_cleaned)

  Example output:
      head relation tail
    0   e1       r1   e2
    1   e3       r2   e4
    2   e1       r3   e5
    3   e2       r1   e1

      head relation tail
    1   e3       r2   e4
    2   e1       r3   e5

  """
  triple_columns = ['head', 'relation', 'tail']

  # Mirror image of every triple: head and tail swapped. De-duplicating keeps
  # the merge one-to-at-most-one, so the result has exactly one row per input row.
  inverse = (original.rename(columns={'head': 'tail', 'tail': 'head'})
             [triple_columns]
             .drop_duplicates())

  # A left merge keeps the input row order on every pandas version; an outer
  # merge may sort the keys and would also carry mirror-only rows we never use.
  merged = pd.merge(original, inverse, how='left', on=triple_columns,
                    indicator='Exists')
  clean = merged.loc[merged['Exists'] == 'left_only'].drop('Exists', axis=1)
  return clean


In [4]:
def add_symmetric_triples(original):
  '''
  Add the symmetric counterpart of every triple in a dataframe.

  For each triple (h, r, t) the mirrored triple (t, r, h) is appended below
  the original rows; rows that are then exact duplicates of an earlier row
  are dropped.

  Input:
    dataframe with columns head, relation, tail

  Output:
    dataframe with columns head, relation, tail. The rows are numbered
    0..2n-1 before duplicates are dropped, so the index can have gaps.

  Example:
    df_some_sym = pd.DataFrame({'head' : ['e1', 'e3', 'e1', 'e2'],
                                  'relation':['r1','r2', 'r3', 'r1'],
                                  'tail': ['e2', 'e4', 'e5', 'e1']})

    full_df = add_symmetric_triples(df_some_sym)

    print(df_some_sym)
    print('\\n')
    print(full_df)

  Example output:
      head relation tail
    0   e1       r1   e2
    1   e3       r2   e4
    2   e1       r3   e5
    3   e2       r1   e1


      head relation tail
    0   e1       r1   e2
    1   e3       r2   e4
    2   e1       r3   e5
    3   e2       r1   e1
    5   e4       r2   e3
    6   e5       r3   e1

  '''
  # Mirror image of every triple: head and tail swapped, relation unchanged.
  inverse = (original.rename(columns={'head': 'tail', 'tail': 'head'})
             [['head', 'relation', 'tail']])

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  combined = pd.concat([original, inverse]).reset_index(drop=True).drop_duplicates()
  return combined


In [5]:
def split_triples(df, predicates):
  '''
  Split a dataframe of triples into separate dataframes based on relation.

  Input:
    df - a dataframe of triples where the column holding predicates is named 'relation'
    predicates - a list of n lists of predicates

  Output:
    a list of n dataframes, where the i-th dataframe contains the triples whose
    relation is in the i-th predicate list. Rows are grouped in the order the
    predicates are listed and renumbered from 0. A predicate list that matches
    nothing gives an empty dataframe with the same columns as df.

  Example:
    df_some_sym = pd.DataFrame({'head' : ['e1', 'e3', 'e1', 'e2'],
                                  'relation':['r1','r2', 'r3', 'r1'],
                                  'tail': ['e2', 'e4', 'e5', 'e1']})

    framelist = split_triples(df_some_sym, [['r1', 'r2'], ['r3']])

    print(df_some_sym)
    for df in framelist:
      print('\\n')
      print(df)

  Example output:
      head relation tail
    0   e1       r1   e2
    1   e3       r2   e4
    2   e1       r3   e5
    3   e2       r1   e1


      head relation tail
    0   e1       r1   e2
    1   e2       r1   e1
    2   e3       r2   e4


      head relation tail
    0   e1       r3   e5
  '''

  split_dfs = []

  for predicate_set in predicates:
    # One slice per predicate, in the order given, so rows stay grouped by predicate.
    parts = [df.loc[df['relation'] == predicate] for predicate in predicate_set]
    matched = [part for part in parts if not part.empty]

    if matched:
      # Single concat per group; DataFrame.append was removed in pandas 2.0
      # and re-copied the growing frame on every iteration.
      new_df = pd.concat(matched).reset_index(drop=True)
    else:
      # Nothing matched (or the predicate list is empty): keep the schema.
      new_df = df.iloc[0:0].reset_index(drop=True)
    split_dfs.append(new_df)

  return split_dfs

In [6]:
  def count_sym(original):
    '''
    Count the triples in a dataframe that have a symmetric counterpart in the
    same dataframe, i.e. rows (h, r, t) for which (t, r, h) is also present.

    Input:
      dataframe with columns head, relation, tail

    Output:
      integer giving the number of such rows. Every input row is counted at
      most once, even if its counterpart occurs several times. The result is
      even when the dataframe holds no duplicate rows and no self-loops
      (head == tail); a self-loop is its own counterpart and adds 1.

    Example:
      df_some_sym = pd.DataFrame({'head' : ['e1', 'e3', 'e1', 'e2'],
                              'relation':['r1','r2', 'r3', 'r1'],
                              'tail': ['e2', 'e4', 'e5', 'e1']})

      count_sym(df_some_sym)

    Example output:
      2
    '''
    triple_columns = ['head', 'relation', 'tail']

    # Mirror image of every triple. De-duplicating prevents a many-to-many
    # merge, which would count a row once per copy of its counterpart.
    inverse = (original.rename(columns={'head': 'tail', 'tail': 'head'})
               [triple_columns]
               .drop_duplicates())

    # Left merge: exactly one result row per input row, flagged 'both' when
    # the mirrored triple exists.
    merged = pd.merge(original, inverse, how='left', on=triple_columns,
                      indicator='Exists')
    symmetric_tuples = int((merged['Exists'] == 'both').sum())
    return symmetric_tuples

In [7]:
def get_sym_antisym_pred(df):
    '''
    Get the symmetric and asymmetric predicates, i.e. those that are more
    commonly used symmetrically and those that are not.

    A predicate is classed as symmetric when more than half of its triples
    have a mirrored counterpart (same relation, head and tail swapped) among
    the triples of that predicate; otherwise it is classed as anti symmetric.

    Input:
        dataframe with columns head, relation, tail

    Output:
        a tuple of two lists of predicates: (symmetric, anti symmetric), each
        in order of first appearance in df. Rows with a missing relation are
        ignored.
    '''

    # One single-element predicate list per relation, so split_triples returns
    # one dataframe per relation. NaN never equals itself, so a missing
    # relation would produce an empty group; drop it up front.
    single_preds = [[predicate] for predicate in df['relation'].dropna().unique()]

    singles = split_triples(df, single_preds)
    sym_predicates = []
    anti_predicates = []

    for item in singles:
        # .iloc avoids relying on index labels / positional fallback.
        name = item['relation'].iloc[0]
        syms = count_sym(item)
        # All triples of this predicate, matching what count_sym iterates over.
        total = len(item)

        if syms > total / 2:
            sym_predicates.append(name)
        else:
            anti_predicates.append(name)

    return (sym_predicates, anti_predicates)

In [8]:
import os

def split_file(filename, sym_predicates, anti_predicates):
    '''
    Write the triples of a tsv file into two new tsv files, one per predicate list.

    Input:
        filename - path to a header-less, tab-separated file of head / relation / tail
                   triples; resolved against the current working directory
        sym_predicates - predicates whose triples go into the symmetric file
        anti_predicates - predicates whose triples go into the anti symmetric file

    Output:
        nothing is returned; two files named sym_<name> and asym_<name> are written
        next to the original file, tab-separated, without header or index column
    '''

    source_path = os.path.join(os.getcwd(), filename)
    triples = pd.read_csv(source_path, header=None, sep='\t', names=['head', 'relation', 'tail'])

    # Folder and bare file name of the input; the outputs go into the same folder.
    folder, basename = os.path.split(source_path)

    sym_df, asym_df = split_triples(triples, [sym_predicates, anti_predicates])

    for prefix, frame in (("sym_", sym_df), ("asym_", asym_df)):
        target = os.path.join(folder, prefix + basename)
        frame.to_csv(target, index=False, header=None, sep='\t')

# Usage examples

In [9]:
# Toy dataset: rows 0 and 3 are mirror images of each other under r1.
df_some_sym = pd.DataFrame(
    [('e1', 'r1', 'e2'),
     ('e3', 'r2', 'e4'),
     ('e1', 'r3', 'e5'),
     ('e2', 'r1', 'e1')],
    columns=['head', 'relation', 'tail'],
)

In [10]:
# Rows 0 and 3 mirror each other, so two triples have a symmetric counterpart.
count_sym(df_some_sym)

2

In [11]:
# Split the toy dataset into two frames: relations r1/r2 together, r3 on its own.
framelist = split_triples(df_some_sym, [['r1', 'r2'], ['r3']])

# Show the input first, then each resulting frame separated by blank lines.
print(df_some_sym) 
for df in framelist:
    print('\n')
    print(df)
    

  head relation tail
0   e1       r1   e2
1   e3       r2   e4
2   e1       r3   e5
3   e2       r1   e1


  head relation tail
0   e1       r1   e2
1   e2       r1   e1
2   e3       r2   e4


  head relation tail
0   e1       r3   e5


In [12]:
# Add the mirrored triple for every row of the toy dataset.
full_df = add_symmetric_triples(df_some_sym)

# Input first, then the augmented frame, for side-by-side comparison.
print(df_some_sym) 
print('\n')
print(full_df)

  head relation tail
0   e1       r1   e2
1   e3       r2   e4
2   e1       r3   e5
3   e2       r1   e1


  head relation tail
0   e1       r1   e2
1   e3       r2   e4
2   e1       r3   e5
3   e2       r1   e1
5   e4       r2   e3
6   e5       r3   e1


In [13]:
# Drop every triple of the toy dataset whose mirrored counterpart is present.
df_cleaned = remove_symmetric_triples(df_some_sym)

# Input first, then the cleaned frame, for side-by-side comparison.
print(df_some_sym)
print('\n')

print(df_cleaned)

  head relation tail
0   e1       r1   e2
1   e3       r2   e4
2   e1       r3   e5
3   e2       r1   e1


  head relation tail
1   e3       r2   e4
2   e1       r3   e5


# Process existing datasets

In [14]:
# Classify the WN18RR predicates using the training split only.
df = pd.read_csv('data/wn18rr/train_wn18rr.txt', header=None, sep='\t', names =  ['head','relation', 'tail'])
sym_predicates, anti_predicates = get_sym_antisym_pred(df)
print ('symmetric predicates: ' + str(sym_predicates) +'\n')
print ('anti symmetric predicates: ' + str(anti_predicates) + '\n')

# Apply the same two predicate lists to train, test and valid so that all
# three files are split consistently.
split_file('data/wn18rr/train_wn18rr.txt', sym_predicates, anti_predicates)
split_file('data/wn18rr/test_wn18rr.txt', sym_predicates, anti_predicates)
split_file('data/wn18rr/valid_wn18rr.txt', sym_predicates, anti_predicates)

symmetric predicates: ['_derivationally_related_form', '_also_see', '_verb_group', '_similar_to']

anti symmetric predicates: ['_hypernym', '_instance_hypernym', '_member_meronym', '_synset_domain_topic_of', '_has_part', '_member_of_domain_usage', '_member_of_domain_region']



In [16]:
# Classify the FB15k-237 predicates using the training split only.
df = pd.read_csv('data/fb15k-237/train_fb15k-237.txt', header=None, sep='\t', names =  ['head','relation', 'tail'])
sym_predicates, anti_predicates = get_sym_antisym_pred(df)
print ('symmetric predicates: ' + str(sym_predicates) +'\n')
print ('anti symmetric predicates: ' + str(anti_predicates) + '\n')

# Apply the same two predicate lists to train, test and valid so that all
# three files are split consistently.
split_file('data/fb15k-237/train_fb15k-237.txt', sym_predicates, anti_predicates)
split_file('data/fb15k-237/test_fb15k-237.txt', sym_predicates, anti_predicates)
split_file('data/fb15k-237/valid_fb15k-237.txt', sym_predicates, anti_predicates)

symmetric predicates: ['/award/award_winner/awards_won./award/award_honor/award_winner', '/award/award_nominee/award_nominations./award/award_nomination/award_nominee', '/music/performance_role/regular_performances./music/group_membership/role', '/base/popstra/celebrity/friendship./base/popstra/friendship/participant', '/base/popstra/celebrity/breakup./base/popstra/breakup/participant', '/location/hud_county_place/place', '/music/performance_role/guest_performances./music/recording_contribution/performance_role', '/location/location/adjoin_s./location/adjoining_relationship/adjoins', '/government/legislative_session/members./government/government_position_held/legislative_sessions', '/music/performance_role/track_performances./music/track_contribution/role', '/base/popstra/celebrity/dated./base/popstra/dated/participant', '/education/educational_institution_campus/educational_institution', '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for', '/base/sat