In [None]:
import pandas as pd
# Never truncate cell contents when rendering frames (rule strings are long).
pd.options.display.max_colwidth = None

In [None]:
# Load the mined rules from a tab-separated file. Column names are supplied
# explicitly; the placeholder names c2-c4 presumably absorb extra trailing
# fields and are dropped right after the read.
# NOTE(review): in the first row, StandardConfidence (0.001255) equals
# HeadCoverage / BodySize (1 / 797) and PCAConfidence (0.125) equals
# HeadCoverage / PCABodySize (1 / 8). That pattern is what one would expect if
# the column labelled 'HeadCoverage' really holds the absolute support count
# and the column labelled 'Support' holds the head-coverage ratio. Confirm the
# column order against the AMIE output format before trusting these labels.
df = pd.read_csv(
    'amie-FB15k237.txt',
    sep='\t',
    names=['Rule', 'Support', 'StandardConfidence', 'PCAConfidence', 'HeadCoverage', 'BodySize', 'PCABodySize', 'variable', 'c2', 'c3', 'c4'],
).drop(columns=['c2', 'c3', 'c4'])
print(df.head(1))

                                                                                                                                                                                                                      Rule  \
0  ?b  /organization/organization_member/member_of./organization/organization_membership/organization  ?a   => ?a  /government/political_party/politicians_in_this_party./government/political_party_tenure/politician  ?b   

    Support  StandardConfidence  PCAConfidence  HeadCoverage  BodySize  \
0  0.009346            0.001255          0.125             1       797   

   PCABodySize variable  
0            8       ?b  


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize
count,222964.0,222964.0,222964.0,222964.0,222964.0,222964.0
mean,0.031599,0.447096,0.59827,19.95123,117.94056,54.122688
std,0.060128,0.298605,0.295957,147.436139,1167.543515,741.705844
min,6.3e-05,2.2e-05,0.1,1.0,1.0,1.0
25%,0.012262,0.18,0.333333,3.0,10.0,6.0
50%,0.017105,0.413043,0.6,7.0,25.0,16.0
75%,0.028947,0.684211,0.875,14.0,47.0,31.0
max,1.0,1.0,1.0,14032.0,165293.0,89649.0


In [None]:
# Distinct values found in the 'variable' column
pd.unique(df['variable'])

array(['?b', '?a'], dtype=object)

In [None]:
def rule_length(rule):
    """Return the number of whitespace-separated tokens in a rule string.

    The string is split on any run of whitespace, so every token is counted:
    variables, relation paths, constants and the "=>" separator alike.
    """
    return len(rule.split())

In [None]:
# Token count per rule, stored alongside the metrics, then summarised
df['rule_length'] = df['Rule'].map(rule_length)
df['rule_length'].describe()

Unnamed: 0,rule_length
count,222964.0
mean,7.190847
std,0.732203
min,7.0
25%,7.0
50%,7.0
75%,7.0
max,10.0


In [None]:
# Preview rules with more than 7 tokens (7 is the minimum observed length)
df.loc[df['rule_length'].gt(7)].head()

Unnamed: 0,Rule,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,variable,rule_length
757,?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?f ?b /influence/influence_node/influenced_by ?f => ?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?b,0.056075,0.157895,0.6,6,38,10,?b,10
760,?f /people/person/employment_history./business/employment_tenure/company ?a ?b /people/person/spouse_s./people/marriage/spouse ?f => ?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?b,0.009346,0.041667,0.25,1,24,4,?b,10
761,?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?f ?b /people/person/spouse_s./people/marriage/spouse ?f => ?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?b,0.037383,0.4,0.8,4,10,5,?b,10
762,?e /people/person/employment_history./business/employment_tenure/company ?a ?e /people/person/spouse_s./people/marriage/spouse ?b => ?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?b,0.009346,0.04,0.25,1,25,4,?b,10
763,?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?e ?e /people/person/spouse_s./people/marriage/spouse ?b => ?a /government/political_party/politicians_in_this_party./government/political_party_tenure/politician ?b,0.037383,0.4,0.8,4,10,5,?b,10


In [None]:
# How many rules are longer than 7 tokens
int(df['rule_length'].gt(7).sum())

14184

In [None]:
# Distribution of the 'Support' column on its own
df.loc[:, 'Support'].describe()

Unnamed: 0,Support
count,222964.0
mean,0.031599
std,0.060128
min,6.3e-05
25%,0.012262
50%,0.017105
75%,0.028947
max,1.0


In [None]:
# 85th-percentile value of 'Support'; used below as the lower filter bound
percentile_85_support = df['Support'].quantile(q=0.85)
percentile_85_support

0.041189931

In [None]:
# Distribution of the 'StandardConfidence' column on its own
df.loc[:, 'StandardConfidence'].describe()

Unnamed: 0,StandardConfidence
count,222964.0
mean,0.447096
std,0.298605
min,2.2e-05
25%,0.18
50%,0.413043
75%,0.684211
max,1.0


In [None]:
# 85th-percentile value of 'StandardConfidence'; used below as the lower filter bound
percentile_85_confidence = df['StandardConfidence'].quantile(q=0.85)
percentile_85_confidence

0.7938719002999988

In [None]:
# Keep only rules that reach the 85th percentile on BOTH metrics
filtered_df = df.loc[
    df['Support'].ge(percentile_85_support)
    & df['StandardConfidence'].ge(percentile_85_confidence)
]
filtered_df.describe()

Unnamed: 0,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,rule_length
count,6314.0,6314.0,6314.0,6314.0,6314.0,6314.0,6314.0
mean,0.124935,0.864598,0.931126,53.421444,61.585999,58.704941,7.21096
std,0.12921,0.059092,0.075084,253.506914,302.134518,300.011568,0.767117
min,0.04119,0.793939,0.794118,4.0,4.0,4.0,7.0
25%,0.058594,0.818182,0.857143,12.0,13.0,12.0,7.0
50%,0.080537,0.847222,0.963189,21.0,24.0,22.0,7.0
75%,0.142105,0.888889,1.0,34.0,41.0,38.0,7.0
max,1.0,1.0,1.0,12950.0,15989.0,15907.0,10.0


In [None]:
# Number of retained rules that are longer than 7 tokens
int(filtered_df['rule_length'].gt(7).sum())

444

In [None]:
# Persist the filtered rules (row index omitted)
filtered_df.to_csv(path_or_buf='output_rules.csv', index=False)

In [None]:
# 0/1 flag on the full frame: 1 where the rule meets both 85th-percentile
# thresholds (the same condition used to build filtered_df), 0 otherwise.
df['filtered'] = (
    df['Support'].ge(percentile_85_support)
    & df['StandardConfidence'].ge(percentile_85_confidence)
).astype('int64')
df.loc[df['filtered'].eq(1)].head(2)

Unnamed: 0,Rule,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,variable,rule_length,filtered
20,?b /sports/sports_team/roster./american_football/football_roster_position/position ?a => ?a /sports/sports_position/players./sports/sports_team_roster/team ?b,0.139143,0.802306,0.80363,487,607,606,?b,7,1
21,?b /sports/sports_team/roster./baseball/baseball_roster_position/position ?a => ?a /sports/sports_position/players./sports/sports_team_roster/team ?b,0.042,0.803279,0.803279,147,183,183,?b,7,1


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 5% of the filtered rules, stratified on token count so the sample
# keeps the same mix of rule lengths; the seed is fixed for reproducibility.
stratify_col = filtered_df['rule_length']
df_train, df_sample = train_test_split(
    filtered_df,
    stratify=stratify_col,
    test_size=0.05,
    random_state=42,
)

df_sample.describe()

Unnamed: 0,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,rule_length
count,316.0,316.0,316.0,316.0,316.0,316.0,316.0
mean,0.131714,0.860714,0.929083,53.050633,61.063291,57.892405,7.208861
std,0.135489,0.056438,0.075495,164.521849,187.905994,184.521572,0.764729
min,0.041405,0.794118,0.794118,5.0,5.0,5.0,7.0
25%,0.059061,0.818182,0.857143,11.75,13.0,12.0,7.0
50%,0.083333,0.846154,0.943297,21.0,24.0,22.0,7.0
75%,0.158025,0.888889,1.0,34.0,41.0,38.0,7.0
max,1.0,1.0,1.0,1833.0,2265.0,2261.0,10.0


In [None]:
# Number of sampled rules that are longer than 7 tokens
int(df_sample['rule_length'].gt(7).sum())

22

In [None]:
# First five rows of the sample
df_sample.head(n=5)

Unnamed: 0,Rule,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,variable,rule_length
212609,/m/0gh65c5 /film/film/release_date_s./film/film_regional_release_date/film_release_region ?a => ?a /location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency /m/09nqf,0.157895,0.810811,1.0,30,37,30,?a,7
21405,/m/01zq91 /government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office ?a => ?a /location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency /m/09nqf,0.073826,0.916667,1.0,11,12,11,?a,7
138310,?a /organization/organization_member/member_of./organization/organization_membership/organization /m/0j7v_ => ?a /organization/organization_member/member_of./organization/organization_membership/organization /m/02vk52z,0.041405,0.804878,0.804878,33,41,41,?a,7
212554,/m/09gkx35 /film/film/release_date_s./film/film_regional_release_date/film_release_region ?a => ?a /location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency /m/09nqf,0.105263,0.869565,1.0,20,23,20,?a,7
70291,/m/017gl1 /film/film/release_date_s./film/film_regional_release_date/film_release_region ?a => ?a /base/aareas/schema/administrative_area/administrative_area_type /m/0hzc9wc,0.237179,0.840909,1.0,37,44,37,?a,7


In [None]:
# Persist the stratified sample (row index omitted)
df_sample.to_csv(path_or_buf='output_humaneval_rules.csv', index=False)

In [None]:
# 0/1 flag on the full frame: 1 where the row's index label also appears in
# df_sample (the sample keeps the original index labels), 0 otherwise.
df['InSample'] = df.index.isin(df_sample.index).astype(int)
df.loc[df['InSample'].eq(1)].head(2)

Unnamed: 0,Rule,Support,StandardConfidence,PCAConfidence,HeadCoverage,BodySize,PCABodySize,variable,rule_length,filtered,InSample
447,?a /soccer/football_team/current_roster./soccer/football_roster_position/position ?b => ?a /soccer/football_team/current_roster./sports/sports_team_roster/position ?b,0.810345,0.809272,0.810703,1833,2265,2261,?a,7,1,1
1148,?e /location/hud_county_place/place ?b ?a /people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony ?e => ?a /people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony ?b,0.215827,1.0,1.0,90,90,90,?b,10,1,1


In [None]:
# Persist the full frame, including the added rule_length / filtered / InSample columns
df.to_csv(path_or_buf='updated_rules.csv', index=False)