In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from Kelpie.dataset import Dataset
import numpy as np
import pandas as pd
from helpers.helpers import print_fact, extract_subgraph_of_kg, print_sample
from helpers.kelpie_models_helpers import train_complex
from helpers.knowledge_graph_simulation_experiment import KnowledgeGraphMitigationExperiment
import json

In [5]:

# Paths to the FB15k-237 train/test/valid splits used to build the dataset.
fb15k237_path = 'Kelpie_package/Kelpie/data/FB15k-237'
train_path = fb15k237_path + '/train.txt'
test_path = fb15k237_path + '/test.txt'
valid_path = fb15k237_path + '/valid.txt'

# (head, relation, tail) triple whose sample representation is looked up below.
fact_to_investigate = ("/m/0693l", "/film/director/film", "/m/0gwjw0c")
fb15k237_dataset = Dataset(name="FB15k-237", load=True, train_path=train_path, test_path=test_path, valid_path=valid_path)
sample_to_investigate = fb15k237_dataset.fact_to_sample(fact_to_investigate)

# Entity-id -> metadata mapping; later cells read label_map[entity_id]['label'].
# A context manager closes the file handle (the bare open() leaked it), and
# the encoding is explicit so the result does not depend on the platform default.
label_map_path = 'entity2wikidata.json'
with open(label_map_path, encoding='utf-8') as label_map_file:
    label_map = json.load(label_map_file)

In [6]:
from helpers.plotting_utilities import get_stats_tables, combine_all_stats_tables
import dataframe_image as dfi

# Every input file contains exactly two lines, each a "-"-separated fact:
# the first line is the good fact, the second the bad fact.
experiment_pairs = []
for i in range(1, 8):
    with open(f"experiment_inputs/input_{i}.txt", "r", encoding="utf-8") as f:
        good_fact, bad_fact = (line.strip().split("-") for line in f.readlines())
    experiment_pairs.append((good_fact, bad_fact))

In [7]:
# Collect the stats tables for every experiment pair.
# NOTE(review): presumably reads "stats.json" from each experiment's folder under
# results/remove_overlapping_budget — confirm against helpers.plotting_utilities.
tables = get_stats_tables(
    experiment_pairs, label_map, "results/remove_overlapping_budget", "stats.json"
)

In [None]:
# combined_table = combine_all_stats_tables(tables, experiment_pairs)
# combined_table
# dfi.export(combined_table,"plots/mytable.png", table_conversion = 'matplotlib', )


In [None]:
# from helpers.helpers import get_readable_fact_str
# combined_table['fact'] = combined_table['fact'].apply(lambda x: get_readable_fact_str(x.split(' '), label_map))
# latex_res = combined_table.set_index('fact').style.format(escape="latex").to_latex(position_float="centering", hrules=True, )
# print(latex_res)

In [8]:
from helpers.plotting_utilities import get_full_exp_results, simplify_full_exp_results,get_disinformer_mitigator_avg_difference, get_stats_tables, get_diff_aucs, get_ranking_aucs
# Strategies under comparison, the baseline(s) they are measured against,
# and the number of epochs passed to the result loaders.
strategies = ["greedy", "neighbor", "approx_greedy", "random", "multi_greedy"]
baselines = ["random"]
NUM_EPOCHS = 5

# Raw per-experiment results, then the two derived views used downstream.
all_exp_results = get_full_exp_results(experiment_pairs, label_map, strategies, NUM_EPOCHS)
all_diffs = get_disinformer_mitigator_avg_difference(all_exp_results, experiment_pairs, strategies, label_map, tables)
all_rankings = simplify_full_exp_results(all_exp_results, NUM_EPOCHS)

# AUC data frames: one built from the differences, one from the rankings.
df_auc_diffs = get_diff_aucs(experiment_pairs, strategies, baselines, all_diffs, label_map)
df_auc_rankings = get_ranking_aucs(experiment_pairs, strategies, baselines, all_rankings, label_map)

In [9]:
# replace neighbor experiment results with 2nd neighbor experiment results

# from helpers.plotting_utilities import get_base_exp_name
# strategies = ["greedy",  "approx_greedy", "random", 'multi_greedy']
# res_folder = 'results/remove_overlapping_budget'
# for good_fact, bad_fact in experiment_pairs:
#     base_exp_name = get_base_exp_name(good_fact, bad_fact, label_map)
#     for strategy in strategies:
#         with open(f'{res_folder}/{base_exp_name}_neighbor_{strategy}_2/all_results.json', 'r') as new_results:
#             data = json.load(new_results)

#             with open(f'{res_folder}/{base_exp_name}_neighbor_{strategy}/all_results.json', 'w') as old_results:
#                 json.dump(data, old_results)
                


In [10]:
from collections import defaultdict
from scipy.stats import ttest_ind
from IPython.display import display_html

# Every strategy except the random baseline, in the order declared in `strategies`.
# A set difference was used before, whose iteration order is not stable across
# kernel restarts; filtering the list keeps the result deterministic.
mitigator_strategies = [strategy for strategy in strategies if strategy != 'random']

def plot_means(disinformer_strategy, rankings=None, mitigators=None):
    """Compare mitigator strategies pairwise for one disinformer strategy.

    For every ordered pair of distinct mitigator strategies, the AUC values of
    the two strategies (restricted to rows whose ``disinformer_strategy``
    equals the given one) are compared with ``ttest_ind(..., equal_var=False)``.

    Parameters
    ----------
    disinformer_strategy : str
        Value of the ``disinformer_strategy`` column to filter on.
    rankings : pandas.DataFrame, optional
        Frame with ``disinformer_strategy``, ``mitigator_strategy`` and ``auc``
        columns. Defaults to the notebook-level ``df_auc_rankings``.
    mitigators : iterable of str, optional
        Mitigator strategies to compare. Defaults to the notebook-level
        ``mitigator_strategies``.

    Returns
    -------
    (means_df, p_df) : tuple of pandas.DataFrame
        Both frames are indexed and labelled by mitigator strategy in sorted
        order. ``means_df[col][row]`` is the string
        ``"<mean AUC of col> / <mean AUC of row>"``; ``p_df[col][row]`` is the
        p-value of the test between ``col`` and ``row``. Diagonal cells are NaN.
    """
    if rankings is None:
        rankings = df_auc_rankings
    if mitigators is None:
        mitigators = mitigator_strategies
    ordered_strategies = sorted(mitigators)

    per_disinformer = rankings[rankings['disinformer_strategy'] == disinformer_strategy]
    # Extract each strategy's AUC array once instead of re-filtering per pair.
    auc_by_strategy = {
        strategy: per_disinformer[per_disinformer['mitigator_strategy'] == strategy]['auc'].to_numpy()
        for strategy in ordered_strategies
    }

    p_vals = defaultdict(dict)
    means = defaultdict(dict)
    for column_strategy in ordered_strategies:
        column_auc = auc_by_strategy[column_strategy]
        for row_strategy in ordered_strategies:
            if column_strategy == row_strategy:
                continue
            row_auc = auc_by_strategy[row_strategy]
            test_result = ttest_ind(column_auc, row_auc, equal_var=False)
            p_vals[column_strategy][row_strategy] = test_result[1]
            means[column_strategy][row_strategy] = f'{round(column_auc.mean(), 2)} / {round(row_auc.mean(), 2)}'

    means_df = pd.DataFrame(means).sort_index(axis=1).sort_index(axis=0)
    # Give p_df exactly the same labels and order as means_df so that a mask
    # derived from p_df can be applied to means_df cell by cell.
    p_df = pd.DataFrame(p_vals).reindex(index=means_df.index, columns=means_df.columns)
    return means_df, p_df

In [11]:
means_df, p_df = plot_means('greedy')

# (p-value threshold, cell style) pairs, layered in the same order as the
# original chain: p < 0.1 first, then p < 0.05, then p < 0.01.
significance_styles = [
    (0.1, 'background-color:rgb(200, 70, 0, 0.5);'),
    (0.05, 'background-color:rgb(100, 0, 70, 0.5);'),
    (0.01, 'background-color:rgb(0, 0, 70, 0.5);'),
]
styler_greedy = means_df.style
for p_threshold, cell_css in significance_styles:
    # Defaults bind the current threshold/style; Styler.apply runs the callable lazily.
    styler_greedy = styler_greedy.apply(
        lambda _, p_threshold=p_threshold, cell_css=cell_css: (p_df < p_threshold).replace({
            True: cell_css,
            False: ''
        }),
        axis=None,
    )

# The heading was opened with <h3> but closed with </h1>; close it correctly.
html_greedy = '<h3>Disinformer: Greedy</h3>' + styler_greedy.to_html()
display_html(html_greedy, raw=True)


Unnamed: 0,approx_greedy,greedy,multi_greedy,neighbor
approx_greedy,,89.74 / 79.13,-204.62 / 79.13,20.13 / 79.13
greedy,79.13 / 89.74,,-204.62 / 89.74,20.13 / 89.74
multi_greedy,79.13 / -204.62,89.74 / -204.62,,20.13 / -204.62
neighbor,79.13 / 20.13,89.74 / 20.13,-204.62 / 20.13,


In [12]:
means_df, p_df = plot_means('random')

# (p-value threshold, cell style) pairs, layered in the same order as the
# original chain: p < 0.1 first, then p < 0.05, then p < 0.01.
significance_styles = [
    (0.1, 'background-color:rgb(200, 70, 0, 0.5);'),
    (0.05, 'background-color:rgb(100, 0, 70, 0.5);'),
    (0.01, 'background-color:rgb(0, 0, 70, 0.5);'),
]
styler_random = means_df.style
for p_threshold, cell_css in significance_styles:
    # Defaults bind the current threshold/style; Styler.apply runs the callable lazily.
    styler_random = styler_random.apply(
        lambda _, p_threshold=p_threshold, cell_css=cell_css: (p_df < p_threshold).replace({
            True: cell_css,
            False: ''
        }),
        axis=None,
    )

# The heading was opened with <h3> but closed with </h1>; close it correctly.
html_random = '<h3>Disinformer: Random</h3>' + styler_random.to_html()
display_html(html_random, raw=True)


Unnamed: 0,approx_greedy,greedy,multi_greedy,neighbor
approx_greedy,,92.39 / 78.7,-206.44 / 78.7,16.34 / 78.7
greedy,78.7 / 92.39,,-206.44 / 92.39,16.34 / 92.39
multi_greedy,78.7 / -206.44,92.39 / -206.44,,16.34 / -206.44
neighbor,78.7 / 16.34,92.39 / 16.34,-206.44 / 16.34,


In [13]:
from IPython.display import display_html
means_df, p_df = plot_means('approx_greedy')

# (p-value threshold, cell style) pairs, layered in the same order as the
# original chain: p < 0.1 first, then p < 0.05, then p < 0.01.
significance_styles = [
    (0.1, 'background-color:rgb(200, 70, 0, 0.5);'),
    (0.05, 'background-color:rgb(100, 0, 70, 0.5);'),
    (0.01, 'background-color:rgb(0, 0, 70, 0.5);'),
]
styler_approx_greedy = means_df.style
for p_threshold, cell_css in significance_styles:
    # Defaults bind the current threshold/style; Styler.apply runs the callable lazily.
    styler_approx_greedy = styler_approx_greedy.apply(
        lambda _, p_threshold=p_threshold, cell_css=cell_css: (p_df < p_threshold).replace({
            True: cell_css,
            False: ''
        }),
        axis=None,
    )

# The heading was opened with <h3> but closed with </h1>; close it correctly.
html_approx_greedy = '<h3>Disinformer: Approximate Greedy</h3>' + styler_approx_greedy.to_html()
display_html(html_approx_greedy, raw=True)


Unnamed: 0,approx_greedy,greedy,multi_greedy,neighbor
approx_greedy,,88.38 / 75.02,-192.41 / 75.02,19.24 / 75.02
greedy,75.02 / 88.38,,-192.41 / 88.38,19.24 / 88.38
multi_greedy,75.02 / -192.41,88.38 / -192.41,,19.24 / -192.41
neighbor,75.02 / 19.24,88.38 / 19.24,-192.41 / 19.24,


In [None]:

means_df, p_df = plot_means('neighbor')

# (p-value threshold, cell style) pairs, layered in the same order as the
# original chain: p < 0.1 first, then p < 0.05, then p < 0.01.
significance_styles = [
    (0.1, 'background-color:rgb(200, 70, 0, 0.5);'),
    (0.05, 'background-color:rgb(100, 0, 70, 0.5);'),
    (0.01, 'background-color:rgb(0, 0, 70, 0.5);'),
]
styler_neighbor = means_df.style
for p_threshold, cell_css in significance_styles:
    # Defaults bind the current threshold/style; Styler.apply runs the callable lazily.
    styler_neighbor = styler_neighbor.apply(
        lambda _, p_threshold=p_threshold, cell_css=cell_css: (p_df < p_threshold).replace({
            True: cell_css,
            False: ''
        }),
        axis=None,
    )

# The heading was opened with <h3> but closed with </h1>; close it correctly.
html_neighbor = '<h3>Disinformer: Neighbor</h3>' + styler_neighbor.to_html()
display_html(html_neighbor, raw=True)


In [None]:

means_df, p_df = plot_means('multi_greedy')

# (p-value threshold, cell style) pairs, layered in the same order as the
# original chain: p < 0.1 first, then p < 0.05, then p < 0.01.
significance_styles = [
    (0.1, 'background-color:rgb(200, 70, 0, 0.5);'),
    (0.05, 'background-color:rgb(100, 0, 70, 0.5);'),
    (0.01, 'background-color:rgb(0, 0, 70, 0.5);'),
]
styler_multi_greedy = means_df.style
for p_threshold, cell_css in significance_styles:
    # Defaults bind the current threshold/style; Styler.apply runs the callable lazily.
    styler_multi_greedy = styler_multi_greedy.apply(
        lambda _, p_threshold=p_threshold, cell_css=cell_css: (p_df < p_threshold).replace({
            True: cell_css,
            False: ''
        }),
        axis=None,
    )

# The heading was opened with <h3> but closed with </h1>; close it correctly.
html_multi_greedy = '<h3>Disinformer: Multi-Objective Greedy</h3>' + styler_multi_greedy.to_html()
display_html(html_multi_greedy, raw=True)


In [None]:
# fig, ax = plt.subplots(figsize=(7,7))
# ax.matshow(p_df.to_numpy(), cmap=plt.cm.Blues)

# strategies_sorted = sorted(strategies)
# for i in range(4):
#     for j in range(4):
#         c = means_df.iloc[i,j]
#         ax.text(i, j, c, va='center', ha='center')

# ax.set_xticklabels([0]+strategies_sorted+[0])
# ax.set_yticklabels([0]+strategies_sorted+[0])

In [14]:
import statsmodels.api as sm
# Fit an OLS regression of `auc` on the mitigator strategy, the disinformer
# strategy, and the interaction between the two, using the ranking-AUC frame.
model = sm.OLS.from_formula('auc ~ mitigator_strategy + disinformer_strategy + mitigator_strategy:disinformer_strategy', df_auc_rankings).fit()

In [15]:
# Display the summary of the fitted OLS model as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,auc,R-squared:,0.218
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,3.812
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,4.3e-07
Time:,13:16:30,Log-Likelihood:,-1912.8
No. Observations:,280,AIC:,3866.0
Df Residuals:,260,BIC:,3938.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,75.0214,62.170,1.207,0.229,-47.400,197.442
mitigator_strategy[T.greedy],13.3571,87.922,0.152,0.879,-159.772,186.487
mitigator_strategy[T.multi_greedy],-267.4286,87.922,-3.042,0.003,-440.558,-94.299
mitigator_strategy[T.neighbor],-55.7857,87.922,-0.634,0.526,-228.915,117.344
disinformer_strategy[T.greedy],4.1071,87.922,0.047,0.963,-169.022,177.237
disinformer_strategy[T.multi_greedy],8.5214,87.922,0.097,0.923,-164.608,181.651
disinformer_strategy[T.neighbor],0.4071,87.922,0.005,0.996,-172.722,173.537
disinformer_strategy[T.random],3.6786,87.922,0.042,0.967,-169.451,176.808
mitigator_strategy[T.greedy]:disinformer_strategy[T.greedy],-2.7500,124.340,-0.022,0.982,-247.592,242.092

0,1,2,3
Omnibus:,262.591,Durbin-Watson:,2.179
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5753.028
Skew:,-3.882,Prob(JB):,0.0
Kurtosis:,23.804,Cond. No.,27.9


In [None]:
# `df` is never defined in this notebook, so the original line raised NameError
# on a fresh kernel. The filtered columns exist on `df_auc_rankings`, which is
# assumed to be the intended frame — TODO confirm.
# Rows are selected with a single .loc call instead of chained indexing.
non_multi_greedy = (
    (df_auc_rankings['mitigator_strategy'] != 'multi_greedy')
    & (df_auc_rankings['disinformer_strategy'] != 'multi_greedy')
)
df_auc_rankings.loc[non_multi_greedy, 'disinformer_strategy'].unique()

In [None]:
from helpers.helpers import initialize_nx_graph
import networkx as nx

# Build a networkx graph from the loaded FB15k-237 dataset
# (how facts map to nodes/edges is decided inside helpers.helpers).
graph = initialize_nx_graph(fb15k237_dataset)
# NOTE(review): nx.edge_bfs yields edges lazily, so `edges` is an iterator that
# has not traversed anything yet; it is not consumed anywhere in this notebook.
edges = nx.edge_bfs(graph)

In [None]:
import os

# Rebuild the (good_fact, bad_fact) pairs from every "input" file in the folder.
experiment_pairs = []
# sorted() makes the pair order independent of the filesystem's listing order.
for fname in sorted(os.listdir('experiment_inputs')):
    if "input" in fname:
        with open(f'experiment_inputs/{fname}', 'r', encoding='utf-8') as f:
            line1, line2 = f.readlines()
        # Strip BOTH lines before splitting: the second line was split without
        # stripping, which left a trailing newline on the last element of the
        # second fact whenever the file ended with a line break.
        pair = (line1.strip().split('-'), line2.strip().split('-'))
        experiment_pairs.append(pair)


In [None]:
from helpers.plotting_utilities import get_all_exp_results, get_disinformer_mitigator_avg_difference, get_stats_tables

# Rebinds `strategies` to a four-strategy list (no 'multi_greedy'); cells run
# after this one see this value rather than the five-strategy list defined earlier.
strategies = ["random", "greedy", "neighbor", "approx_greedy"]

# NOTE(review): get_stats_tables is called with two arguments here, but with four
# (a results folder and a stats file name as well) earlier in this notebook —
# confirm which signature the helper currently has.
tables = get_stats_tables(experiment_pairs, label_map)
all_exp_results = get_all_exp_results(experiment_pairs, label_map, strategies)
all_diffs = get_disinformer_mitigator_avg_difference(all_exp_results, experiment_pairs, strategies, label_map, tables)


In [None]:
# get_base_exp_name is only imported in a commented-out cell of this notebook,
# so import it here to keep this cell runnable on a fresh kernel.
from helpers.plotting_utilities import get_base_exp_name

# Inspect the 'random_random_2' entry of the differences for the first experiment pair.
all_diffs[get_base_exp_name(experiment_pairs[0][0], experiment_pairs[0][1], label_map)]['random_random_2']



In [None]:
from helpers.plotting_utilities import plot_matrix_diffs

# Draw the matrix of difference plots against the 'random' baseline and
# write it to test_plots.png, with cost display switched off.
plot_matrix_diffs(
    strategies,
    ["random"],
    experiment_pairs,
    all_diffs,
    "test_plots.png",
    label_map,
    tables,
    with_cost=False,
)



Debugging efforts

In [None]:
# check epoch differences
good_fact, bad_fact = experiment_pairs[2]
base_exp_name = f"{label_map[good_fact[0]]['label']}_{label_map[good_fact[2]]['label']}_{label_map[bad_fact[2]]['label']}"
ds = []

res_folder = f'results/{base_experiment_name}_greedy_random'
for e in range(3):
    ds.append(Dataset(name="FB15k-237", load=True, train_path=res_folder+f"/epoch_{e}_dataset.txt", test_path=test_path, valid_path=valid_path))


In [None]:
# True when the epoch-0 and epoch-1 dataset files yield identical training sample sets.
ds[0].train_samples_set == ds[1].train_samples_set

In [None]:
# True when the epoch-1 and epoch-2 dataset files yield identical training sample sets.
ds[1].train_samples_set == ds[2].train_samples_set

In [None]:
# True when the epoch-2 and epoch-0 dataset files yield identical training sample sets.
ds[2].train_samples_set == ds[0].train_samples_set

In [None]:
# Print the "tail_preds" stored under key "15" for the first three entries
# of the run's all_results.json, one entry per line.
with open(f"{res_folder}/all_results.json", "r") as infile:
    data = json.load(infile)
    for entry_index in (0, 1, 2):
        print(data[entry_index]["15"]["tail_preds"])


### Graph

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rebuild the (good_fact, bad_fact) pairs from every "input" file in the folder.
experiment_pairs = []
# sorted() makes the pair order independent of the filesystem's listing order.
for fname in sorted(os.listdir('experiment_inputs')):
    if "input" in fname:
        with open(f'experiment_inputs/{fname}', 'r', encoding='utf-8') as f:
            line1, line2 = f.readlines()
        # Strip BOTH lines before splitting: the second line was split without
        # stripping, which left a trailing newline on the last element of the
        # second fact whenever the file ended with a line break.
        pair = (line1.strip().split('-'), line2.strip().split('-'))
        experiment_pairs.append(pair)

In [None]:
from helpers.graph_utilities import get_experiment_records

# Strategies for the graph section; this rebinds the notebook-level `strategies`.
strategies = ["random", "greedy", "neighbor", "approx_greedy"]

# Per-experiment records, later indexed as
# exp_records[base_experiment_name][f'{strategy_a}_{strategy_b}'][k] by the plotting cell.
exp_records = get_experiment_records(strategies, experiment_pairs, label_map)

In [None]:
# from helpers.graph_utilities import plot_matrix

# strategies = ["random", "greedy", "neighbor", "approx_greedy"]


# plot_matrix(strategies, ['random'], experiment_pairs, exp_records, 'all_plots.png', label_map, attack_budget=15, with_cost=True, cost_spent=cost_spent)



In [None]:
# Draw one large grid of ranking plots and save it to all_plots.png.
# Rows: one per (disinformer strategy, mitigator strategy) combination.
# Columns: two per experiment pair (indices j and j+1).
# NOTE(review): despite its name, this list also contains "random".
non_random_strategies = ["random", "greedy", "neighbor", "approx_greedy"]

# Curves are filled over x = 0..attack_budget, i.e. attack_budget + 1 points.
attack_budget = 15

# Row index below is i * len(non_random_strategies) + m, so this row count only
# matches when `strategies` and `non_random_strategies` have the same length.
nrows = len(strategies) **2
ncols = len(experiment_pairs) * 2

fig, ax = plt.subplots(nrows, ncols, figsize=(50,50), sharex=True, sharey=True)
# Invert the y axis of the current axes (all subplots share y via sharey=True).
plt.gca().invert_yaxis()


# Column titles on the first row: one title per direction of each experiment pair.
# NOTE(review): get_shortened_name is not imported or defined anywhere in this
# notebook, and this loop is outside the try/except below — confirm its origin.
for j in range(len(experiment_pairs)):
    good_fact, bad_fact = experiment_pairs[j]
    ax[0][j*2].set_title(get_shortened_name(good_fact, bad_fact))
    ax[0][j*2+1].set_title(get_shortened_name(bad_fact, good_fact))
    
for i in range(len(strategies)):
    disinformer_strategy = strategies[i]
    for m in range(len(non_random_strategies)):
        mitigator_strategy = non_random_strategies[m]
        row = ax[i*(len(non_random_strategies)) + m]
        row[0].set_ylabel(f"Disinformer {disinformer_strategy}\n Mitigator {mitigator_strategy}")
        for j in range(0, ncols, 2):
            try:
                good_fact, bad_fact = experiment_pairs[j//2]
                # NOTE(review): called with two arguments here, while other cells of
                # this notebook pass label_map as a third argument; the name is also
                # only imported in a commented-out cell — confirm.
                base_experiment_name = get_base_exp_name(good_fact, bad_fact)

                # Left column (j): compare the record keyed '<mitigator>_<disinformer>'
                # with the 'random_<disinformer>' record, using element [0] of each.
                # NOTE(review): the meaning of record elements [0]..[3] is not visible here.
                exp_rankings = exp_records[base_experiment_name][f'{mitigator_strategy}_{disinformer_strategy}'][0]
                random_rankings = exp_records[base_experiment_name][f'random_{disinformer_strategy}'][0]
                row[j].plot(random_rankings, color='blue', label='Random')
                row[j].plot(exp_rankings, color='red', label=mitigator_strategy)
                # Shade red where the strategy curve is >= the random curve, green where it is <=.
                row[j].fill_between([a for a in range(attack_budget+1)], random_rankings,exp_rankings, where=(np.array(exp_rankings)-np.array(random_rankings))>=0, color='red',alpha=0.2)
                row[j].fill_between([a for a in range(attack_budget+1)], exp_rankings,random_rankings, where=(np.array(exp_rankings)-np.array(random_rankings))<=0, color='green',alpha=0.2)
                row[j].legend(loc='lower right')
                # Displayed value: element [2] of the random record minus element [2] of the strategy record.
                mitigator_auc = exp_records[base_experiment_name][f'random_{disinformer_strategy}'][2]
                mitigator_auc -= exp_records[base_experiment_name][f'{mitigator_strategy}_{disinformer_strategy}'][2]
                row[j].text(0,40,f"AUC: {round(mitigator_auc, 2)}", fontsize='medium')
                
                # NOTE(review): cost_spent is not defined anywhere in this notebook; the
                # resulting error is caught below, so the right-hand column is then skipped.
                cost_exp = cost_spent[base_experiment_name][f'{mitigator_strategy}_{disinformer_strategy}']['mitigator_cost']
                cost_random = cost_spent[base_experiment_name][f'random_{disinformer_strategy}']['mitigator_cost']
                # Secondary y axis on the same subplot for the cost curves.
                # Note the colours are swapped relative to the ranking curves above.
                cost_ax = row[j].twinx()
                cost_ax.plot(cost_random, color='red', label='random')
                cost_ax.plot(cost_exp, color='blue', label=mitigator_strategy)
                
                # Right column (j+1): same comparison with the key order reversed
                # ('<disinformer>_<mitigator>' vs '<disinformer>_random'), using element [1].
                exp_rankings = exp_records[base_experiment_name][f'{disinformer_strategy}_{mitigator_strategy}'][1]
                random_rankings = exp_records[base_experiment_name][f'{disinformer_strategy}_random'][1]
                row[j+1].plot(random_rankings, color='blue', label='Random')
                row[j+1].plot(exp_rankings, color='red', label=mitigator_strategy)
                row[j+1].fill_between([a for a in range(attack_budget+1)], random_rankings,exp_rankings, where=(np.array(exp_rankings)-np.array(random_rankings))>=0, color='red',alpha=0.2)
                row[j+1].fill_between([a for a in range(attack_budget+1)], exp_rankings,random_rankings, where=(np.array(exp_rankings)-np.array(random_rankings))<=0, color='green',alpha=0.2)
                row[j+1].legend(loc='lower right')
                # Displayed value: element [3] of the random record minus element [3] of the strategy record.
                disinformer_auc = exp_records[base_experiment_name][f'{disinformer_strategy}_random'][3]
                disinformer_auc -= exp_records[base_experiment_name][f'{disinformer_strategy}_{mitigator_strategy}'][3]
                row[j+1].text(0,40,f"AUC: {round(disinformer_auc, 2)}", fontsize='medium')
                
            except Exception as e:
                # Any failure (including NameError for the undefined names noted above)
                # is only printed; the loop then moves on to the next experiment pair.
                print("Exception", e)
                continue
                
fig.savefig("all_plots.png")


Alternative neighbor strategy
- get all possible paths between Jackie Chan and Rush Hour
- the distance of a fact is the min number of facts in the paths that include this fact
- if a fact is not on any path between head and tail, it has a distance of infinity

Random vs. random: run 10 times as a sanity check, to make sure there are no errors in the code and that any differences are due to statistical error

TODO 17-10-2023
- Add neighbor strategy using closest nodes first
- check why they don't end up at the same point?
    - note: even in the same experiment, different epochs have different terminal rankings
    - Issue is because of common facts in their budgets: When a fact gets selected by one agent and it was already added
    to the dataset by the other agent, I forgot to remove it. So it could keep getting selected over and over again
    while nothing is being added to the dataset, so in reality between different epochs the facts being added are different.
- more topics
- side plot showing price cumulative
- investigate subgraphs