In [None]:
CREATE_MINIS = False
ANONYMISE_MINIS = True

In [13]:
# Issue: Most annotators we run with AIDA-Syn are running into timeouts with GERBIL...
# Let's create a smaller mini dataset, so we can have more than just the few annotators when talking about AIDA-Syn
# Load NIF Datasets using PyNIF
from pynif import NIFCollection

# Paths to the datasets
synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/generated_dataset_42.nif"

# Load datasets
print("Loading synthetic dataset...")
with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
    synthetic_collection = NIFCollection.loads(synthetic_file.read(), format="turtle")
print(f"Finished loading synthetic dataset with {len(synthetic_collection.contexts)} contexts.") 


Loading synthetic dataset...
Finished loading synthetic dataset with 888 contexts.


In [None]:
import random

seed = 42

if CREATE_MINIS:
    def create_mini_dataset(synthetic_collection, num_contexts=50,seed=42, keep_original_text=True):
        mini_collection = NIFCollection(uri=base_uri)
        print(synthetic_collection.contexts)
        # Randomly select contexts from the synthetic dataset
        random.seed(seed)
        random.shuffle(synthetic_collection.contexts)
        limited_collection = synthetic_collection.contexts[:num_contexts]
        for i, context in enumerate(limited_collection):
            # Copy all possible information... unfortunately cannot pass a NIFContext directly. Ugh.
            mini_context = mini_collection.add_context(
                uri=context.original_uri, 
                mention=[context.mention if keep_original_text else ""][0], 
                beginIndex=context.beginIndex, 
                endIndex=context.endIndex)

            # Add the context's properties to the new context
            # Once again, we cannot pass the NIFPhrase directly. Ugh.
            for phrase in context.phrases:
                # Add a phrase to the context
                mini_context.add_phrase(
                beginIndex=phrase.beginIndex,
                endIndex=phrase.endIndex,
                annotator = phrase.annotator,
                score = phrase.score,
                taIdentRef = phrase.taIdentRef,
                taIdentRefLabel = phrase.taIdentRefLabel,
                taClassRef = phrase.taClassRef,
                taMsClassRef = phrase.taMsClassRef,
                uri = phrase.original_uri,
                source = phrase.source,
                is_hash_based_uri = phrase.isContextHashBasedString
                )
            
        print(f"Created mini dataset with {len(mini_collection.contexts)} contexts.")

        # context = collection.add_context(
        #     uri=document_uri,
        #     mention=sentence_str
        # )
        
        # for phrase in phrases:
        #     # Add a phrase to the context
        #     context.add_phrase(
        #         beginIndex=phrase['beginIndex'],
        #         endIndex=phrase['endIndex'],
        #         taIdentRef=phrase['taIdentRef']
        #     )

        generated_nif = mini_collection.dumps(format='turtle')
        return generated_nif


        # #print(generated_nif)
        # with open(nif_dataset_output_path, "w", encoding='utf-8') as dataset_file:
        #     dataset_file.write(generated_nif)



    base_uri = "http://anon.ymous/"
    document_uri = base_uri + "document"
    nif_dataset_output_path = f"/mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_{{}}_{seed}.nif"

    # Let's randomly pick 10 contexts from the synthetic dataset
    doc_count = 10
    nif_dump = create_mini_dataset(synthetic_collection, num_contexts=doc_count, seed=seed)
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)

    doc_count = 50
    nif_dump = create_mini_dataset(synthetic_collection, num_contexts=doc_count, seed=seed)
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)

    doc_count = 100
    nif_dump = create_mini_dataset(synthetic_collection, num_contexts=doc_count, seed=seed)
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)

Created mini dataset with 10 contexts.
Writing 10 contexts to /mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_10_42.nif
Created mini dataset with 50 contexts.
Writing 50 contexts to /mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_50_42.nif
Created mini dataset with 100 contexts.
Writing 100 contexts to /mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_100_42.nif


In [22]:
import random
from pynif import NIFCollection, NIFPhrase, NIFContext

ANONYMISE_MINIS = True
if ANONYMISE_MINIS:
    seed = 42

    def anonymize_collection(synthetic_collection, anynomized_uri_base="http://anon.ymous/"):
        mini_collection = NIFCollection(uri=base_uri)
        print(synthetic_collection.contexts)
        for i, context in enumerate(synthetic_collection.contexts):
            anon_uri = anynomized_uri_base + context.original_uri.split("/")[-1]
            # Copy all possible information... unfortunately cannot pass a NIFContext directly. Ugh.
            mini_context = mini_collection.add_context(
                uri=anon_uri, # THIS URI SHOULD ALREADY BE ANONYMIZED/CUSTOMIZED
                mention="", # DO NOT KEEP THE ORIGINAL TEXT
                beginIndex=context.beginIndex, 
                endIndex=context.endIndex)

            # Add the context's properties to the new context
            # Once again, we cannot pass the NIFPhrase directly. Ugh.
            for phrase in context.phrases:
                anon_phrase_uri = anynomized_uri_base + phrase.original_uri.split("/")[-1]
                # Add a phrase to the context
                new_phrase = mini_context.add_phrase(
                beginIndex=phrase.beginIndex,
                endIndex=phrase.endIndex,
                annotator = phrase.annotator,
                score = phrase.score,
                taIdentRef = phrase.taIdentRef,
                taIdentRefLabel = phrase.taIdentRefLabel,
                taClassRef = phrase.taClassRef,
                taMsClassRef = phrase.taMsClassRef,
                uri = anon_phrase_uri,
                source = phrase.source,
                is_hash_based_uri = phrase.isContextHashBasedString
                )

                # Overwrite PyNIF's attributed "mention" as it defaults to "" due to empty original string
                new_phrase.mention = context.mention[new_phrase.beginIndex:new_phrase.endIndex]


        print(f"Created mini dataset with {len(mini_collection.contexts)} contexts.")

        # context = collection.add_context(
        #     uri=document_uri,
        #     mention=sentence_str
        # )
        
        # for phrase in phrases:
        #     # Add a phrase to the context
        #     context.add_phrase(
        #         beginIndex=phrase['beginIndex'],
        #         endIndex=phrase['endIndex'],
        #         taIdentRef=phrase['taIdentRef']
        #     )

        generated_nif = mini_collection.dumps(format='turtle')
        return generated_nif


        # #print(generated_nif)
        # with open(nif_dataset_output_path, "w", encoding='utf-8') as dataset_file:
        #     dataset_file.write(generated_nif)



    base_uri = "http://anon.ymous/"
    document_uri = base_uri + "document"
    nif_dataset_output_path = f"/mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_{{}}_42_anonymised.nif"


    # Now let's anonymize the dataset for reviewers and licensing
    # Mini 10
    print("Loading synthetic dataset - Mini 10...")
    synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_10_42.nif"
    with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
        synthetic_collection_mini10 = NIFCollection.loads(synthetic_file.read(), format="turtle")
    print(f"Finished loading synthetic dataset with {len(synthetic_collection_mini10.contexts)} contexts.") 

    print("Anonymising synthetic dataset - Mini 10...")
    nif_dump = anonymize_collection(synthetic_collection_mini10)
    doc_count = 10
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)

    # Mini 50
    print("Loading synthetic dataset - Mini 50...")
    synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_50_42.nif"
    with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
        synthetic_collection_mini50 = NIFCollection.loads(synthetic_file.read(), format="turtle")
    print(f"Finished loading synthetic dataset with {len(synthetic_collection_mini50.contexts)} contexts.") 

    print("Anonymising synthetic dataset - Mini 50...")
    nif_dump = anonymize_collection(synthetic_collection_mini50)
    doc_count = 50
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)

    # Mini 100
    print("Loading synthetic dataset - Mini 100...")
    synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/AIDA-Syn_mini_100_42.nif"
    with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
        synthetic_collection_mini100 = NIFCollection.loads(synthetic_file.read(), format="turtle")
    print(f"Finished loading synthetic dataset with {len(synthetic_collection_mini100.contexts)} contexts.") 

    print("Anonymising synthetic dataset - Mini 100...")
    nif_dump = anonymize_collection(synthetic_collection_mini100)
    doc_count = 100
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)


    # Anonymise the full dataset
    print("Loading synthetic dataset - Full...")
    synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/generated_dataset_42.nif"
    with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
        synthetic_collection_full = NIFCollection.loads(synthetic_file.read(), format="turtle")
    print(f"Finished loading full synthetic dataset with {len(synthetic_collection_full.contexts)} contexts.")
    print("Anonymising synthetic dataset - Full...")
    nif_dump = anonymize_collection(synthetic_collection_full)
    doc_count = len(synthetic_collection_full.contexts)
    with open(nif_dataset_output_path.format(doc_count), "w", encoding='utf-8') as dataset_file:
        print(f"Writing {doc_count} contexts to {nif_dataset_output_path.format(doc_count)}")
        dataset_file.write(nif_dump)
    print("Finished anonymising full synthetic dataset.")
    # Now we have a full anonymised dataset, we can use this for the anonymous paper

Loading synthetic dataset - Mini 10...
Finished loading synthetic dataset with 10 contexts.
Anonymising synthetic dataset - Mini 10...
[<NIFContext 0-1129: 'Potent landmines found near Brazilian presidency ....'>, <NIFContext 0-969: 'Brazil cities to ban disposable plastic containers...'>, <NIFContext 0-344: 'CRICKET - AFGHANISTAN BEAT IRELAND BY FIVE WICKETS...'>, <NIFContext 0-1439: 'Australian daily port , shipping update for Dec 6 ...'>, <NIFContext 0-480: 'John Lewis UK store sales up 4.5 % in week .  Self...'>, <NIFContext 0-1831: 'American hostage in Syria describes ordeal .  WASH...'>, <NIFContext 0-1509: 'Toys "R" Us sees Q2 loss similar to Q1 loss .  WAY...'>, <NIFContext 0-848: 'SOCCER - ENGLISH FIRST DIVISION RESULTS / STANDING...'>, <NIFContext 0-1716: 'S.AFRICAN TRUTH BODY TO SUMMON APARTHEID POLICE . ...'>, <NIFContext 0-355: "India 's Levy to meet Narendra Modi in Pakistan . ...">]
Created mini dataset with 10 contexts.
Writing 10 contexts to /mnt/webscistorage/wf7467/a

In [85]:
import math
game_counter = 1
candidates = 100
k = 10
to = math.ceil(math.log2(candidates))
print(f"Number of rounds: {to}")
for j in range(1, to):
    denom = k**j
    current_games = math.floor(candidates/denom)
    print(denom)
    print(f"Number of games for {j} rounds: {current_games}")
    game_counter += current_games

print(f"Total number of games: {game_counter}")

Number of rounds: 7
10
Number of games for 1 rounds: 10
100
Number of games for 2 rounds: 1
1000
Number of games for 3 rounds: 0
10000
Number of games for 4 rounds: 0
100000
Number of games for 5 rounds: 0
1000000
Number of games for 6 rounds: 0
Total number of games: 12
