# A Jupyter Notebook to Compute Metrics on the Synthetically Generated NIF Dataset and Perform Advanced Analyses
This notebook provides a comprehensive analysis of the synthetic NIF dataset, comparing it with the prior dataset to evaluate metrics such as entity consistency, document coverage, and topic modeling.

## Import Required Libraries
Import necessary libraries such as `pandas`, `rdflib`, `sklearn`, `nltk`, and `sparqlwrapper`.

In [1]:
# Import Required Libraries
import pandas as pd
from rdflib import Graph
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from SPARQLWrapper import SPARQLWrapper, JSON

# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/webscistorage/wf7467/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load NIF Datasets
Load the prior and synthetic NIF datasets using `rdflib` or a similar library.

In [2]:
# Load NIF Datasets using PyNIF
from pynif import NIFCollection

# Paths to the datasets
prior_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/AIDA-YAGO2-dataset.tsv_nif"
synthetic_dataset_path = "/mnt/webscistorage/wf7467/agnos/data/generated_dataset_42.nif"

# Load datasets
print("Loading prior dataset...")
with open(prior_dataset_path, "r", encoding="utf-8") as prior_file:
    prior_collection = NIFCollection.loads(prior_file.read(), format="turtle")
print(f"Finished Loading prior dataset with {len(prior_collection.contexts)} contexts.")

print("Loading synthetic dataset...")
with open(synthetic_dataset_path, "r", encoding="utf-8") as synthetic_file:
    synthetic_collection = NIFCollection.loads(synthetic_file.read(), format="turtle")
print(f"Finished loading synthetic dataset with {len(synthetic_collection.contexts)} contexts.")

Loading prior dataset...
Finished Loading prior dataset with 1393 contexts.
Loading synthetic dataset...
Finished loading synthetic dataset with 888 contexts.


In [10]:
mention_count = 0
for context in synthetic_collection.contexts:
    print(f"{len(context.phrases)}: {context.phrases}")
    mention_count += len(context.phrases)
print(f"Total mentions in synthetic dataset: {mention_count}")


11: [<NIFPhrase 5-10: 'Dylan'>, <NIFPhrase 51-59: 'NEW YORK'>, <NIFPhrase 853-861: 'Canadian'>, <NIFPhrase 116-124: 'American'>, <NIFPhrase 139-148: 'Bob Dylan'>, <NIFPhrase 361-366: 'Dylan'>, <NIFPhrase 388-396: 'New York'>, <NIFPhrase 474-482: 'American'>, <NIFPhrase 491-498: 'Chicago'>, <NIFPhrase 652-657: 'Dylan'>, <NIFPhrase 807-812: 'Dylan'>]
14: [<NIFPhrase 0-6: 'French'>, <NIFPhrase 42-49: 'Algeria'>, <NIFPhrase 531-537: 'France'>, <NIFPhrase 596-603: 'Algeria'>, <NIFPhrase 802-808: 'French'>, <NIFPhrase 827-834: 'Algeria'>, <NIFPhrase 53-60: 'ALGIERS'>, <NIFPhrase 75-81: 'French'>, <NIFPhrase 112-118: 'French'>, <NIFPhrase 149-156: 'Algeria'>, <NIFPhrase 188-194: 'French'>, <NIFPhrase 360-367: 'Algeria'>, <NIFPhrase 434-440: 'French'>, <NIFPhrase 511-514: 'AFP'>]
3: [<NIFPhrase 0-9: 'Arlington'>, <NIFPhrase 12-20: 'Virginia'>, <NIFPhrase 62-72: 'WASHINGTON'>]
15: [<NIFPhrase 47-53: 'VIENNA'>, <NIFPhrase 156-177: 'Vienna Stock Exchange'>, <NIFPhrase 1713-1721: 'Austrian'>, <NIF

## File stuffz

In [3]:
# Add this after the import section
import os
import json
from datetime import datetime

# Create a results directory for storing metrics
results_dir = "/mnt/webscistorage/wf7467/agnos/analysis_results"
os.makedirs(results_dir, exist_ok=True)

# Generate a timestamp for the output files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
metrics_file = os.path.join(results_dir, f"dataset_metrics_{timestamp}.json")
type_transformations_file = os.path.join(results_dir, f"type_transformations_{timestamp}.json")

# Dictionary to store all metrics for output
all_metrics = {
    "dataset_info": {
        "prior_dataset": prior_dataset_path,
        "synthetic_dataset": synthetic_dataset_path,
        "timestamp": timestamp
    },
    "entity_counts": {},
    "mention_metrics": {},
    "type_consistency": {},
    "topic_coverage": {}
}

## Count Entities and Documents
Count the total number of entities and documents in the synthetic dataset.

In [4]:
# Count Entities and Documents
def count_entities_and_documents(nif_collection):
    # Count the number of contexts (documents)
    document_count = len(nif_collection.contexts)
    
    # Count the number of unique entities
    entity_set = set()
    for context in nif_collection.contexts:
        for phrase in context.phrases:
            if phrase.taIdentRef:
                entity_set.add(phrase.taIdentRef)
    
    entity_count = len(entity_set)
    return entity_count, document_count

# Count for prior dataset
prior_entity_count, prior_document_count = count_entities_and_documents(prior_collection)
print(f"Prior Dataset: {prior_entity_count} entities, {prior_document_count} documents.")

# Count for synthetic dataset
synthetic_entity_count, synthetic_document_count = count_entities_and_documents(synthetic_collection)
print(f"Synthetic Dataset: {synthetic_entity_count} entities, {synthetic_document_count} documents.")

Prior Dataset: 5598 entities, 1393 documents.
Synthetic Dataset: 4120 entities, 888 documents.


## Compute Average Mentions and Candidates
Calculate the average mentions per document and the average candidates per document/mention.

In [5]:
# Compute Average Mentions and Candidates
def compute_average_mentions_and_candidates(nif_collection):
    total_mentions = 0
    total_candidates = 0
    document_count = len(nif_collection.contexts)

    for context in nif_collection.contexts:
        mentions_in_doc = len(context.phrases)
        total_mentions += mentions_in_doc

        for phrase in context.phrases:
            if phrase.taIdentRef:
                # Assuming candidates are stored in a list or similar structure
                # Replace `len(phrase.candidates)` with the actual way candidates are stored
                total_candidates += 1  # Increment for each candidate (adjust if needed)

    avg_mentions_per_doc = total_mentions / document_count if document_count > 0 else 0
    avg_candidates_per_mention = total_candidates / total_mentions if total_mentions > 0 else 0

    return avg_mentions_per_doc, avg_candidates_per_mention

# Compute for synthetic dataset
avg_mentions, avg_candidates = compute_average_mentions_and_candidates(synthetic_collection)
print(f"Average mentions per document: {avg_mentions:.2f}")
print(f"Average candidates per mention: {avg_candidates:.2f}")

Average mentions per document: 17.25
Average candidates per mention: 1.00


## Analyse RDF Type Consistency
Send SPARQL queries to the DBpedia endpoint to compare `rdf:type` between the prior and synthetic datasets and compute type consistency coverage.

In [6]:
# Analyse RDF Type Consistency
from sparqlutils import query_multiple_uris, build_get_type
from collections import Counter
import json
import time
import glob

def analyse_rdf_type_consistency(nif_collection, collection_name, continue_from=None, retry_delay=60, prefix_uri_from="http://en.wikipedia.org/wiki/", prefix_uri_to="http://dbpedia.org/resource/"):
    # Collect all entities from the NIF collection
    entities = set()
    document_entities = {}
    
    for context in nif_collection.contexts:
        # Use the document's URI as the key
        key = context.uri
        phrases_entities = []
        
        for phrase in context.phrases:
            entity = phrase.taIdentRef
            if entity:
                # Make sure it is a DBpedia entity and not from Wikipedia
                #entity = entity.replace("http://en.wikipedia.org/wiki/", "http://dbpedia.org/resource/")
                entity = entity.replace(prefix_uri_from, prefix_uri_to)
                entities.add(entity)
                # Store the actual mention text along with the entity URI
                phrases_entities.append({
                    "mention": phrase.mention,
                    "entity": entity,
                    "beginIndex": phrase.beginIndex,
                    "endIndex": phrase.endIndex
                })
        
        document_entities[key] = phrases_entities
    
    # Load previously saved types if continuing from an existing file
    entity_types = {}
    start_index = 0
    
    if continue_from:
        try:
            with open(continue_from, 'r') as f:
                entity_types = json.load(f)
            print(f"Loaded {len(entity_types)} entity types from {continue_from}")
            
            # Calculate which entities are already processed
            processed_entities = set(entity_types.keys())
            entities_to_process = entities - processed_entities
            entity_list = list(entities_to_process)
            print(f"Continuing with {len(entities_to_process)} remaining entities out of {len(entities)} total")
        except Exception as e:
            print(f"Error loading previous results: {e}, starting fresh")
            entity_list = list(entities)
    else:
        # Find the most recent intermediate file for this collection
        pattern = os.path.join(results_dir, f"{collection_name}_types_intermediate_*.json")
        existing_files = glob.glob(pattern)
        
        if existing_files:
            most_recent = max(existing_files, key=os.path.getctime)
            try:
                with open(most_recent, 'r') as f:
                    entity_types = json.load(f)
                print(f"Automatically continuing from most recent file: {most_recent}")
                print(f"Loaded {len(entity_types)} entity types")
                
                # Calculate which entities are already processed
                processed_entities = set(entity_types.keys())
                entities_to_process = entities - processed_entities
                entity_list = list(entities_to_process)
                print(f"Continuing with {len(entities_to_process)} remaining entities out of {len(entities)} total")
            except Exception as e:
                print(f"Error loading recent results: {e}, starting fresh")
                entity_list = list(entities)
        else:
            entity_list = list(entities)
    
    if not entity_list:
        print(f"All entities already processed for {collection_name}. No queries needed.")
        return entity_types, document_entities
    
    # Query DBpedia for rdf:type information
    step = 50  # Query in batches to avoid exceeding SPARQL limits
    
    # Create intermediate file to save types as they're being retrieved
    intermediate_types_file = os.path.join(results_dir, f"{collection_name}_types_intermediate_{timestamp}.json")
    
    for i in range(0, len(entity_list), step):
        batch = entity_list[i:i + step]
        print(f"Querying types for entities {i} to {min(i+step, len(entity_list))} of {len(entity_list)}")
        
        try:
            results = query_multiple_uris(batch, build_get_type)
            for entity, types in results.items():
                entity_types[entity] = list(types) if isinstance(types, set) else [types]
            
            # Save intermediate results after each batch
            with open(intermediate_types_file, 'w') as f:
                json.dump(entity_types, f, indent=2)
            print(f"Saved intermediate types ({len(entity_types)} entities) to {intermediate_types_file}")
            
        except Exception as e:
            print(f"Error querying batch {i}-{i+step}: {e}")
            print(f"Saved progress to {intermediate_types_file} with {len(entity_types)} entities")
            print(f"To continue, run the function with continue_from='{intermediate_types_file}'")
            print(f"Waiting {retry_delay} seconds before retrying...")
            time.sleep(retry_delay)
            # Try again with the same batch
            i -= step
    
    # Save final types to a persistent file
    final_types_file = os.path.join(results_dir, f"{collection_name}_entity_types_{timestamp}.json")
    with open(final_types_file, 'w') as f:
        json.dump(entity_types, f, indent=2)
    print(f"Saved all entity types to {final_types_file}")
    
    return entity_types, document_entities

# Get RDF types for both datasets with continuation capability
print("Analyzing synthetic dataset types...")
# To continue from a specific file, uncomment and specify the file path:
#synthetic_continue_from = "/mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_20250425_164755.json"
synthetic_entity_types, synthetic_doc_entities = analyse_rdf_type_consistency(
    synthetic_collection, "synthetic", #continue_from=synthetic_continue_from
)
print(f"Retrieved types for {len(synthetic_entity_types)} entities in synthetic dataset")

print("Analyzing prior dataset types...")
# To continue from a specific file, uncomment and specify the file path:
# prior_continue_from = "/mnt/webscistorage/wf7467/agnos/analysis_results/prior_types_intermediate_20250425_164755.json"
prior_entity_types, prior_doc_entities = analyse_rdf_type_consistency(
    prior_collection, "prior" #, continue_from=prior_continue_from
)
print(f"Retrieved types for {len(prior_entity_types)} entities in prior dataset")

Analyzing synthetic dataset types...
Automatically continuing from most recent file: /mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_20250426_091152.json
Loaded 3807 entity types
Continuing with 313 remaining entities out of 4120 total
Querying types for entities 0 to 50 of 313
Saved intermediate types (3807 entities) to /mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_20250426_091420.json
Querying types for entities 50 to 100 of 313
Saved intermediate types (3807 entities) to /mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_20250426_091420.json
Querying types for entities 100 to 150 of 313
Saved intermediate types (3807 entities) to /mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_20250426_091420.json
Querying types for entities 150 to 200 of 313
Saved intermediate types (3807 entities) to /mnt/webscistorage/wf7467/agnos/analysis_results/synthetic_types_intermediate_202

In [7]:
# Compute type consistency metrics with enhanced output
def compute_detailed_type_consistency(prior_doc_entities, synthetic_doc_entities, prior_entity_types, synthetic_entity_types):
    # Track document-level and entity-level type consistency
    doc_consistency_scores = {}
    type_transformations = {}
    entity_mappings = {}  # Track entity-level mappings with document context
    total_docs = 0
    consistent_docs = 0
    
    # Find documents that exist in both datasets using URI mapping
    common_docs = set(prior_doc_entities.keys()) & set(synthetic_doc_entities.keys())
    
    for doc_uri in common_docs:
        total_docs += 1
        # Make sure they are sorted by begin index to ensure correct order of entity comparisons
        prior_entities =         prior_doc_entities[doc_uri]
        prior_entities.sort(key=lambda x: x["beginIndex"])
        synthetic_entities = synthetic_doc_entities[doc_uri]
        synthetic_entities.sort(key=lambda x: x["beginIndex"])
        #print(f"prior_doc_entities[{doc_uri}]: ", prior_doc_entities[doc_uri])        
        # Unsorted prior_entities / synthetic_entities --> potential cause of inconsistency
        #prior_entities = prior_doc_entities[doc_uri]
        #synthetic_entities = synthetic_doc_entities[doc_uri]
        
        doc_consistency = {
            "prior_entities": len(prior_entities),
            "synthetic_entities": len(synthetic_entities),
            "entity_consistency": []
        }
        
        # Match entities by position/index when possible
        for i in range(min(len(prior_entities), len(synthetic_entities))):
            prior_entity_info = prior_entities[i]
            synthetic_entity_info = synthetic_entities[i]
            
            prior_uri = prior_entity_info["entity"]
            synthetic_uri = synthetic_entity_info["entity"]
            
            # Fix: Handle nested lists in entity_types
            prior_types_raw = prior_entity_types.get(prior_uri, [])
            synthetic_types_raw = synthetic_entity_types.get(synthetic_uri, [])
            
            # Flatten any nested lists and convert to set
            prior_types = set()
            if isinstance(prior_types_raw, list):
                if prior_types_raw and isinstance(prior_types_raw[0], list):
                    for sublist in prior_types_raw:
                        prior_types.update(sublist)
                else:
                    prior_types = set(prior_types_raw)
            else:
                prior_types = {prior_types_raw}
                
            synthetic_types = set()
            if isinstance(synthetic_types_raw, list):
                if synthetic_types_raw and isinstance(synthetic_types_raw[0], list):
                    for sublist in synthetic_types_raw:
                        synthetic_types.update(sublist)
                else:
                    synthetic_types = set(synthetic_types_raw)
            else:
                synthetic_types = {synthetic_types_raw}
            
            # Calculate consistency for this entity
            shared_types = prior_types & synthetic_types
            all_types = prior_types | synthetic_types
            consistency = len(shared_types) / len(all_types) if all_types else 0
            
            # Track type transformations globally
            for p_type in prior_types:
                if p_type not in type_transformations:
                    type_transformations[p_type] = Counter()
                
                for s_type in synthetic_types:
                    type_transformations[p_type][s_type] += 1
            
            # Store entity-level mapping with document context
            entity_mapping = {
                "document_uri": doc_uri,
                "prior_entity": prior_uri,
                "prior_mention": prior_entity_info["mention"],
                "prior_types": list(prior_types),
                "synthetic_entity": synthetic_uri,
                "synthetic_mention": synthetic_entity_info["mention"],
                "synthetic_types": list(synthetic_types),
                "consistency_score": consistency,
                "shared_types": list(shared_types),
                "prior_only_types": list(prior_types - synthetic_types),
                "synthetic_only_types": list(synthetic_types - prior_types)
            }
            
            # Track entity mappings by prior entity
            if prior_uri not in entity_mappings:
                entity_mappings[prior_uri] = []
            entity_mappings[prior_uri].append(entity_mapping)
            
            # Store entity consistency details in document
            entity_consistency = {
                "prior_entity": prior_uri,
                "prior_mention": prior_entity_info["mention"],
                "synthetic_entity": synthetic_uri,
                "synthetic_mention": synthetic_entity_info["mention"],
                "consistency_score": consistency,
                "shared_types": list(shared_types),
                "prior_only_types": list(prior_types - synthetic_types),
                "synthetic_only_types": list(synthetic_types - prior_types)
            }
            
            doc_consistency["entity_consistency"].append(entity_consistency)
        
        # Calculate overall document consistency
        if doc_consistency["entity_consistency"]:
            doc_avg_consistency = sum(e["consistency_score"] for e in doc_consistency["entity_consistency"]) / len(doc_consistency["entity_consistency"])
            doc_consistency["average_consistency"] = doc_avg_consistency
            
            if doc_avg_consistency > 0.5:  # Consider document consistent if average is above 50%
                consistent_docs += 1
        else:
            doc_consistency["average_consistency"] = 0
            
        doc_consistency_scores[doc_uri] = doc_consistency
    
    # Calculate overall consistency metrics
    overall_metrics = {
        "total_documents": total_docs,
        "consistent_documents": consistent_docs,
        "document_consistency_rate": consistent_docs / total_docs if total_docs > 0 else 0,
        "average_document_consistency": sum(doc["average_consistency"] for doc in doc_consistency_scores.values()) / len(doc_consistency_scores) if doc_consistency_scores else 0
    }
    
    # Convert Counter objects to dictionaries for JSON serialization
    type_transformations_dict = {
        src_type: dict(counter) for src_type, counter in type_transformations.items()
    }
    
    return doc_consistency_scores, overall_metrics, type_transformations_dict, entity_mappings

# Compute detailed type consistency metrics with enhanced output
doc_consistency, overall_metrics, type_transformations, entity_mappings = compute_detailed_type_consistency(
    prior_doc_entities, synthetic_doc_entities, prior_entity_types, synthetic_entity_types)

# Save metrics to the all_metrics dictionary
all_metrics["type_consistency"] = {
    "overall_metrics": overall_metrics,
    "document_consistency": doc_consistency
}

# Save entity-level mappings to a separate file
entity_mappings_file = os.path.join(results_dir, f"entity_mappings_{timestamp}.json")
with open(entity_mappings_file, 'w') as f:
    json.dump(entity_mappings, f, indent=2)
print(f"Entity mappings saved to {entity_mappings_file}")

# Save document-level consistency to a separate file
document_consistency_file = os.path.join(results_dir, f"document_consistency_{timestamp}.json")
with open(document_consistency_file, 'w') as f:
    json.dump(doc_consistency, f, indent=2)
print(f"Document consistency saved to {document_consistency_file}")

# Save type transformations to a separate file for detailed analysis
with open(type_transformations_file, 'w') as f:
    json.dump(type_transformations, f, indent=2)
print(f"Type transformations saved to {type_transformations_file}")

# Print summary metrics
print(f"Type Consistency Analysis:")
print(f"Total documents analyzed: {overall_metrics['total_documents']}")
print(f"Documents with consistent entity types: {overall_metrics['consistent_documents']}")
print(f"Document consistency rate: {overall_metrics['document_consistency_rate']:.2%}")
print(f"Average document type consistency: {overall_metrics['average_document_consistency']:.2%}")

# Generate a summarized analysis of type transformations
print("\nGenerating summarized type transformation analysis...")
summary_transformations = {}

# For each source type, find top 5 target types
for src_type, targets in type_transformations.items():
    # Get most common transformations
    most_common = Counter(targets).most_common(5)
    # Calculate transformation percentages
    total = sum(targets.values())
    summary_transformations[src_type] = {
        "total_occurrences": total,
        "top_transformations": [
            {
                "target_type": target,
                "count": count,
                "percentage": count / total if total > 0 else 0
            } for target, count in most_common
        ]
    }

# Save summarized type transformations
type_summary_file = os.path.join(results_dir, f"type_transformations_summary_{timestamp}.json")
with open(type_summary_file, 'w') as f:
    json.dump(summary_transformations, f, indent=2)
print(f"Type transformation summary saved to {type_summary_file}")

Entity mappings saved to /mnt/webscistorage/wf7467/agnos/analysis_results/entity_mappings_20250426_091420.json
Document consistency saved to /mnt/webscistorage/wf7467/agnos/analysis_results/document_consistency_20250426_091420.json
Type transformations saved to /mnt/webscistorage/wf7467/agnos/analysis_results/type_transformations_20250426_091420.json
Type Consistency Analysis:
Total documents analyzed: 888
Documents with consistent entity types: 331
Document consistency rate: 37.27%
Average document type consistency: 46.60%

Generating summarized type transformation analysis...
Type transformation summary saved to /mnt/webscistorage/wf7467/agnos/analysis_results/type_transformations_summary_20250426_091420.json


## Analyse Document Topic Coverage
Use a topic modeling library (e.g., `sklearn` or `gensim`) to compare the topics of the old and new texts and compute topic coverage.

In [8]:
# Analyse Document Topic Coverage
from sklearn.metrics.pairwise import cosine_similarity

def analyse_document_topic_coverage(prior_collection, synthetic_collection, num_topics=10):
    # Extract document text from both collections
    prior_documents = [context.mention for context in prior_collection.contexts]
    synthetic_documents = [context.mention for context in synthetic_collection.contexts]
    
    # Create document ID mapping for matching documents
    prior_uri_to_idx = {context.uri: i for i, context in enumerate(prior_collection.contexts)}
    synthetic_uri_to_idx = {context.uri: i for i, context in enumerate(synthetic_collection.contexts)}
    
    # Find common document URIs
    common_uris = set(prior_uri_to_idx.keys()) & set(synthetic_uri_to_idx.keys())
    
    # Extract matched document pairs
    matched_prior_docs = [prior_documents[prior_uri_to_idx[uri]] for uri in common_uris]
    matched_synthetic_docs = [synthetic_documents[synthetic_uri_to_idx[uri]] for uri in common_uris]
    
    # Perform topic modeling
    vectorizer = TfidfVectorizer(
        max_df=0.95, 
        min_df=2,
        stop_words=stopwords.words('english')
    )
    
    # Fit vectorizer on all documents
    all_docs = prior_documents + synthetic_documents
    vectorizer.fit(all_docs)
    
    # Transform document sets into TF-IDF matrices
    prior_tfidf = vectorizer.transform(matched_prior_docs)
    synthetic_tfidf = vectorizer.transform(matched_synthetic_docs)
    
    # Train LDA model on all documents
    lda = LatentDirichletAllocation(
        n_components=num_topics, 
        random_state=42,
        max_iter=10
    )
    lda.fit(vectorizer.transform(all_docs))
    
    # Get document-topic distributions
    prior_topic_dist = lda.transform(prior_tfidf)
    synthetic_topic_dist = lda.transform(synthetic_tfidf)
    
    # Compute similarity between topic distributions
    topic_similarities = []
    for i in range(len(matched_prior_docs)):
        sim = cosine_similarity([prior_topic_dist[i]], [synthetic_topic_dist[i]])[0][0]
        topic_similarities.append(sim)
    
    # Calculate average similarity
    avg_similarity = sum(topic_similarities) / len(topic_similarities) if topic_similarities else 0
    
    # Get top terms for each topic for interpretability
    feature_names = vectorizer.get_feature_names_out()
    topic_terms = []
    
    for topic_idx, topic in enumerate(lda.components_):
        top_terms = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        topic_terms.append(top_terms)
    
    # Create document mapping with similarities
    doc_topic_similarities = []
    for i, uri in enumerate(common_uris):
        doc_topic_similarities.append({
            "uri": uri,
            "prior_document": matched_prior_docs[i][:100] + "...",  # Truncate for readability
            "synthetic_document": matched_synthetic_docs[i][:100] + "...",
            "topic_similarity": topic_similarities[i]
        })
    
    # Compile results
    results = {
        "average_topic_similarity": avg_similarity,
        "document_count": len(matched_prior_docs),
        "topic_terms": topic_terms,
        "document_similarities": doc_topic_similarities
    }
    
    return results

# Analyze document topic coverage
print("Analyzing document topic coverage...")
doc_topic_results = analyse_document_topic_coverage(prior_collection, synthetic_collection)

# Save results to metrics
all_metrics["topic_coverage"]["document_topics"] = {
    "average_similarity": doc_topic_results["average_topic_similarity"],
    "document_count": doc_topic_results["document_count"],
    "topic_terms": doc_topic_results["topic_terms"]
}

print(f"Document topic analysis complete. Average similarity: {doc_topic_results['average_similarity']:.2%}")

Analyzing document topic coverage...


KeyError: 'average_similarity'

In [None]:
# Save all computed metrics to file
with open(metrics_file, 'w') as f:
    json.dump(all_metrics, f, indent=2)
print(f"All metrics saved to {metrics_file}")

# Create summary file for quick reference
summary_file = os.path.join(results_dir, f"summary_{timestamp}.txt")
with open(summary_file, 'w') as f:
    f.write(f"Dataset Analysis Summary ({timestamp})\n")
    f.write(f"=" * 50 + "\n\n")
    f.write(f"Prior Dataset: {prior_dataset_path}\n")
    f.write(f"Synthetic Dataset: {synthetic_dataset_path}\n\n")
    
    f.write(f"Document Counts:\n")
    f.write(f"  Prior: {prior_document_count}\n")
    f.write(f"  Synthetic: {synthetic_document_count}\n\n")
    
    f.write(f"Entity Counts:\n")
    f.write(f"  Prior: {prior_entity_count}\n")
    f.write(f"  Synthetic: {synthetic_entity_count}\n\n")
    
    f.write(f"Average Mentions per Document: {avg_mentions:.2f}\n")
    f.write(f"Average Candidates per Mention: {avg_candidates:.2f}\n\n")
    
    f.write(f"Type Consistency:\n")
    f.write(f"  Document Consistency Rate: {overall_metrics['document_consistency_rate']:.2%}\n")
    f.write(f"  Average Document Type Consistency: {overall_metrics['average_document_consistency']:.2%}\n\n")
    
    f.write(f"Topic Coverage:\n")
    f.write(f"  Document Topic Similarity: {doc_topic_results['average_similarity']:.2%}\n")

print(f"Summary saved to {summary_file}")

## Analyse Entity Topic Coverage
Use a topic modeling library to compare the topics of the old and new entities' descriptions and compute topic coverage.

In [None]:
# Analyse Entity Topic Coverage
def compute_entity_topic_coverage(entity_descriptions):
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    tfidf_matrix = vectorizer.fit_transform(entity_descriptions)
    
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(tfidf_matrix)
    
    return lda.components_

# Example: Extract entity descriptions from synthetic dataset
synthetic_entity_descriptions = [str(entity) for entity in synthetic_graph.objects(
    predicate=None, object=None)]  # Replace with actual entity description extraction logic
synthetic_entity_topics = compute_entity_topic_coverage(synthetic_entity_descriptions)
print(f"Computed {len(synthetic_entity_topics)} topics for synthetic entities.")