In [6]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Using cached SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Using cached rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Using cached SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Using cached rdflib-7.1.1-py3-none-any.whl (562 kB)
Installing collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jason/nltk_data...


True

# Task 1 (50%)

In [125]:
import requests
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from collections import defaultdict

# DBpedia SPARQL endpoint
DBPEDIA_ENDPOINT = "https://dbpedia.org/sparql"

def get_schema_terms():
    """
    Fetch class, property, and entity terms from DBpedia using requests
    """
    sparql_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    
    SELECT DISTINCT ?entity_uri ?label ?entity_type ?abstract
    WHERE {
        {
            ?entity_uri rdf:type owl:Class ;
                       rdfs:label ?label .
            OPTIONAL { ?entity_uri dbo:abstract ?abstract }
            FILTER(LANG(?label) = "en")
            FILTER(LANG(?abstract) = "en" || !BOUND(?abstract))
            BIND("Class" AS ?entity_type)
        }
        UNION
        {
            ?entity_uri rdf:type rdf:Property ;
                       rdfs:label ?label .
            OPTIONAL { ?entity_uri dbo:abstract ?abstract }
            FILTER(LANG(?label) = "en")
            FILTER(LANG(?abstract) = "en" || !BOUND(?abstract))
            BIND("Property" AS ?entity_type)
        }
        UNION
        {
            ?entity_uri rdf:type ?someType ;
                       rdfs:label ?label .
            OPTIONAL { ?entity_uri dbo:abstract ?abstract }
            FILTER(LANG(?label) = "en")
            FILTER(LANG(?abstract) = "en" || !BOUND(?abstract))
            FILTER(?someType != owl:Class && ?someType != rdf:Property)
            BIND("Entity" AS ?entity_type)
        }
    }
    ORDER BY ?entity_type ?label
    LIMIT 10000
    """

    # Send the request to DBpedia
    response = requests.get(
        DBPEDIA_ENDPOINT,
        params={
            'query': sparql_query,
            'format': 'json'
        }
    )

    if response.status_code != 200:
        raise Exception(f"Error querying DBpedia: {response.status_code}")

    results = response.json()
    bindings = results.get('results', {}).get('bindings', [])

    # Process results into separate class, property, and entity lists
    classes = []
    properties = []
    entities = []
    
    for result in bindings:
        entity_uri = result['entity_uri']['value']
        label = result['label']['value']
        entity_type = result['entity_type']['value']
        
        if entity_type == 'Class':
            classes.append(entity_uri)
        elif entity_type == 'Property':
            properties.append(entity_uri)
        else:
            entities.append(entity_uri)

    # Create DataFrame for visualization
    data = [
        {
            'Entity URI': result['entity_uri']['value'],
            'Label': result['label']['value'],
            'Entity Type': result['entity_type']['value']
        }
        for result in bindings
    ]
    df = pd.DataFrame(data)
    
    # Add some basic analysis
    type_counts = df['Entity Type'].value_counts()
    print("\nDistribution of entity types:")
    print(type_counts)
    
    print("\nSample of retrieved data:")
    print(df.head())
    
    return df

def detect_ambiguity_dict(terms):
    """
    Detect ambiguous terms using WordNet and calculate normalized ambiguity scores.
    Returns a dictionary with terms and their normalized ambiguity scores (0-1),
    where 1 represents maximum ambiguity found in the dataset.
    """
    # First pass: collect all synset counts to find the maximum
    term_synset_counts = {}
    max_synsets = 0
    
    for term in terms:
        # Extract the last part of the URI as the term name
        term_name = term.split("/")[-1].split("#")[-1]
        # Split by camelCase or underscore and take the last meaningful part
        term_parts = term_name.replace("_", " ").split()
        
        if term_parts:
            # Check the last meaningful word for ambiguity
            word = term_parts[-1].lower()
            synsets = wn.synsets(word)
            synset_count = len(synsets)
            term_synset_counts[term] = {
                'word': word,
                'count': synset_count,
                'synsets': synsets
            }
            max_synsets = max(max_synsets, synset_count)

    # Second pass: normalize scores and structure the output
    ambiguous_terms = defaultdict(list)
    
    for term, data in term_synset_counts.items():
        if data['count'] > 0:  # Only include terms that have at least one synset
            # Calculate normalized score (0-1)
            normalized_score = data['count'] / max_synsets if max_synsets > 0 else 0
            
            ambiguous_terms[data['word']].append({
                'uri': term,
                'raw_synset_count': data['count'],
                'normalized_score': round(normalized_score, 3),
                'synsets': [syn.definition() for syn in data['synsets'][:3]]  # Get first 3 definitions
            })
    
    # Add summary statistics
    print(f"\nAmbiguity Analysis Summary:")
    print(f"Maximum number of synsets found: {max_synsets}")
    print(f"Number of terms analyzed: {len(term_synset_counts)}")
    print("\nSample of normalized scores:")
    # Print a few examples of terms with their scores
    sample_count = 0
    for word, entries in ambiguous_terms.items():
        if sample_count < 3:  # Show first 3 examples
            print(f"\nWord: {word}")
            print(f"Normalized ambiguity score: {entries[0]['normalized_score']}")
            print(f"Raw synset count: {entries[0]['raw_synset_count']}")
            sample_count += 1
            
    return ambiguous_terms

def detect_ambiguity_llm(terms):
    """
    Placeholder for LLM-based ambiguity detection
    """
    ambiguous_terms = []
    # Placeholder for LLM-based ambiguity detection
    return ambiguous_terms

In [121]:
df = get_schema_terms()


Distribution of entity types:
Entity Type
Property    4595
Entity      3898
Class       1507
Name: count, dtype: int64

Sample of retrieved data:
                                   Entity URI            Label Entity Type
0        http://dbpedia.org/ontology/Academic  Academic Person       Class
1          http://dbpedia.org/ontology/تعلیمی  Academic Person       Class
2       http://dbpedia.org/ontology/Algorithm        Algorithm       Class
3     http://dbpedia.org/ontology/حساب_و_شمار        Algorithm       Class
4  http://dbpedia.org/ontology/AmericanLeader  American Leader       Class


In [128]:
len(df)

10000

In [126]:
ambiguity_results = detect_ambiguity_dict(df)



Ambiguity Analysis Summary:
Maximum number of synsets found: 9
Number of terms analyzed: 3

Sample of normalized scores:

Word: label
Normalized ambiguity score: 1.0
Raw synset count: 9

Word: type
Normalized ambiguity score: 0.889
Raw synset count: 8


In [127]:
ambiguity_results

defaultdict(list,
            {'label': [{'uri': 'Label',
               'raw_synset_count': 9,
               'normalized_score': 1.0,
               'synsets': ['a brief description given for purposes of identification',
                'trade name of a company that produces musical recordings',
                'a radioactive isotope that is used in a compound in order to trace the mechanism of a chemical reaction']}],
             'type': [{'uri': 'Entity Type',
               'raw_synset_count': 8,
               'normalized_score': 0.889,
               'synsets': ['a subdivision of a particular kind of thing',
                'a person of a specified kind (usually with many eccentricities)',
                '(biology) the taxonomic group whose characteristics are used to define the next higher taxon']}]})

In [32]:
import re

def contains_english_characters(input_string):
    # Check for any English alphabet characters (A-Z or a-z)
    return bool(re.search(r'[A-Za-z]', input_string))

# Example usage
print(contains_english_characters("Hello123"))  # Output: True
print(contains_english_characters("1234!@#$"))  # Output: False


True
False


In [33]:
names_of_classes = [x.split("/")[-1] for x in classes]
names_of_classes = [x for x in names_of_classes if contains_english_characters(x)]
names_of_classes

['Academic',
 'Algorithm',
 'AmericanLeader',
 'Anime',
 'Annotation',
 'ArcherPlayer',
 'Archive',
 'ArtificialSatellite',
 'AustralianRulesFootballPlayer',
 'Band',
 'Biathlete',
 'BiologicalDatabase',
 'Biomolecule',
 'Blazon',
 'BobsleighAthlete',
 'BritishRoyalty',
 'Browser',
 'Covid19',
 'Capital',
 'CapitalOfRegion',
 'CardinalDirection',
 'Caterer',
 'Chief',
 'ChristianBishop',
 'ChristianDoctrine',
 'ChristianPatriarch',
 'Cipher',
 'ComedyGroup',
 'Community',
 'ConcentrationCamp',
 'Skos',
 'ControlledDesignationOfOriginWine',
 'DBpedian',
 'DTMRacer',
 'Database',
 'Desert',
 'DocumentType',
 'Election',
 'ElectionDiagram',
 'Employer',
 'EmployersOrganisation',
 'prov:Entity',
 'EurovisionSongContestEntry',
 'FileSystem',
 'Food',
 'FormulaOneRacer',
 'GaelicGamesPlayer',
 'GatedCommunity',
 'GeneLocation',
 'Globularswarm',
 'Gnetophytes',
 'GovernmentType',
 'GrandPrix',
 'HistoricalCountry',
 'HistoricalDistrict',
 'HistoricalProvince',
 'HistoricalRegion',
 'Historic

In [123]:
import pandas as pd
from nltk.corpus import wordnet as wn
import nltk

# Ensure WordNet is downloaded
nltk.download('wordnet')

# Function to check ambiguity using WordNet by counting senses
def is_ambiguous_with_wordnet(term):
    senses = wn.synsets(term)
    return len(senses) > 3  # Returns True if the term has more than one meaning

# Function to detect ambiguous entities based on WordNet from a DataFrame
def detect_ambiguous_entities_wordnet(df):
    ambiguous_entities = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        label = row['Label']

        # Check if the label has multiple meanings in WordNet
        if is_ambiguous_with_wordnet(label):
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': True
            })
        else:
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': False
            })

    # Convert results to DataFrame for easy display and further processing
    ambiguous_entities_df = pd.DataFrame(ambiguous_entities)
    return ambiguous_entities_df


# Run the ambiguity detection function
ambiguity_results = detect_ambiguous_entities_wordnet(df)

# Display the results
print(ambiguity_results)


[nltk_data] Downloading package wordnet to /home/jason/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             Entity URI  \
0                  http://dbpedia.org/ontology/Academic   
1                    http://dbpedia.org/ontology/تعلیمی   
2                 http://dbpedia.org/ontology/Algorithm   
3               http://dbpedia.org/ontology/حساب_و_شمار   
4            http://dbpedia.org/ontology/AmericanLeader   
...                                                 ...   
9995  http://dbpedia.org/property/fastRiderMotoeCountry   
9996        http://dbpedia.org/property/fastRiderMotoeR   
9997   http://dbpedia.org/property/fastRiderMotoeR2Bike   
9998  http://dbpedia.org/property/fastRiderMotoeR2Co...   
9999        http://dbpedia.org/property/fastRiderMotogp   

                            Label Entity Type  Ambiguity  
0                 Academic Person       Class      False  
1                 Academic Person       Class      False  
2                       Algorithm       Class      False  
3                       Algorithm       Class      Fals

In [124]:
ambiguity_results

Unnamed: 0,Entity URI,Label,Entity Type,Ambiguity
0,http://dbpedia.org/ontology/Academic,Academic Person,Class,False
1,http://dbpedia.org/ontology/تعلیمی,Academic Person,Class,False
2,http://dbpedia.org/ontology/Algorithm,Algorithm,Class,False
3,http://dbpedia.org/ontology/حساب_و_شمار,Algorithm,Class,False
4,http://dbpedia.org/ontology/AmericanLeader,American Leader,Class,False
...,...,...,...,...
9995,http://dbpedia.org/property/fastRiderMotoeCountry,Fast Rider MotoE Country,Property,False
9996,http://dbpedia.org/property/fastRiderMotoeR,Fast Rider MotoE R,Property,False
9997,http://dbpedia.org/property/fastRiderMotoeR2Bike,Fast Rider MotoE R2 Bike,Property,False
9998,http://dbpedia.org/property/fastRiderMotoeR2Co...,Fast Rider MotoE R2 Country,Property,False


# # Task 2 (50%)

In [1]:
import pandas as pd


In [69]:
esco_skills_short = pd.read_csv('esco_skills_en_2.csv')
esco_skills_short

Unnamed: 0,conceptType,conceptUri,skillType,preferredLabel,altLabels,description
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000f1d3d-220f...,knowledge,Haskell,,The techniques and principles of software deve...
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00506f28-e884...,knowledge,sport and exercise medicine,sports injury treatment\nsports medicine\nexer...,Prevention and treatement of injuries or condi...
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c04e40-35ea...,knowledge,Incremental development,,The incremental development model is a methodo...
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c51318-4ea9...,knowledge,use of special equipment for daily activities,,"The types of special equipment, prosthetics an..."
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c85cbc-2b24...,knowledge,sawing techniques,sawing technologies\nsawing methods\nsawing te...,Various sawing techniques for using manual as ...
...,...,...,...,...,...,...
2897,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc1e455-ced2...,knowledge,Capture One,,The computer program Capture One is a graphica...
2898,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc2465f-d2b5...,knowledge,precious metal processing,processing precious metal\nmethods for process...,Various processing methods on precious metals ...
2899,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc7b5ae-1bae...,knowledge,dependency on drugs,substance dependency\ndependency on substances...,"Dependency on substances such as alcohol, pres..."
2900,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffddfc7c-a9dd...,knowledge,Scala,,The techniques and principles of software deve...


In [53]:
from SPARQLWrapper import SPARQLWrapper, JSON

def query_dbpedia():
    # Set up the SPARQL endpoint and query
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    # Define the SPARQL query
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>

    SELECT DISTINCT ?entityLabel ?classLabel
    WHERE {
      ?entity rdfs:label ?entityLabel .
      ?entity a ?class .
      
      # Filter for English labels only
      FILTER (LANG(?entityLabel) = "en")

      # Optional: Retrieve the class label if available
      OPTIONAL {
        ?class rdfs:label ?classLabel .
        FILTER (LANG(?classLabel) = "en")
      }
    }
    """

    # Set the query and retrieve results
    sparql.setQuery(query)
    results = sparql.query().convert()

    # Process and return results
    result_list = []
    for result in results["results"]["bindings"]:
        entity_label = result.get("entityLabel", {}).get("value", "")
        class_label = result.get("classLabel", {}).get("value", "")
        result_list.append((entity_label, class_label))
    
    return result_list

# Example usage
data = query_dbpedia()
for entity, cls in data[:10]:  # Print the first 10 results for example
    print(f"Entity: {entity}, Class: {cls}")


Entity: OpenLink Software, Class: company
Entity: Cabaiguán (cigar), Class: company
Entity: Cabal (software), Class: company
Entity: Cabalen, Class: company
Entity: Caballero Home Video, Class: company
Entity: Cabana Cachaça, Class: company
Entity: Cabart, Class: company
Entity: Cabasse (company), Class: company
Entity: Cabe Rawit Marketing Communications, Class: company
Entity: Cabela's, Class: company


In [None]:
# Run the DBpedia query function to get the list of (entity, class) pairs
query_results = query_dbpedia()

# Extract entity labels from the query results and store in a set for faster lookups
entity_labels_from_dbpedia = set(entity for entity, _ in query_results)

# Define a new column to indicate if each entityLabel in the DataFrame is found in the DBpedia results
esco_skills_short['FoundInDbpedia'] = esco_skills_short['preferredLabel'].apply(lambda x: x in entity_labels_from_dbpedia)

# Display the DataFrame with the new column
display(esco_skills_short)


Unnamed: 0,conceptType,conceptUri,skillType,preferredLabel,altLabels,description,FoundInDbpedia
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000f1d3d-220f...,knowledge,Haskell,,The techniques and principles of software deve...,False
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00506f28-e884...,knowledge,sport and exercise medicine,sports injury treatment\nsports medicine\nexer...,Prevention and treatement of injuries or condi...,False
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c04e40-35ea...,knowledge,Incremental development,,The incremental development model is a methodo...,False
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c51318-4ea9...,knowledge,use of special equipment for daily activities,,"The types of special equipment, prosthetics an...",False
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c85cbc-2b24...,knowledge,sawing techniques,sawing technologies\nsawing methods\nsawing te...,Various sawing techniques for using manual as ...,False
...,...,...,...,...,...,...,...
2897,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc1e455-ced2...,knowledge,Capture One,,The computer program Capture One is a graphica...,False
2898,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc2465f-d2b5...,knowledge,precious metal processing,processing precious metal\nmethods for process...,Various processing methods on precious metals ...,False
2899,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffc7b5ae-1bae...,knowledge,dependency on drugs,substance dependency\ndependency on substances...,"Dependency on substances such as alcohol, pres...",False
2900,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffddfc7c-a9dd...,knowledge,Scala,,The techniques and principles of software deve...,False


In [61]:
esco_skills_short[esco_skills_short['FoundInDbpedia'] == True]

Unnamed: 0,conceptType,conceptUri,skillType,preferredLabel,altLabels,description,FoundInDbpedia


# Use an LLM to judge if the entity-class pairs are accurate and describe your findings, including which DBPedia classes tend to have the most mistakes.

- we use an improved query in order to search the terms in batches

In [70]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm  # Useful for displaying a progress bar

# DBpedia SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)  # Set return format once

# Optimized function to perform a SPARQL query for a list of skill labels
def query_dbpedia(skills):
    # Prepare SPARQL query with placeholders for skills
    skill_filters = " || ".join(
        [f'CONTAINS(LCASE(?entityLabel), "{skill.lower()}")' for skill in skills]
    )

    sparql_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>

    SELECT ?entity ?entityLabel ?class ?classLabel
    WHERE {{
      ?entity rdfs:label ?entityLabel .
      ?entity a ?class .

      # Filter for entities matching any of the skills in English
      FILTER (LANG(?entityLabel) = "en" && ({skill_filters}))

      # Optional: Retrieve the class label if available
      OPTIONAL {{
        ?class rdfs:label ?classLabel .
        FILTER (LANG(?classLabel) = "en")
      }}
    }}
    LIMIT 100
    """

    # Execute the query and parse results
    sparql.setQuery(sparql_query)
    try:
        results = sparql.query().convert()
        output = [
            {
                "entity": result.get("entity", {}).get("value", ""),
                "entityLabel": result.get("entityLabel", {}).get("value", ""),
                "class": result.get("class", {}).get("value", ""),
                "classLabel": result.get("classLabel", {}).get("value", ""),
            }
            for result in results["results"]["bindings"]
        ]
    except Exception as e:
        print(f"Error querying DBpedia:", e)
        output = []

    return output

# Batch skills into chunks to reduce the number of queries
batch_size = 50  # Adjust as needed based on performance or endpoint limits
all_results = []

esco_skills_short = esco_skills_short
for i in tqdm(range(0, len(esco_skills_short["preferredLabel"]), batch_size)):
    skill_batch = esco_skills_short["preferredLabel"][i : i + batch_size]
    skill_results = query_dbpedia(skill_batch)
    for result in skill_results:
        result["preferredLabel"] = next(
            (skill for skill in skill_batch if skill.lower() in result["entityLabel"].lower()), ""
        )
        all_results.append(result)

# Convert results to a DataFrame
results_df = pd.DataFrame(all_results)

# Display the first few rows of the results
print(results_df.head())


100%|██████████| 59/59 [10:29<00:00, 10.67s/it]

                                              entity  \
0  http://dbpedia.org/resource/Cambridge_Institut...   
1  http://dbpedia.org/resource/Cambridge_Institut...   
2  http://dbpedia.org/resource/Cambridge_Institut...   
3  http://dbpedia.org/resource/Cambridge_Institut...   
4  http://dbpedia.org/resource/Cambridge_Institut...   

                          entityLabel  \
0  Cambridge Institute of Criminology   
1  Cambridge Institute of Criminology   
2  Cambridge Institute of Criminology   
3  Cambridge Institute of Criminology   
4  Cambridge Institute of Criminology   

                                               class               classLabel  \
0                  http://dbpedia.org/ontology/Agent                    agent   
1                  http://dbpedia.org/ontology/Agent                    agent   
2  http://dbpedia.org/ontology/EducationalInstitu...  educational institution   
3  http://dbpedia.org/ontology/EducationalInstitu...  educational institution   
4          




In [71]:
results_df

Unnamed: 0,entity,entityLabel,class,classLabel,preferredLabel
0,http://dbpedia.org/resource/Cambridge_Institut...,Cambridge Institute of Criminology,http://dbpedia.org/ontology/Agent,agent,criminology
1,http://dbpedia.org/resource/Cambridge_Institut...,Cambridge Institute of Criminology,http://dbpedia.org/ontology/Agent,agent,criminology
2,http://dbpedia.org/resource/Cambridge_Institut...,Cambridge Institute of Criminology,http://dbpedia.org/ontology/EducationalInstitu...,educational institution,criminology
3,http://dbpedia.org/resource/Cambridge_Institut...,Cambridge Institute of Criminology,http://dbpedia.org/ontology/EducationalInstitu...,educational institution,criminology
4,http://dbpedia.org/resource/Cambridge_Institut...,Cambridge Institute of Criminology,http://dbpedia.org/ontology/Organisation,organisation,criminology
...,...,...,...,...,...
5655,http://dbpedia.org/resource/Cangrande_I_della_...,Cangrande I della Scala,http://dbpedia.org/class/yago/Communicator1096...,,Scala
5656,http://dbpedia.org/resource/Cangrande_I_della_...,Cangrande I della Scala,http://dbpedia.org/class/yago/HeadOfState11016...,,Scala
5657,http://dbpedia.org/resource/Cangrande_I_della_...,Cangrande I della Scala,http://dbpedia.org/class/yago/LivingThing10000...,,Scala
5658,http://dbpedia.org/resource/Cangrande_I_della_...,Cangrande I della Scala,http://dbpedia.org/class/yago/Negotiator110351874,,Scala


In [72]:
results_df.preferredLabel.unique()

array(['criminology', 'Haskell', 'cardiovascular system', 'Erlang',
       'preventive medicine', 'legal studies', 'mergers and acquisitions',
       'environmental policy', 'osteology', 'literature', 'Lisp',
       'Christianity', 'Sass', 'communication', 'Spanish', 'fine arts',
       'audiology', 'energy', 'astrology', 'Malay', 'Montenegrin',
       'sexology', 'LESS', 'logic', 'history', 'cartography', 'radiology',
       'political science', 'journalism', 'Arabic', 'Ukrainian',
       'pharmacy law', 'business law', 'Japanese', 'cameras',
       'photography', 'politics', 'yoga', 'viticulture', 'herpetology',
       'law enforcement', 'Perl', 'PHP', 'mathematics', 'database',
       'German', 'football', 'social sciences', 'Romanian',
       'comparative literature', 'orthodontics', 'textile industry',
       'landscape architecture', 'consultation', 'R', 'public relations',
       'algebra', 'financial management', 'genetics', 'plastic surgery',
       'archaeology', 'APL', 'auti

In [77]:
all_labels = results_df.preferredLabel.unique()

saved_entities = dict()
for label in all_labels:

    sub_df = results_df[results_df['preferredLabel'] == label]
    saved_entities[label] = list(set(list(sub_df['entityLabel'].unique())))

saved_entities

{'criminology': ['Cambridge Institute of Criminology'],
 'Haskell': ['Captain Robert Haskell House'],
 'cardiovascular system': ['Cardiovascular system'],
 'Erlang': ['Carlos Berlanga', 'Carlo von Erlanger'],
 'preventive medicine': ['American Journal of Preventive Medicine',
  'Preventive medicine',
  'Preventive medicine journals',
  'American Board of Preventive Medicine'],
 'legal studies': ['Aminu Kano College of Islamic Legal Studies',
  'American College of History & Legal Studies'],
 'mergers and acquisitions': ['1962 mergers and acquisitions',
  '1920 mergers and acquisitions',
  '1907 mergers and acquisitions'],
 'environmental policy': ['Environmental policy in the United States'],
 'osteology': ['Osteology'],
 'literature': ['Canadian Literature', 'Canadian literature'],
 'Lisp': ['Catalina de los Ríos y Lisperguer', 'Canalispira fluctuata'],
 'Christianity': ['1400s in Christianity',
  '1501 in Christianity',
  '1500s in Christianity',
  '1110s in Christianity',
  '1360s i

In [111]:
input_prompt = """
You are tasked with evaluating the accuracy of entity-class matches from DBpedia, derived from Wikipedia and returning them in VALID JSON strings. For each entity, assess the matched DBpedia classes and assign a score:

Instructions: For each entity-class pair, assign a score from 1 to 5 based on the match accuracy:

5: Exact match
4: Strong match
3: Moderate match
2: Weak match
1: Incorrect match
After scoring, calculate the average score for each DBpedia class and identify "error-prone" classes with an average below 3. Provide a brief description of misclassification patterns.

Example Input:

Entity: "New York City" -> Matched Classes: City, Place, AdministrativeRegion, Settlement
Output: {{"New York City": [5, 4, 5, 5] }}

Entity: "Mars Rover" → Classes: Spacecraft, Vehicle, Machine
Output: {{"Mars Rover": [5, 4, 2] }}


Assign a score for the following:
Entity: "{preferred_label}" -> Matched Classes: {matched_labels}


# Rules
- Do not write anything before or after the JSON. Directly output the JSON object.
- Do not add any explanation or comments.
- Return only valid JSON objects


JSON:{{"""


# Prompting the LLM to characterise

In [82]:
from llama_index.llms.ollama import Ollama
from llama_index.llms.gemini import Gemini
import sys
import os
from dotenv import load_dotenv
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import os
import typing
from typing import Any, Dict, Optional, Sequence
from dotenv import load_dotenv, find_dotenv

# Locate .env file, even if it's up in parent directories
load_dotenv(find_dotenv())



False

In [90]:
from llama_index.llms.ollama import Ollama
ollama = Ollama(model="llama3.1:latest", request_timeout=60.0, temperature=0, max_tokens=2000)


In [85]:
saved_entities['literature']


['Canadian Literature', 'Canadian literature']

In [112]:
response = ollama.complete(input_prompt.format(preferred_label='literature', matched_labels=saved_entities['literature']))


In [115]:
saved_entities

{'criminology': ['Cambridge Institute of Criminology'],
 'Haskell': ['Captain Robert Haskell House'],
 'cardiovascular system': ['Cardiovascular system'],
 'Erlang': ['Carlos Berlanga', 'Carlo von Erlanger'],
 'preventive medicine': ['American Journal of Preventive Medicine',
  'Preventive medicine',
  'Preventive medicine journals',
  'American Board of Preventive Medicine'],
 'legal studies': ['Aminu Kano College of Islamic Legal Studies',
  'American College of History & Legal Studies'],
 'mergers and acquisitions': ['1962 mergers and acquisitions',
  '1920 mergers and acquisitions',
  '1907 mergers and acquisitions'],
 'environmental policy': ['Environmental policy in the United States'],
 'osteology': ['Osteology'],
 'literature': ['Canadian Literature', 'Canadian literature'],
 'Lisp': ['Catalina de los Ríos y Lisperguer', 'Canalispira fluctuata'],
 'Christianity': ['1400s in Christianity',
  '1501 in Christianity',
  '1500s in Christianity',
  '1110s in Christianity',
  '1360s i

# We will focus on the first 10 max

In [117]:
generated_outputs = []
for item in saved_entities:
    response = ollama.complete(input_prompt.format(preferred_label=item, matched_labels=saved_entities[item][0:10]))
    print(response.text)
    generated_outputs.append({"item": item, "db_pedia_classes": saved_entities[item], "response": response.text})

{"criminology": [5]}
{"Haskell": [5]}
{"cardiovascular system": [5]}
{"Erlang": [1, 1]}
{"preventive medicine": [5, 4, 2, 1]}
{"legal studies": [1, 5]}
{"mergers and acquisitions": [1, 1, 1]}
{"environmental policy": [5]}
{"osteology": [5]}
{"literature": [5, 1]}
{"Lisp": [1, 1]}
{"Christianity": [5, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"Sass": [1, 5, 1, 1]}
{"communication": [1, 1, 1, 1, 1, 1, 1]}
{"Spanish": [1, 1, 1, 1, 1]}
{"fine arts": [1, 2, 5, 3]}
{"audiology": [5]}
{"energy": [1, 1, 1, 1, 1, 1]}
{"astrology": [5, 1]}
{"Malay": [1, 1, 1, 1, 1, 1, 1, 1, 5, 1]}
{"Montenegrin": [3, 1, 1]}
{"sexology": [5]}
{"LESS": [5, 1, 1, 1, 1, 2, 3, 1]}
{"logic": [1, 1, 1, 1, 1, 1, 1]}
{"history": [1, 2]}
{"cartography": [5, 1]}
{"radiology": [1, 5, 5, 5]}
{"political science": [1, 5]}
{"journalism": [5, 1, 2, 3, 4, 1, 2, 3, 4]}
{"Arabic": [1, 5, 1, 1]}
{"Ukrainian": [1, 2, 3]}
{"pharmacy law": [5]}
{"business law": [5]}
{"Entity": "Japanese", "Matched Classes": [1, 1, 1]}
{"cameras": [1]}
{"photograph