# Knowledge Graphs Assignment 3

## Task I - DBPedia schema ambiguity

In [2]:
# Install rdflib
!pip install rdflib

Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


In [3]:
from collections import defaultdict
from rdflib import Graph
import spacy
from itertools import islice
import time

# attempts #2 to fetch data

### Fetch classes

In [13]:
import requests
import pandas as pd

# Define the SPARQL endpoint URL
sparql_endpoint = "https://dbpedia.org/sparql"

# Define the SPARQL query
sparql_query = """
SELECT DISTINCT ?entity_uri ?label ?entity_type
WHERE {
    {
        ?entity_uri rdf:type owl:Class.
        ?entity_uri rdfs:label ?label.
        FILTER(LANG(?label) = "en")
        BIND("Class" AS ?entity_type)
    }
    UNION
    {
        ?entity_uri rdf:type rdf:Property.
        ?entity_uri rdfs:label ?label.
        FILTER(LANG(?label) = "en")
        BIND("Property" AS ?entity_type)
    }
}
LIMIT 10000
"""

# Send the request to the DBpedia SPARQL endpoint
response = requests.get(sparql_endpoint, params={
    'query': sparql_query,
    'format': 'json'  # Request results in JSON format
})

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    results = response.json()

    # Extract the results into a list of dictionaries
    bindings = results.get('results', {}).get('bindings', [])

    # Convert the results into a pandas DataFrame
    data = []
    for result in bindings:
        entity_uri = result['entity_uri']['value']
        label = result['label']['value']
        entity_type = result['entity_type']['value']
        data.append({'Entity URI': entity_uri, 'Label': label, 'Entity Type': entity_type})

    df = pd.DataFrame(data)

    # Display the DataFrame
    print(df.head())  # Display the first few rows
else:
    print("Error querying DBpedia:", response.status_code)


                             Entity URI     Label Entity Type
0   http://dbpedia.org/ontology/Company   company       Class
1  http://dbpedia.org/ontology/Activity  activity       Class
2      http://dbpedia.org/ontology/Name      name       Class
3    http://dbpedia.org/ontology/Person    person       Class
4     http://dbpedia.org/ontology/Actor     actor       Class


In [27]:
# Split the DataFrame into two based on 'Entity Type'
df_class = df[df['Entity Type'] == 'Class']
df_property = df[df['Entity Type'] == 'Property']

# Show the resulting DataFrames
print("Entity Type 'Class':")
print(df_class)

print("\nEntity Type 'Property':")
print(df_property)

Entity Type 'Class':
                                             Entity URI  \
0                   http://dbpedia.org/ontology/Company   
1                  http://dbpedia.org/ontology/Activity   
2                      http://dbpedia.org/ontology/Name   
3                    http://dbpedia.org/ontology/Person   
4                     http://dbpedia.org/ontology/Actor   
...                                                 ...   
1502  http://dbpedia.org/ontology/یوروویژن_گانا_مقاب...   
1503  http://dbpedia.org/ontology/یوٹیب_پر_وڈیو_لگان...   
1504              http://dbpedia.org/ontology/یوکاریوٹ۔   
1505            http://dbpedia.org/ontology/یہودی_رہنما   
1506   http://dbpedia.org/ontology/یہودیوں_کی_عبادت_گاہ   

                              Label Entity Type  
0                           company       Class  
1                          activity       Class  
2                              name       Class  
3                            person       Class  
4                   

In [14]:
# Count how many "Class" and "Property" values are in the 'Entity Type' column
entity_type_counts = df['Entity Type'].value_counts()

# Display the results
print(entity_type_counts)

Entity Type
Property    8493
Class       1507
Name: count, dtype: int64


# Dictionary-Based Ambiguity Detection

In [2]:
import pandas as pd
from nltk.corpus import wordnet as wn
import nltk

# Ensure WordNet is downloaded
nltk.download('wordnet')

# Function to check ambiguity using WordNet by counting senses
def is_ambiguous_with_wordnet(term):
    senses = wn.synsets(term)
    return len(senses) > 3  # Returns True if the term has more than one meaning

# Function to detect ambiguous entities based on WordNet from a DataFrame
def detect_ambiguous_entities_wordnet(df):
    ambiguous_entities = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        label = row['Label']

        # Check if the label has multiple meanings in WordNet
        if is_ambiguous_with_wordnet(label):
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': True
            })
        else:
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': False
            })

    # Convert results to DataFrame for easy display and further processing
    ambiguous_entities_df = pd.DataFrame(ambiguous_entities)
    return ambiguous_entities_df


# Run the ambiguity detection function
ambiguity_results = detect_ambiguous_entities_wordnet(df)

# Display the results
print(ambiguity_results)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                   Entity URI           Label Entity Type  \
0         http://dbpedia.org/ontology/Company         company       Class   
1        http://dbpedia.org/ontology/Activity        activity       Class   
2            http://dbpedia.org/ontology/Name            name       Class   
3          http://dbpedia.org/ontology/Person          person       Class   
4           http://dbpedia.org/ontology/Actor           actor       Class   
..                                        ...             ...         ...   
95       http://dbpedia.org/ontology/Canoeist        canoeist       Class   
96       http://dbpedia.org/ontology/Cardinal        cardinal       Class   
97  http://dbpedia.org/ontology/CareerStation  career station       Class   
98         http://dbpedia.org/ontology/Castle          castle       Class   
99           http://dbpedia.org/ontology/Cave            cave       Class   

    Ambiguity  
0        True  
1        True  
2        True  
3       Fal

In [3]:
# Count how many True and False values there are in the 'Ambiguity' column
ambiguity_counts = ambiguity_results['Ambiguity'].value_counts()

# Display the result
print(ambiguity_counts)


Ambiguity
False    74
True     26
Name: count, dtype: int64


# LLM ambiguity Detection

In [4]:
!pip install openai==0.28



In [None]:
import openai
import pandas as pd

# Set your OpenAI API key
openai.api_key = ""

# Function to check ambiguity using the LLM
def is_ambiguous_with_llm(term):
    prompt = f"Is the term '{term}' ambiguous? List any distinct meanings if it has multiple interpretations."

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )

    answer = response.choices[0].message['content']
    return "multiple interpretations" in answer.lower() or "multiple meanings" in answer.lower()

# Detect ambiguous entities based on the LLM from a DataFrame
def detect_ambiguous_entities_llm(df):
    ambiguous_entities = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        label = row['Label']

        # Check if the label is ambiguous using the LLM
        if is_ambiguous_with_llm(label):
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': True
            })
        else:
            ambiguous_entities.append({
                'Entity URI': row['Entity URI'],
                'Label': label,
                'Entity Type': row['Entity Type'],
                'Ambiguity': False
            })

    # Convert the results into a DataFrame for easy display and further processing
    ambiguous_entities_df = pd.DataFrame(ambiguous_entities)
    return ambiguous_entities_df


# Run the ambiguity detection function
ambiguity_results_llm = detect_ambiguous_entities_llm(df)

# Display the results
print(ambiguity_results_llm)


                                   Entity URI           Label Entity Type  \
0         http://dbpedia.org/ontology/Company         company       Class   
1        http://dbpedia.org/ontology/Activity        activity       Class   
2            http://dbpedia.org/ontology/Name            name       Class   
3          http://dbpedia.org/ontology/Person          person       Class   
4           http://dbpedia.org/ontology/Actor           actor       Class   
..                                        ...             ...         ...   
95       http://dbpedia.org/ontology/Canoeist        canoeist       Class   
96       http://dbpedia.org/ontology/Cardinal        cardinal       Class   
97  http://dbpedia.org/ontology/CareerStation  career station       Class   
98         http://dbpedia.org/ontology/Castle          castle       Class   
99           http://dbpedia.org/ontology/Cave            cave       Class   

    Ambiguity  
0        True  
1        True  
2        True  
3        Tr

In [6]:
# Count how many True and False values there are in the 'Ambiguity' column
ambiguity_counts_llm = ambiguity_results_llm['Ambiguity'].value_counts()

# Display the result
print(ambiguity_counts_llm)

Ambiguity
False    56
True     44
Name: count, dtype: int64


## Part 2 - compare esco with DBPedia

In [29]:
import pandas as pd

# Load ESCO skills dataset
esco_skills = pd.read_csv("esco_skills_en.csv")  # Adjust the file path if necessary

# Inspect the data
esco_skills.head()


Unnamed: 0,conceptType,conceptUri,skillType,preferredLabel,altLabels,description
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000f1d3d-220f...,knowledge,Haskell,,The techniques and principles of software deve...
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00506f28-e884...,knowledge,sport and exercise medicine,sports injury treatment\nsports medicine\nexer...,Prevention and treatement of injuries or condi...
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c04e40-35ea...,knowledge,Incremental development,,The incremental development model is a methodo...
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c51318-4ea9...,knowledge,use of special equipment for daily activities,,"The types of special equipment, prosthetics an..."
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00c85cbc-2b24...,knowledge,sawing techniques,sawing technologies\nsawing methods\nsawing te...,Various sawing techniques for using manual as ...


In [30]:
# Select the first 10 rows
esco_skills_short = esco_skills.head(10)


In [31]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON


# DBpedia SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Function to perform SPARQL query for a given skill label
def query_dbpedia(skill_label):
    # Define the SPARQL query, using skill_label dynamically
    sparql_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>

    SELECT ?entity ?entityLabel ?class ?classLabel
    WHERE {{
      ?entity rdfs:label ?entityLabel .
      ?entity a ?class .

      # Filter for entities labeled as skill_label in English
      FILTER (LANG(?entityLabel) = "en" && CONTAINS(LCASE(?entityLabel), "{skill_label.lower()}")).

      # Optional: Retrieve the class label if available
      OPTIONAL {{
        ?class rdfs:label ?classLabel .
        FILTER (LANG(?classLabel) = "en")
      }}
    }}
    LIMIT 10
    """

    # Set query and return format
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)

    # Execute the query and fetch results
    try:
        results = sparql.query().convert()
        # Parse results into a list of dictionaries
        output = [
            {
                "entity": result["entity"]["value"],
                "entityLabel": result["entityLabel"]["value"],
                "class": result["class"]["value"],
                "classLabel": result.get("classLabel", {}).get("value", "")
            }
            for result in results["results"]["bindings"]
        ]
    except Exception as e:
        print(f"Error querying DBpedia for skill '{skill_label}':", e)
        output = []

    return output

# Prepare an empty list to store the results
all_results = []

# Loop through each skill in esco_skills['preferredLabel'] and query DBpedia
for skill in esco_skills_short["preferredLabel"]:
    skill_results = query_dbpedia(skill)
    # Append each result with the original skill label for traceability
    for result in skill_results:
        result["preferredLabel"] = skill
        all_results.append(result)

# Convert results to a DataFrame
results_df = pd.DataFrame(all_results)

# Display the first few rows of the results
print(results_df.head())



                                              entity  \
0  http://dbpedia.org/resource/Captain_Robert_Has...   
1  http://dbpedia.org/resource/Captain_Robert_Has...   
2  http://dbpedia.org/resource/Captain_Robert_Has...   
3  http://dbpedia.org/resource/Captain_Robert_Has...   
4  http://dbpedia.org/resource/Captain_Robert_Has...   

                    entityLabel  \
0  Captain Robert Haskell House   
1  Captain Robert Haskell House   
2  Captain Robert Haskell House   
3  Captain Robert Haskell House   
4  Captain Robert Haskell House   

                                               class               classLabel  \
0  http://dbpedia.org/ontology/ArchitecturalStruc...  architectural structure   
1  http://dbpedia.org/ontology/ArchitecturalStruc...  architectural structure   
2               http://dbpedia.org/ontology/Building                 building   
3               http://dbpedia.org/ontology/Building                 building   
4                http://www.w3.org/2002/07/owl#

In [9]:
from SPARQLWrapper import SPARQLWrapper, JSON

# DBPedia SPARQL endpoint
sparql = SPARQLWrapper("https://dbpedia.org/sparql")

# Function to query DBPedia and get the matching entity URI and class
def get_dbpedia_entity_and_class(skill_name):
    query = f"""
    SELECT DISTINCT ?entity ?class ?label
    WHERE {{
        ?entity rdfs:label ?label .
        ?entity rdf:type ?class .
        FILTER(LANG(?label) = "en" && CONTAINS(LCASE(?label), "{skill_name.lower()}"))
    }} LIMIT 1
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # Check if results are found and return
    if len(results["results"]["bindings"]) > 0:
        result = results["results"]["bindings"][0]
        entity_uri = result['entity']['value']
        class_uri = result['class']['value']
        label = result['label']['value']
        return entity_uri, class_uri, label
    return None, None, None

# Apply the function to all ESCO skills
esco_skills[['dbpedia_entity_uri', 'dbpedia_class_uri', 'dbpedia_class_label']] = esco_skills['preferredLabel'].apply(lambda x: pd.Series(get_dbpedia_entity_and_class(x)))


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-bb66b53fe12f>", line 31, in <cell line: 31>
    esco_skills[['dbpedia_entity_uri', 'dbpedia_class_uri', 'dbpedia_class_label']] = esco_skills['preferredLabel'].apply(lambda x: pd.Series(get_dbpedia_entity_and_class(x)))
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/series.py", line 4924, in apply
    ).apply()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1427, in apply
    return self.apply_standard()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1507, in apply_standard
    mapped = obj._map_values(
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/base.py", line 921, in _map_values
    return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
  File "/usr/local/lib

TypeError: object of type 'NoneType' has no len()

In [15]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Define the variable for the skill name (replace with your desired skill)
skill_name = "literature"

# Construct the SPARQL query (using DBpedia ontology properties)
query = f"""
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT DISTINCT ?entity ?entity_label WHERE {{
  ?entity a dbo:Occupation ;
     rdfs:label ?entity_label .
  FILTER (CONTAINS(LCASE(?entity_label), LCASE("{skill_name}")) OR
         ?entity skos:relatedMatch ?skill .
         FILTER (LCASE(?skill_label) = LCASE("{skill_name}")) )
  OPTIONAL {{ ?entity dbo:abstract ?abstract }}
  FILTER (langMatches(lang(?abstract), "en")) .
}}
"""

# Create a SPARQLWrapper object
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

# Execute the query and handle potential errors
try:
  results = sparql.query().convert()
except Exception as e:
  print(f"Error executing query: {e}")
  results = None

# Process the results (if successful)
if results:
  for result in results["results"]["bindings"]:
    entity_uri = result["entity"]["value"]
    entity_label = result["entity_label"]["value"]
    abstract = result.get("abstract", {}).get("value")
    print(f"Entity: {entity_label} ({entity_uri})")
    if abstract:
      print(f"Abstract: {abstract}\n")
else:
  print("No matching entities found for the given skill name.")

Error executing query: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'Virtuoso 37000 Error SP030: SPARQL compiler, line 0: Bad character \'\xc2\' (0xc2) in SPARQL expression at \'\xc2\'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\nPREFIX dbo: <http://dbpedia.org/ontology/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n\nSELECT \xc2\xa0 \n DISTINCT ?entity ?entity_label WHERE {\n  ?entity a dbo:Occupation ;\n     rdfs:label ?entity_label .\n  FILTER (CONTAINS(LCASE(?entity_label), LCASE("literature")) OR\n         ?entity skos:relatedMatch ?skill .\n         FILTER (LCASE(?skill_label) = LCASE("literature")) )\n  OPTIONAL { ?entity dbo:abstract ?abstract }\n  FILTER (langMatches(lang(?abstract), "en")) .\n}\n\n'
No matching entities found for the given skill name.
