In [8]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os

In [9]:
# Seed handed to the sampling calls so the drawn rows are repeatable.
random_seed = 9483
# Input CSV, addressed relative to the notebook's working directory.
data_path = "../Data/non_missing_company_data.csv"

In [10]:
# Read the CSV configured in `data_path` into a DataFrame.
gh_data = pd.read_csv(filepath_or_buffer=data_path)

In [11]:
# Narrow the frame to the columns this notebook works with, then preview it.
needed_columns = ["company", "website", "bio"]
gh_data = gh_data[needed_columns]
gh_data.head()

Unnamed: 0,company,website,bio
0,CallitAbhi,https://www.callitabhi.com/,Former Deep Learning Intern | Lead Programmer...
1,Sarvajanik College of Engineering & Technology,,
2,IIT Guwahati,https://abhinav3.github.io/,Machine learning & Deep Learning Practitioner....
3,KTH Royal Insitute Of Technology,https://www.linkedin.com/in/adithya-u-r-795866...,IoT | Deep Learning | Machine Learning | Embed...
4,Tata Consultancy Services,,Machine Learning Enthusiast and loves to innov...


In [12]:
# Eyeball 100 raw company names. The draw is seeded with `random_seed` so the
# same rows come back after a kernel restart instead of a fresh sample each run.
print(gh_data["company"].sample(100, random_state=random_seed).to_markdown())

|        | company                                                                           |
|-------:|:----------------------------------------------------------------------------------|
|  29812 | Radolyn Labs                                                                      |
|  96588 | OkCupid                                                                           |
| 191511 | EPSOFT                                                                            |
|  17742 | Cash App                                                                          |
| 113078 | New York University                                                               |
| 127030 | http://www.linkedin.com/in/federicomartini                                        |
| 162944 | Samsung Electronics                                                               |
| 167842 | Heritage Institute of Technology, Kolkata                                         |
|  69610 | CIAE                                   

In [13]:
# Draw a single company name (seeded) to serve as the probe for the lookups below.
# The result is a one-element Series, not a bare string.
test_company = gh_data.loc[:, "company"].sample(n=1, random_state=random_seed)
test_company

109384    Xerox technology services
Name: company, dtype: object

# Test LamAPI

In [14]:
# Load .env file
# load_dotenv expects the path of the .env file itself; a bare directory such
# as ".." is not read. NOTE(review): assumes the file lives at ../.env - confirm.
load_dotenv(os.path.join("..", ".env"))

# Read variables from .env
endpoint = os.getenv("ENTITY_RETRIEVAL_ENDPOINT")
token = os.getenv("ENTITY_RETRIEVAL_TOKEN")
if not endpoint:
    # Fail with a readable message instead of an obscure error inside requests.
    raise RuntimeError(
        "ENTITY_RETRIEVAL_ENDPOINT is not set; check the .env file or the environment"
    )

# Define query parameters
params = {
    # `test_company` is a one-element Series; send its scalar value explicitly
    # rather than relying on requests iterating over the Series.
    "name": test_company.iloc[0],
    "limit": "50",
    "kind": "entity",
    "kg": "wikidata",
    "fuzzy": "True",
    "language": "en",
    "cache": "False",
    "token": token
}

# Send GET request; the timeout keeps the cell from hanging on a stalled server
response = requests.get(endpoint, params=params, timeout=30)

# Handle response
if response.status_code == 200:
    print("Results:", response.json())
else:
    print(f"Request failed: {response.status_code}")
    print(response.text)


Results: [{'id': 'Q121126637', 'name': 'Xerox', 'description': 'production company', 'types': [{'id': 'Q11396960', 'name': 'production company'}], 'kind': 'entity', 'NERtype': 'ORG', 'ambiguity_mention': 0.0, 'corrects_tokens': 0.667, 'ntoken_mention': 3, 'ntoken_entity': 1, 'length_mention': 25, 'length_entity': 5, 'popularity': 0.0, 'pos_score': 0.02, 'es_score': 1.0, 'ed_score': 0.2, 'jaccard_score': 0.33, 'jaccardNgram_score': 0.18}, {'id': 'Q152433', 'name': 'Xerox', 'description': 'American document management corporation', 'types': [{'id': 'Q4830453', 'name': 'business'}, {'id': 'Q6881511', 'name': 'enterprise'}, {'id': 'Q891723', 'name': 'public company'}], 'kind': 'entity', 'NERtype': 'ORG', 'ambiguity_mention': 0.0, 'corrects_tokens': 0.667, 'ntoken_mention': 3, 'ntoken_entity': 1, 'length_mention': 25, 'length_entity': 5, 'popularity': 0.06, 'pos_score': 0.04, 'es_score': 1.0, 'ed_score': 0.2, 'jaccard_score': 0.33, 'jaccardNgram_score': 0.18}, {'id': 'Q110678467', 'name': '

In [15]:
# Show the first candidate returned by the lookup on its own.
lamapi_candidates = response.json()
lamapi_candidates[0]

{'id': 'Q121126637',
 'name': 'Xerox',
 'description': 'production company',
 'types': [{'id': 'Q11396960', 'name': 'production company'}],
 'kind': 'entity',
 'NERtype': 'ORG',
 'ambiguity_mention': 0.0,
 'corrects_tokens': 0.667,
 'ntoken_mention': 3,
 'ntoken_entity': 1,
 'length_mention': 25,
 'length_entity': 5,
 'popularity': 0.0,
 'pos_score': 0.02,
 'es_score': 1.0,
 'ed_score': 0.2,
 'jaccard_score': 0.33,
 'jaccardNgram_score': 0.18}

In [16]:
# Keep the identifier of the first candidate for the property lookup below.
top_candidate = response.json()[0]
test_candidate_id = top_candidate["id"]
test_candidate_id

'Q121126637'

In [17]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_all_properties_for_entity(entity_id):
    """Fetch every (property, value) pair attached to a Wikidata entity.

    Runs a SPARQL query against the public Wikidata endpoint selecting all
    triples whose subject is ``wd:<entity_id>``, with English labels requested
    from the label service.

    Args:
        entity_id: Wikidata identifier such as ``"Q152433"`` (a ``Q``, ``P`` or
            ``L`` followed by digits).

    Returns:
        A list of dicts with the keys ``"property"``, ``"propertyLabel"``,
        ``"value"`` and ``"valueLabel"``. ``"valueLabel"`` is ``""`` when the
        endpoint returns no label; ``"propertyLabel"`` falls back to the
        property URI in that case.

    Raises:
        ValueError: if ``entity_id`` is not a well-formed identifier.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # The id is interpolated straight into the query text, so reject anything
    # that is not a plain identifier before it can alter the query.
    if not isinstance(entity_id, str) or not re.fullmatch(r"[QPL]\d+", entity_id):
        raise ValueError(f"Invalid Wikidata entity id: {entity_id!r}")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{entity_id} ?property ?value .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # NOTE(review): in the recorded runs the property labels come back as raw
    # predicate URIs rather than human-readable names - confirm if that is intended.
    return [
        {
            "property": binding["property"]["value"],
            "propertyLabel": binding.get("propertyLabel", {}).get(
                "value", binding["property"]["value"]
            ),
            "value": binding["value"]["value"],
            "valueLabel": binding.get("valueLabel", {}).get("value", ""),
        }
        for binding in results["results"]["bindings"]
    ]

# Example usage: list every statement of the candidate selected above,
# showing the value's label when there is one and the raw value otherwise.
props = get_all_properties_for_entity(test_candidate_id)
for prop in props:
    shown_value = prop['valueLabel'] or prop['value']
    print(f"{prop['propertyLabel']}: {shown_value}")

http://schema.org/version: 2016295146
http://schema.org/dateModified: 2023-11-23T09:15:18Z
http://schema.org/description: production company
http://schema.org/description: продюсерська компанія
http://www.w3.org/2000/01/rdf-schema#label: Xerox
http://wikiba.se/ontology#statements: 2
http://wikiba.se/ontology#sitelinks: 0
http://wikiba.se/ontology#identifiers: 1
http://www.wikidata.org/prop/direct/P31: production company
http://www.wikidata.org/prop/P31: statement/Q121126637-969D2D42-2637-4680-97F0-33806DFF3323
http://www.wikidata.org/prop/direct/P7003: creators/14821
http://www.wikidata.org/prop/P7003: statement/Q121126637-0F75DAD4-BC71-4D1B-A7B5-85A844435D53


In [19]:
# Example usage: same listing for the entity Q152433, the second candidate
# in the lookup results shown earlier.
props = get_all_properties_for_entity("Q152433")
for prop in props:
    shown_value = prop['valueLabel'] or prop['value']
    print(f"{prop['propertyLabel']}: {shown_value}")

http://www.wikidata.org/prop/P268: statement/Q152433-22E9DE7C-8B4C-4C9F-BF39-BCE2FD767236
http://www.wikidata.org/prop/P269: statement/Q152433-C2F243AF-B3DD-4D57-B5B8-B07FE27804B2
http://www.wikidata.org/prop/P271: statement/Q152433-02C3B257-F672-49EB-B7B9-5E4434AA6336
http://www.wikidata.org/prop/P355: statement/Q152433-27ac28a6-4bad-d230-5826-b3471ae99244
http://www.wikidata.org/prop/P355: statement/Q152433-9c176f12-477b-bc29-fb32-917e42ae480f
http://www.wikidata.org/prop/P355: statement/Q152433-A4D0EF58-EF98-414B-BB94-FD609E8F2CC4
http://www.wikidata.org/prop/P355: statement/Q152433-a898e331-4989-9bd6-9d0c-82d8eeba968b
http://www.wikidata.org/prop/P355: statement/Q152433-BDEB90F8-5E1E-4E28-8C92-0771D57301C1
http://www.wikidata.org/prop/P355: statement/Q152433-DBE8C509-1A5D-43B6-9A72-5943B708958A
http://www.wikidata.org/prop/P361: statement/Q152433-6038123A-6405-4260-B182-3E27AD8748EA
http://www.wikidata.org/prop/P373: statement/q152433-D6765741-3708-498F-9D05-21090357867E
http://www