### Ultima Correccion más reciente

In [1]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, DCTERMS
import pandas as pd
import re

# Helper function to clean and format strings for URIs
def clean_for_uri(text):
    if not text:
        return "unknown"
    a, b = 'áéíóúüñÁÉÍÓÚÜÑ', 'aeiouunAEIOUUN'
    trans = str.maketrans(a, b)
    text = text.translate(trans)
    text = re.sub(r'[<>:"/\\|?*(){}[\].,;\']', '', text)
    text = re.sub(r'\s+', '_', text)
    text = re.sub(r'[^a-zA-Z0-9_-]', '', text)
    return text.lower()[:100]

# Namespaces
g = Graph()
schema = Namespace("https://schema.org/")
dbp = Namespace("http://dbpedia.org/resource/")
myv = Namespace("http://example.org/myv/")
sko = SKOS
g.bind("schema", schema)
g.bind("dbp", dbp)
g.bind("myv", myv)
g.bind("skos", sko)
g.bind("dcterms", DCTERMS)

# Load CSVs
citations_df = pd.read_csv('citations.csv')
datos_con_referencias_df = pd.read_csv('scopus_detailed_papers.csv')
scopus_papers_df = pd.read_csv('scopus_papers.csv')

# Add Countries and Cities
locations = scopus_papers_df[['affiliation_country', 'affiliation_city']].dropna().drop_duplicates()
for _, row in locations.iterrows():
    countries = [country.strip() for country in row['affiliation_country'].split(';')]
    cities = [city.strip() for city in row['affiliation_city'].split(';')]
    for country, city in zip(countries, cities):
        country_uri = URIRef(dbp + clean_for_uri(country))
        city_uri = URIRef(dbp + clean_for_uri(city))
        g.add((country_uri, RDF.type, schema.Country))
        g.add((country_uri, schema.name, Literal(country)))
        g.add((city_uri, RDF.type, schema.City))
        g.add((city_uri, schema.name, Literal(city)))
        g.add((city_uri, schema.containsPlace, country_uri))

# Add Organizations and link to cities
organizations = scopus_papers_df[['affilname', 'affiliation_city', 'affiliation_country']].dropna().drop_duplicates()
for _, row in organizations.iterrows():
    org_uri = URIRef(schema + clean_for_uri(row['affilname']))
    cities = [city.strip() for city in row['affiliation_city'].split(';')]
    countries = [country.strip() for country in row['affiliation_country'].split(';')]
    g.add((org_uri, RDF.type, schema.Organization))
    g.add((org_uri, schema.name, Literal(row['affilname'])))
    for city, country in zip(cities, countries):
        city_uri = URIRef(dbp + clean_for_uri(city))
        country_uri = URIRef(dbp + clean_for_uri(country))
        g.add((org_uri, schema.location, city_uri))
        g.add((city_uri, schema.containsPlace, country_uri))

# Add Persons and their Affiliations (Schema:Person -> Schema:affiliation -> Schema:Organization -> Schema:location -> Schema:City -> Schema:containsPlace -> Schema:Country)
for _, row in datos_con_referencias_df[['author_names', 'affilname', 'affiliation_city', 'affiliation_country']].dropna().iterrows():
    authors = row['author_names'].split(';')
    affiliations = row['affilname'].split(';')
    cities = row['affiliation_city'].split(';')
    countries = row['affiliation_country'].split(';')
    
    # Ensure all lists have the same length by truncating or padding with "unknown"
    max_len = max(len(authors), len(affiliations), len(cities), len(countries))
    authors += ["unknown"] * (max_len - len(authors))
    affiliations += ["unknown"] * (max_len - len(affiliations))
    cities += ["unknown"] * (max_len - len(cities))
    countries += ["unknown"] * (max_len - len(countries))
    
    for author, affiliation, city, country in zip(authors, affiliations, cities, countries):
        # Create URIs for each entity
        author_uri = URIRef(schema + clean_for_uri(author.strip()))
        org_uri = URIRef(schema + clean_for_uri(affiliation.strip()))
        city_uri = URIRef(dbp + clean_for_uri(city.strip()))
        country_uri = URIRef(dbp + clean_for_uri(country.strip()))
        
        # Add Person and their affiliation
        g.add((author_uri, RDF.type, schema.Person))
        g.add((author_uri, schema.name, Literal(author.strip())))
        g.add((author_uri, schema.affiliation, org_uri))
        
        # Add Organization and its location
        g.add((org_uri, RDF.type, schema.Organization))
        g.add((org_uri, schema.name, Literal(affiliation.strip())))
        g.add((org_uri, schema.location, city_uri))
        
        # Add City and its country
        g.add((city_uri, RDF.type, schema.City))
        g.add((city_uri, schema.name, Literal(city.strip())))
        g.add((city_uri, schema.containsPlace, country_uri))
        
        # Add Country
        g.add((country_uri, RDF.type, schema.Country))
        g.add((country_uri, schema.name, Literal(country.strip())))

# Normalize and Add Keywords (SKOS Concepts)
keywords = scopus_papers_df['authkeywords'].dropna().str.split('|').explode().str.strip().str.lower().drop_duplicates()
for keyword in keywords:
    concept_uri = URIRef(myv + "concept/" + clean_for_uri(keyword))
    g.add((concept_uri, RDF.type, SKOS.Concept))
    g.add((concept_uri, RDFS.label, Literal(keyword)))

# Add Periodicals
journals = datos_con_referencias_df[datos_con_referencias_df['aggregationType'] == 'Journal']
for _, row in journals.iterrows():
    periodical_uri = URIRef(schema + clean_for_uri(row['publicationName'] if not pd.isna(row['publicationName']) else f"journal_{_}"))
    g.add((periodical_uri, RDF.type, schema.Periodical))
    g.add((periodical_uri, schema.identifier, Literal(row['publicationName'] if not pd.isna(row['publicationName']) else f"journal_{_}")))
    g.add((periodical_uri, schema.name, Literal(row['aggregationType'] if not pd.isna(row['aggregationType']) else "Journal")))

# Add Articles
for _, row in scopus_papers_df.iterrows():
    if pd.isna(row['doi']):
        continue
    article_uri = URIRef(schema + clean_for_uri(row['doi']))
    g.add((article_uri, RDF.type, schema.Article))
    g.add((article_uri, schema.identifier, Literal(row['doi'])))
    g.add((article_uri, schema.title, Literal(row['title'])))
    if not pd.isna(row['description']):
        g.add((article_uri, schema.description, Literal(row['description'])))
    if not pd.isna(row['coverDate']):
        g.add((article_uri, schema.datePublished, Literal(row['coverDate'], datatype=XSD.date)))
    if not pd.isna(row['citedby_count']):
        g.add((article_uri, myv.citationCount, Literal(row['citedby_count'], datatype=XSD.integer)))
    if not pd.isna(row['authkeywords']):
        for keyword in row['authkeywords'].split('|'):
            concept_uri = URIRef(myv + "concept/" + clean_for_uri(keyword.strip().lower()))
            g.add((article_uri, DCTERMS.subject, concept_uri))
    if not pd.isna(row['author_names']):
        for author_name in row['author_names'].split(';'):
            author_uri = URIRef(schema + clean_for_uri(author_name.strip()))
            g.add((author_uri, RDF.type, schema.Person))
            g.add((author_uri, schema.name, Literal(author_name.strip())))
            g.add((article_uri, schema.author, author_uri))
    if not pd.isna(row['publicationName']):
        periodical_uri = URIRef(schema + clean_for_uri(row['publicationName']))
        g.add((article_uri, schema.isPartOf, periodical_uri))
        g.add((periodical_uri, schema.hasPart, article_uri))

# Add References as myv:Observation
for _, row in citations_df.dropna(subset=['DOI', 'Paper DOI']).iterrows():
    source_uri = URIRef(schema + clean_for_uri(row['DOI']))
    target_uri = URIRef(schema + clean_for_uri(row['Paper DOI']))
    observation_uri = URIRef(myv + "observation/" + clean_for_uri(row['DOI'] + "_to_" + row['Paper DOI']))
    g.add((observation_uri, RDF.type, myv.Observation))
    g.add((observation_uri, schema.citation, source_uri))
    g.add((observation_uri, schema.citation, target_uri))

# Serialize RDF to Turtle format
g.serialize("articles_data_cleaned_10.ttl", format="turtle")


<Graph identifier=N170aa100e90c40b6a06cd75cb439e65a (<class 'rdflib.graph.Graph'>)>

# Extraccion de Keyword. Enlazado de datos e enrequecimiento de datos directo a GraphDB
 ## Existen errores  al cargar los datos a graphDB
 

In [7]:
# Configuración del endpoint de DBpedia Spotlight
DBPEDIA_ENDPOINT = "https://api.dbpedia-spotlight.org/en/annotate"
HEADERS = {"Accept": "application/json"}

# Configuración de GraphDB
GRAPHDB_ENDPOINT = "http://DESKTOP-OPG5BNF:7200/repositories/articles_data_cleaned_10_conexion"
GRAPHDB_UPDATE = "http://DESKTOP-OPG5BNF:7200/repositories/articles_data_cleaned_10_conexion"

# Función para limpiar texto para URIs
def clean_for_uri(text):
    import re
    if not text:
        return "unknown"
    text = re.sub(r"[^\w\s-]", "", text)  # Remover caracteres especiales
    text = re.sub(r"\s+", "_", text.strip())  # Reemplazar espacios por guiones bajos
    return text.lower()[:100]

# Función para enlazar palabras clave con DBpedia
def link_keywords_to_dbpedia(keywords, confidence=0.5, support=20):
    linked_entities = {}
    for keyword in keywords:
        params = {
            "text": keyword,
            "confidence": confidence,
            "support": support
        }
        response = requests.get(DBPEDIA_ENDPOINT, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                linked_entities[keyword] = [
                    {"uri": res["@URI"], "surfaceForm": res["@surfaceForm"]} for res in data["Resources"]
                ]
    return linked_entities

# Paso 1: Consultar las palabras clave del grafo en GraphDB
def get_keywords_from_graphdb(endpoint, limit=10):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(f"""
        PREFIX myv: <http://example.org/myv/>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        
        SELECT DISTINCT ?keyword
        WHERE {{
            ?concept a skos:Concept ;
                     rdfs:label ?keyword .
        }}
        LIMIT {limit}
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [result["keyword"]["value"] for result in results["results"]["bindings"]]

# Paso 2: Crear triples enriquecidos y cargarlos a GraphDB
def upload_enriched_triples_to_graphdb(linked_data, graphdb_update_url):
    sparql_update = """
        PREFIX schema: <http://schema.org/>
        INSERT DATA {
    """
    for keyword, entities in linked_data.items():
        keyword_uri = f"<http://example.org/myv/concept/{clean_for_uri(keyword)}>"
        for entity in entities:
            dbpedia_uri = f"<{entity['uri']}>"
            sparql_update += f"    {keyword_uri} schema:sameAs {dbpedia_uri} .\n"
    sparql_update += "}"

    # Enviar la consulta SPARQL
    response = requests.post(
        graphdb_update_url,
        data=sparql_update,
        headers={"Content-Type": "application/sparql-update"}
    )
    if response.status_code == 204:
        print("Datos enriquecidos cargados exitosamente a GraphDB.")
    else:
        print(f"Error al cargar los datos a GraphDB: {response.status_code}, {response.text}")

# Ejecutar el proceso completo
keywords = get_keywords_from_graphdb(GRAPHDB_ENDPOINT, limit=10)  # Consultar 10 palabras clave
linked_data = link_keywords_to_dbpedia(keywords, confidence=0.5)  # Enlazar con DBpedia Spotlight
upload_enriched_triples_to_graphdb(linked_data, GRAPHDB_UPDATE)  # Cargar datos enriquecidos en GraphDB

Error al cargar los datos a GraphDB: 415, Unsupported MIME type: application/sparql-update


# Extracion de keywords y enlazado de datos a la DBpedia  desde GraphDB

In [11]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON

# Configuración del endpoint de DBpedia Spotlight
DBPEDIA_ENDPOINT = "https://api.dbpedia-spotlight.org/en/annotate"
HEADERS = {"Accept": "application/json"}

# Configuración de GraphDB
GRAPHDB_ENDPOINT = "http://DESKTOP-OPG5BNF:7200/repositories/articles_data_cleaned_10_conexion"

# Función para consultar palabras clave desde GraphDB
def get_keywords_from_graphdb(endpoint, limit=10):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(f"""
        PREFIX myv: <http://example.org/myv/>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        
        SELECT DISTINCT ?keyword
        WHERE {{
            ?concept a skos:Concept ;
                     rdfs:label ?keyword .
        }}
        LIMIT {limit}
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [result["keyword"]["value"] for result in results["results"]["bindings"]]

# Función para consultar entidades relacionadas en DBpedia Spotlight
def link_keywords_to_dbpedia(keywords, confidence=0.7, support=20):
    linked_entities = {}
    for keyword in keywords:
        print(f"Consultando DBpedia Spotlight para: {keyword}")
        params = {
            "text": keyword,
            "confidence": confidence,
            "support": support
        }
        response = requests.get(DBPEDIA_ENDPOINT, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                linked_entities[keyword] = [
                    {"uri": res["@URI"], "surfaceForm": res["@surfaceForm"]}
                    for res in data["Resources"]
                ]
        else:
            print(f"Error consultando {keyword}: {response.status_code}")
    return linked_entities

# Ejecutar la prueba
keywords = get_keywords_from_graphdb(GRAPHDB_ENDPOINT, limit=20)  # Obtener 10 palabras clave
linked_data = link_keywords_to_dbpedia(keywords, confidence=0.5)  # Enlazar con DBpedia Spotlight

# Mostrar los resultados
for keyword, entities in linked_data.items():
    print(f"\nPalabra clave: {keyword}")
    for entity in entities:
        print(f"  - DBpedia URI: {entity['uri']}")
        print(f"    Superficie relacionada: {entity['surfaceForm']}")


Consultando DBpedia Spotlight para: adam optimizer
Consultando DBpedia Spotlight para: amazon product reviews
Consultando DBpedia Spotlight para: and machine-learning algorithms
Consultando DBpedia Spotlight para: aspect based sentiment analysis (absa)
Consultando DBpedia Spotlight para: aspect based sentiment classification
Consultando DBpedia Spotlight para: aspect classification
Consultando DBpedia Spotlight para: business. economics
Consultando DBpedia Spotlight para: ceur-ws
Consultando DBpedia Spotlight para: classification sentiment analysis
Consultando DBpedia Spotlight para: code-mixed dravidian language
Consultando DBpedia Spotlight para: computational design
Consultando DBpedia Spotlight para: convolution neural network (cnn)
Consultando DBpedia Spotlight para: corpus-based
Consultando DBpedia Spotlight para: customer review
Consultando DBpedia Spotlight para: cyber threat intelligence
Consultando DBpedia Spotlight para: cyptocurrencies
Consultando DBpedia Spotlight para: da

# Extraccion de Keywords desde el TTL generado y enlazado de datos a la DBpedia

In [14]:
from rdflib import Graph, Namespace
import requests
import csv

# Configuración del endpoint de DBpedia Spotlight
DBPEDIA_ENDPOINT = "https://api.dbpedia-spotlight.org/en/annotate"
HEADERS = {"Accept": "application/json"}

# Cargar el archivo Turtle
TTL_FILE = "articles_data_cleaned_10.ttl"
g = Graph()
g.parse(TTL_FILE, format="turtle")

# Namespace
MYV = Namespace("http://example.org/myv/")

# Función para extraer palabras clave del archivo Turtle
def get_keywords_from_ttl(graph, limit=10):
    query = f"""
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX myv: <http://example.org/myv/>
        
        SELECT DISTINCT ?keyword
        WHERE {{
            ?concept a skos:Concept ;
                     rdfs:label ?keyword .
        }}
        LIMIT {limit}
    """
    results = graph.query(query)
    return [str(row["keyword"]) for row in results]

# Función para consultar entidades relacionadas en DBpedia Spotlight
def link_keywords_to_dbpedia(keywords, confidence=0.5, support=20):
    linked_entities = {}
    for keyword in keywords:
        print(f"Consultando DBpedia Spotlight para: {keyword}")
        params = {
            "text": keyword,
            "confidence": confidence,
            "support": support
        }
        response = requests.get(DBPEDIA_ENDPOINT, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            if "Resources" in data:
                linked_entities[keyword] = [
                    {"uri": res["@URI"], "surfaceForm": res["@surfaceForm"]}
                    for res in data["Resources"]
                ]
        else:
            print(f"Error consultando {keyword}: {response.status_code}")
    return linked_entities

# Función para guardar resultados en un CSV
def save_results_to_csv(linked_data, output_file="linked_keywords.csv"):
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Keyword", "DBpedia URI", "Surface Form"])
        for keyword, entities in linked_data.items():
            for entity in entities:
                writer.writerow([keyword, entity["uri"], entity["surfaceForm"]])
    print(f"Resultados guardados en {output_file}")

# Ejecutar la prueba
keywords = get_keywords_from_ttl(g, limit=200)  
linked_data = link_keywords_to_dbpedia(keywords, confidence=0.5)  # Enlazar con DBpedia Spotlight

# Guardar resultados en un archivo CSV
save_results_to_csv(linked_data, output_file="linked_keywords.csv")



Consultando DBpedia Spotlight para: adam optimizer
Consultando DBpedia Spotlight para: amazon product reviews
Consultando DBpedia Spotlight para: and machine-learning algorithms
Consultando DBpedia Spotlight para: aspect based sentiment analysis (absa)
Consultando DBpedia Spotlight para: aspect based sentiment classification
Consultando DBpedia Spotlight para: aspect classification
Consultando DBpedia Spotlight para: business. economics
Consultando DBpedia Spotlight para: ceur-ws
Consultando DBpedia Spotlight para: classification sentiment analysis
Consultando DBpedia Spotlight para: code-mixed dravidian language
Consultando DBpedia Spotlight para: computational design
Consultando DBpedia Spotlight para: convolution neural network (cnn)
Consultando DBpedia Spotlight para: corpus-based
Consultando DBpedia Spotlight para: customer review
Consultando DBpedia Spotlight para: cyber threat intelligence
Consultando DBpedia Spotlight para: cyptocurrencies
Consultando DBpedia Spotlight para: da

In [20]:
from rdflib import Graph, URIRef, Namespace, Literal
import requests
import json
from rdflib.namespace import SKOS 
# Namespaces
dbpedia = Namespace("http://dbpedia.org/resource/")
dbo = Namespace("http://dbpedia.org/ontology/")

# Configuración del endpoint de DBpedia Spotlight
DBPEDIA_SPOTLIGHT = "https://api.dbpedia-spotlight.org/en/annotate"
HEADERS = {"Accept": "application/json"}

# Función para consultar DBpedia Spotlight
def get_dbpedia_uri(keyword):
    params = {
        "text": keyword,
        "confidence": 0.5
    }
    response = requests.get(DBPEDIA_SPOTLIGHT, headers=HEADERS, params=params)
    if response.status_code == 200:
        data = response.json()
        resources = data.get("Resources", [])
        if resources:
            return resources[0]["@URI"]  # Devuelve el primer URI encontrado
    return None

# Función para consultar categorías en DBpedia
def get_dbpedia_categories(uri):
    query = f"""
    SELECT ?category WHERE {{
        <{uri}> dcterms:subject ?category .
    }}
    """
    sparql_endpoint = "http://dbpedia.org/sparql"
    params = {
        "query": query,
        "format": "json"
    }
    response = requests.get(sparql_endpoint, params=params)
    if response.status_code == 200:
        data = response.json()
        return [result["category"]["value"] for result in data["results"]["bindings"]]
    return []

# Cargar palabras clave del archivo Turtle
input_file = "articles_data_cleaned_10.ttl"
g = Graph()
g.parse(input_file, format="turtle")

# Extraer las primeras 100 palabras clave para la prueba
keywords = list(set(g.objects(None, RDFS.label)))[:100]

if keywords:
    enriched_graph = Graph()
    enriched_graph.bind("dbo", dbo)
    enriched_graph.bind("dbpedia", dbpedia)
    enriched_graph.bind("skos", SKOS)

    for keyword in keywords:
        keyword = str(keyword)
        print(f"Procesando palabra clave: {keyword}")

        # Consultar DBpedia Spotlight
        dbpedia_uri = get_dbpedia_uri(keyword)
        if dbpedia_uri:
            print(f"URI encontrado en DBpedia: {dbpedia_uri}")

            # Consultar categorías en DBpedia
            categories = get_dbpedia_categories(dbpedia_uri)
            if categories:
                print(f"Categorías encontradas para {keyword}: {categories}")

                keyword_uri = URIRef(dbpedia_uri)
                enriched_graph.add((keyword_uri, SKOS.prefLabel, Literal(keyword)))
                for category in categories:
                    enriched_graph.add((keyword_uri, dbo.category, URIRef(category)))
            else:
                print("No se encontraron categorías para la URI proporcionada.")
        else:
            print("No se encontró una URI para la palabra clave seleccionada.")

    # Guardar el resultado en un archivo RDF
    output_file = "enriched_data.ttl"
    enriched_graph.serialize(output_file, format="turtle")
    print(f"Datos enriquecidos guardados en {output_file}")
else:
    print("No se encontraron palabras clave en el archivo RDF.")

Procesando palabra clave: bidirection encoder representations from transformers
No se encontró una URI para la palabra clave seleccionada.
Procesando palabra clave: classification algorithm
URI encontrado en DBpedia: http://dbpedia.org/resource/Algorithm
Categorías encontradas para classification algorithm: ['http://dbpedia.org/resource/Category:Articles_with_example_pseudocode', 'http://dbpedia.org/resource/Category:Algorithms', 'http://dbpedia.org/resource/Category:Theoretical_computer_science', 'http://dbpedia.org/resource/Category:Mathematical_logic']
Procesando palabra clave: aspect-level sentiment analysis
URI encontrado en DBpedia: http://dbpedia.org/resource/Sentiment_analysis
Categorías encontradas para aspect-level sentiment analysis: ['http://dbpedia.org/resource/Category:Sociology_of_technology', 'http://dbpedia.org/resource/Category:Social_information_processing', 'http://dbpedia.org/resource/Category:Social_media', 'http://dbpedia.org/resource/Category:Natural_language_pr

In [19]:
for s, p, o in g.triples((None, RDFS.label, None)):
    print(s, p, o)


http://example.org/myv/concept/adam_optimizer http://www.w3.org/2000/01/rdf-schema#label adam optimizer
http://example.org/myv/concept/amazon_product_reviews http://www.w3.org/2000/01/rdf-schema#label amazon product reviews
http://example.org/myv/concept/and_machine-learning_algorithms http://www.w3.org/2000/01/rdf-schema#label and machine-learning algorithms
http://example.org/myv/concept/aspect_based_sentiment_analysis_absa http://www.w3.org/2000/01/rdf-schema#label aspect based sentiment analysis (absa)
http://example.org/myv/concept/aspect_based_sentiment_classification http://www.w3.org/2000/01/rdf-schema#label aspect based sentiment classification
http://example.org/myv/concept/aspect_classification http://www.w3.org/2000/01/rdf-schema#label aspect classification
http://example.org/myv/concept/business_economics http://www.w3.org/2000/01/rdf-schema#label business. economics
http://example.org/myv/concept/ceur-ws http://www.w3.org/2000/01/rdf-schema#label ceur-ws
http://example.or