## Data Model
I start defining my data model using Python classes.

I need to focus on the Publications and their Authors. 

In [39]:
# Assuming IdentifiableEntity is defined somewhere, e.g.:
class IdentifiableEntity:
    def __init__(self, id):
        self.id = id

# Class for Publications
class Publication(IdentifiableEntity):
    def __init__(self, id, title, publication_type, publication_year, issue, volume, chapter, publication_venue, venue_type, publisher):
        self.id = id
        self.title = title     
        self.publication_type = publication_type  # renamed `type` to `publication_type` to avoid conflict with the built-in `type` function
        self.publication_year = publication_year
        self.issue = issue
        self.volume = volume
        self.chapter = chapter        
        self.publication_venue = publication_venue
        self.venue_type = venue_type  
        self.publisher = publisher 

# Class for Authors
class Person(IdentifiableEntity):
    def __init__(self, id, given_name, family_name, orcid):
        super().__init__(id)  # Call the parent constructor
        self.given_name = given_name
        self.family_name = family_name
        self.orcid = orcid  

    # Optionally, you can add methods to display information about the author
    def full_name(self):
        return f"{self.given_name} {self.family_name}"

    def __str__(self):
        return f"{self.given_name} {self.family_name} (ORCID: {self.orcid})"


In [33]:
import pandas as pd

# Classi già definite
class IdentifiableEntity:
    def __init__(self, id):
        self.id = id

class Publication:
    def __init__(self, id, title, publication_type, publication_year, issue, volume, chapter, publication_venue, venue_type, publisher):
        self.id = id
        self.title = title     
        self.publication_type = publication_type  
        self.publication_year = publication_year
        self.issue = issue
        self.volume = volume
        self.chapter = chapter        
        self.publication_venue = publication_venue
        self.venue_type = venue_type  
        self.publisher = publisher 

class Person(IdentifiableEntity):
    def __init__(self, id, given_name, family_name, orcid):
        super().__init__(id)
        self.given_name = given_name
        self.family_name = family_name
        self.orcid = orcid

# Crea un esempio di dati per le pubblicazioni e gli autori
publications = [
    Publication("12345", "Advanced Python Programming", "Book", 2021, None, None, None, "Tech Publisher", "Academic Publisher", "Tech Publishing"),
    Publication("67890", "Data Science in Python", "Article", 2020, "10", "5", "1", "Data Journal", "Academic Journal", "Data Publishing")
]

authors = [
    Person("A001", "John", "Smith", "0000-0002-1825-0097"),
    Person("A002", "Emily", "Johnson", "0000-0003-1825-5678"),
    Person("A003", "James", "Taylor", "0000-0004-1825-3456")
]

# Creiamo un dizionario per mappare gli autori alle pubblicazioni
publication_author_map = {
    "12345": ["A001", "A002"],  # La pubblicazione con id 12345 ha come autori John Smith ed Emily Johnson
    "67890": ["A002", "A003"]   # La pubblicazione con id 67890 ha come autori Emily Johnson e James Taylor
}

# Creiamo una lista di dizionari per il DataFrame
data_for_df = []

# Mappa le pubblicazioni e gli autori
for pub in publications:
    for author_id in publication_author_map.get(pub.id, []):
        author = next((a for a in authors if a.id == author_id), None)
        if author:
            data_for_df.append({
                "publication_id": pub.id,
                "publication_title": pub.title,
                "publication_type": pub.publication_type,
                "publication_year": pub.publication_year,
                "publication_venue": pub.publication_venue,
                "publication_publisher": pub.publisher,
                "author_id": author.id,
                "author_given_name": author.given_name,
                "author_family_name": author.family_name,
                "author_orcid": author.orcid
            })

# Creiamo il DataFrame
df = pd.DataFrame(data_for_df)

# Visualizziamo il DataFrame risultante
print(df)


  publication_id            publication_title publication_type  \
0          12345  Advanced Python Programming             Book   
1          12345  Advanced Python Programming             Book   
2          67890       Data Science in Python          Article   
3          67890       Data Science in Python          Article   

   publication_year publication_venue publication_publisher author_id  \
0              2021    Tech Publisher       Tech Publishing      A001   
1              2021    Tech Publisher       Tech Publishing      A002   
2              2020      Data Journal       Data Publishing      A002   
3              2020      Data Journal       Data Publishing      A003   

  author_given_name author_family_name         author_orcid  
0              John              Smith  0000-0002-1825-0097  
1             Emily            Johnson  0000-0003-1825-5678  
2             Emily            Johnson  0000-0003-1825-5678  
3             James             Taylor  0000-0004-1825-

In [36]:
# Class for IdentifiableEntity
class IdentifiableEntity:
    def __init__(self, id):
        self.id = id

# Class for Publication, inheriting from IdentifiableEntity
class Publication(IdentifiableEntity):
    def __init__(self, id, title, publication_type, publication_year, issue, volume, chapter, publication_venue, venue_type, publisher):
        # Initialize the parent class (IdentifiableEntity)
        super().__init__(id)
        
        # Initialize other attributes for Publication
        self.title = title     
        self.publication_type = publication_type  
        self.publication_year = publication_year
        self.issue = issue
        self.volume = volume
        self.chapter = chapter        
        self.publication_venue = publication_venue
        self.venue_type = venue_type  
        self.publisher = publisher

# Class for Author, inheriting from IdentifiableEntity
class Author(IdentifiableEntity):
    def __init__(self, id, given_name, family_name, orcid):
        super().__init__(id)
        self.given_name = given_name
        self.family_name = family_name
        self.orcid = orcid

# Create some sample data for publications
publications = [
    Publication("12345", "Advanced Python Programming", "Book", 2021, None, None, None, "Tech Publisher", "Academic Publisher", "Tech Publishing"),
    Publication("67890", "Data Science in Python", "Article", 2020, "10", "5", "1", "Data Journal", "Academic Journal", "Data Publishing")
]

# Create some sample data for authors (formerly Person)
authors = [
    Author("A001", "John", "Smith", "0000-0002-1825-0097"),
    Author("A002", "Emily", "Johnson", "0000-0003-1825-5678"),
    Author("A003", "James", "Taylor", "0000-0004-1825-3456")
]

# Print the IDs of publications and authors to verify inheritance
for pub in publications:
    print(f"Publication ID: {pub.id}")  # Inherited from IdentifiableEntity

for auth in authors:
    print(f"Author ID: {auth.id}")  # Inherited from IdentifiableEntity

print(df)

Publication ID: 12345
Publication ID: 67890
Author ID: A001
Author ID: A002
Author ID: A003
  publication_id            publication_title publication_type  \
0          12345  Advanced Python Programming             Book   
1          12345  Advanced Python Programming             Book   
2          67890       Data Science in Python          Article   
3          67890       Data Science in Python          Article   

   publication_year publication_venue publication_publisher author_id  \
0              2021    Tech Publisher       Tech Publishing      A001   
1              2021    Tech Publisher       Tech Publishing      A002   
2              2020      Data Journal       Data Publishing      A002   
3              2020      Data Journal       Data Publishing      A003   

  author_given_name author_family_name         author_orcid  
0              John              Smith  0000-0002-1825-0097  
1             Emily            Johnson  0000-0003-1825-5678  
2             Emily      

In [37]:
import pandas as pd

# Class for IdentifiableEntity
class IdentifiableEntity:
    def __init__(self, id):
        self.id = id

# Class for Publication, inheriting from IdentifiableEntity
class Publication(IdentifiableEntity):
    def __init__(self, id, title, publication_type, publication_year, issue, volume, chapter, publication_venue, venue_type, publisher):
        super().__init__(id)
        self.title = title     
        self.publication_type = publication_type  
        self.publication_year = publication_year
        self.issue = issue
        self.volume = volume
        self.chapter = chapter        
        self.publication_venue = publication_venue
        self.venue_type = venue_type  
        self.publisher = publisher

# Class for Author, inheriting from IdentifiableEntity
class Author(IdentifiableEntity):
    def __init__(self, id, given_name, family_name, orcid):
        super().__init__(id)
        self.given_name = given_name
        self.family_name = family_name
        self.orcid = orcid

# Create some sample data for publications
publications = [
    Publication("12345", "Advanced Python Programming", "Book", 2021, None, None, None, "Tech Publisher", "Academic Publisher", "Tech Publishing"),
    Publication("67890", "Data Science in Python", "Article", 2020, "10", "5", "1", "Data Journal", "Academic Journal", "Data Publishing")
]

# Create some sample data for authors
authors = [
    Author("12345", "John", "Smith", "0000-0002-1825-0097"),  # Matching ID with Publication 12345
    Author("67890", "Emily", "Johnson", "0000-0003-1825-5678"),  # Matching ID with Publication 67890
    Author("99999", "James", "Taylor", "0000-0004-1825-3456")  # No matching publication ID
]

# Create a list for the DataFrame
data_for_df = []

# Map publications to authors based on the matching ID
for pub in publications:
    for auth in authors:
        if pub.id == auth.id:  # Check if Publication ID matches Author ID
            # Create a unique DOI based on the matching ID
            doi = pub.id  # Use the publication ID as the unique DOI
            data_for_df.append({
                "doi": doi,
                "publication_id": pub.id,
                "publication_title": pub.title,
                "publication_type": pub.publication_type,
                "publication_year": pub.publication_year,
                "publication_venue": pub.publication_venue,
                "publication_publisher": pub.publisher,
                "author_id": auth.id,
                "author_given_name": auth.given_name,
                "author_family_name": auth.family_name,
                "author_orcid": auth.orcid
            })

# Create the DataFrame
df = pd.DataFrame(data_for_df)

# Display the DataFrame
print(df)


     doi publication_id            publication_title publication_type  \
0  12345          12345  Advanced Python Programming             Book   
1  67890          67890       Data Science in Python          Article   

   publication_year publication_venue publication_publisher author_id  \
0              2021    Tech Publisher       Tech Publishing     12345   
1              2020      Data Journal       Data Publishing     67890   

  author_given_name author_family_name         author_orcid  
0              John              Smith  0000-0002-1825-0097  
1             Emily            Johnson  0000-0003-1825-5678  


In [38]:
import pandas as pd

# Class for IdentifiableEntity
class IdentifiableEntity:
    def __init__(self, id):
        self.id = id

# Class for Publication, inheriting from IdentifiableEntity
class Publication(IdentifiableEntity):
    def __init__(self, id, title, publication_type, publication_year, issue, volume, chapter, publication_venue, venue_type, publisher):
        super().__init__(id)
        self.title = title     
        self.publication_type = publication_type  
        self.publication_year = publication_year
        self.issue = issue
        self.volume = volume
        self.chapter = chapter        
        self.publication_venue = publication_venue
        self.venue_type = venue_type  
        self.publisher = publisher

# Class for Author, inheriting from IdentifiableEntity
class Author(IdentifiableEntity):
    def __init__(self, id, given_name, family_name, orcid):
        super().__init__(id)
        self.given_name = given_name
        self.family_name = family_name
        self.orcid = orcid

# Create some sample data for publications
publications = [
    Publication("12345", "Advanced Python Programming", "Book", 2021, None, None, None, "Tech Publisher", "Academic Publisher", "Tech Publishing"),
    Publication("67890", "Data Science in Python", "Article", 2020, "10", "5", "1", "Data Journal", "Academic Journal", "Data Publishing")
]

# Create some sample data for authors
authors = [
    Author("12345", "John", "Smith", "0000-0002-1825-0097"),  # Matching ID with Publication 12345
    Author("67890", "Emily", "Johnson", "0000-0003-1825-5678"),  # Matching ID with Publication 67890
    Author("99999", "James", "Taylor", "0000-0004-1825-3456")  # No matching publication ID
]

# Create a list for the DataFrame
data_for_df = []

# Map publications to authors based on the matching ID and combine into one entry
for pub in publications:
    for auth in authors:
        if pub.id == auth.id:  # Check if Publication ID matches Author ID
            # Create a unique DOI based on the matching ID
            doi = pub.id  # Use the publication ID as the unique DOI
            data_for_df.append({
                "doi": doi,
                "publication_id": pub.id,
                "publication_title": pub.title,
                "publication_type": pub.publication_type,
                "publication_year": pub.publication_year,
                "publication_venue": pub.publication_venue,
                "publication_publisher": pub.publisher,
                "author_id": auth.id,
                "author_given_name": auth.given_name,
                "author_family_name": auth.family_name,
                "author_orcid": auth.orcid
            })

# Create the DataFrame
df = pd.DataFrame(data_for_df)

# Display the DataFrame
print(df)


     doi publication_id            publication_title publication_type  \
0  12345          12345  Advanced Python Programming             Book   
1  67890          67890       Data Science in Python          Article   

   publication_year publication_venue publication_publisher author_id  \
0              2021    Tech Publisher       Tech Publishing     12345   
1              2020      Data Journal       Data Publishing     67890   

  author_given_name author_family_name         author_orcid  
0              John              Smith  0000-0002-1825-0097  
1             Emily            Johnson  0000-0003-1825-5678  


### Focus on the implementation of the query: search Publication by author's name input.
I start running the code using the sample sources: csv and json.

In [41]:
import pandas as pd
import json
from sqlalchemy import create_engine

class RelationalProcessor:
    def __init__(self, db_path):
        self.db_path = db_path
        self.engine = create_engine(f"sqlite:///{db_path}") 

    def upload_data(self, file_path):
        """
        Uploads data from a CSV or JSON file to the database.

        Args:
            file_path (str): Path to the file to be uploaded.

        Returns:
            bool: True if the upload was successful, False otherwise.
        """
        try:
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path, dtype={'publication_year': 'Int64'}) 
                df['internal_id'] = df.index  # Assign unique internal IDs

            elif file_path.endswith('.json'):
                with open(file_path, 'r') as f:
                    data = json.load(f)
                df = pd.json_normalize(data) 

            else:
                raise ValueError("Unsupported file type. Please provide a CSV or JSON file.")

            # Create tables in database (if not exist)
            df.to_sql('publications', self.engine, if_exists='replace', index=False) 

            return True
        except Exception as e:
            print(f"Error uploading data: {e}")
            return False

# Example usage
db_path = 'my_database.db' 
processor = RelationalProcessor(db_path)

csv_file_path = 'import/relational_publications.csv'
json_file_path = 'import/relational_other_data.json'
csv_file_path = 'import/graph_publications.csv'
json_file_path = 'import/graph_other_data.json'

if processor.upload_data(csv_file_path):
    print("CSV data uploaded successfully.")

if processor.upload_data(json_file_path):
    print("JSON data uploaded successfully.")


CSV data uploaded successfully.
Error uploading data: (sqlite3.InterfaceError) Error binding parameter 0 - probably unsupported type.
[SQL: INSERT INTO publications ("authors.doi:10.1016/j.websem.2021.100655", "authors.doi:10.1007/s10115-017-1100-y", "authors.doi:10.1016/j.websem.2014.03.003", "authors.doi:10.1093/nar/gkz997", "authors.doi:10.3390/publications7030050", "authors.doi:10.1017/s0269888920000065", "authors.doi:10.3390/info11030129", "authors.doi:10.1007/s00778-018-0528-3", "authors.doi:10.21105/joss.02731", "authors.doi:10.1016/j.websem.2014.06.002", "authors.doi:10.1007/s10115-019-01401-x", "authors.doi:10.1007/978-3-030-30793-6_22", "authors.doi:10.1007/978-3-030-33220-4_25", "authors.doi:10.1080/17538947.2020.1738568", "authors.doi:10.1007/s10462-020-09826-5", "authors.doi:10.1007/978-3-030-49461-2_25", "authors.doi:10.1007/s10462-020-09866-x", "authors.doi:10.1007/s10796-020-10035-2", "authors.doi:10.1007/978-3-030-60276-5_25", "authors.doi:10.1007/978-3-030-54956-5_2",

## Data Model
Then I create the data model for the Query Processors, adding classes, relations and methods.

In [42]:
import pandas as pd
from pandas import read_csv
from pandas import read_json
from pandas import read_sql
from pandas import merge
from pandas import concat
from pandas import Series
import sqlite3
import json
from sqlite3 import connect


# class for Identifiable Entity
class IdentifiableEntity(object):

    def __init__(self, id):
        self.id = id
        self.id_array = set()
        for identifier in id:
            self.id_array.add(identifier)

    # methods of identifiableEntity

    def getIds(self):
        result = []
        for identifier in self.id_array:
            result.append(identifier)
        result.sort()
        return result

    def addId(self, id):
        result = True
        if id not in self.id:
            self.id.add(id)
        else:
            result = False
        return result

    def removeId(self, identifier):
        result = True
        if identifier in self.id:
            self.id.remove(identifier)
        else:
            result = False
        return result


# class for Person
class Person(IdentifiableEntity):

    def __init__(self, id, givenName, familyName, orcid):
        super().__init__(id)
        self.givenName = givenName
        self.familyName = familyName

    # methods of person
    def getGivenName(self):
        return self.givenName

    def getFamilyName(self):
        return self.familyName
    
    def getOrcid(self):
        return self.orcid


# class for publication
class Publication(IdentifiableEntity):
    def __init__(self, id, publicationYear, title, publicationVenue, author, cites):
        self.publicationYear = publicationYear
        self.title = title
        self.publicationVenue = publicationVenue
        self.author = set(author)
        self.cites = set(cites)

        super().__init__(id)

    # methods of publications
    def getPublicationYear(self):
        return self.publicationYear

    def getTitle(self):
        return self.title

    def getPublicationVenue(self):
        return self.publicationVenue

    def getCitedPublications(self):
        self.cites = []
        for p in self.cites:
            self.id.add(p)

        return self.cites

    def getAuthors(self):
        self.author = []
        for p in self.author:
            self.author.add(p)

        return self.author


# class for journal article
class JournalArticle(Publication):
    def __init__(self, id, publicationYear, title, publicationVenue, author, cites, issue, volume):
        self.issue = issue
        self.volume = volume
        super().__init__(id, publicationYear, title, publicationVenue, author, cites)

    # methods of journal article
    def getIssue(self):
        return self.issue

    def getVolume(self):
        return self.volume


# class for book chapter
class BookChapter(Publication):
    def __init__(self, id, publicationYear, title, publicationVenue, author, cites, chapterNumber):
        self.chapterNumber = chapterNumber
        super().__init__(id, publicationYear, title, publicationVenue, author, cites)

    def getChapterNumber(self):
        return self.chapterNumber

class ProceedingsPaper(Publication):
    pass


# class for venue
class Venue(IdentifiableEntity):
    def __init__(self, id, title, publisher):
        self.title = title
        self.publisher = set(publisher)
        super().__init__(id)

    def getTitle(self):
        return self.title

    def getPublisher(self):
        return self.publisher


class Journal(Venue):
    pass


class Book(Venue):
    pass


# class for proceedings
class Proceedings(Venue):
    def __init__(self, id, title, publisher, event):
        self.event = event
        super().__init__(id, title, publisher)

    def getEvent(self):
        return self.event


# class for organization
class Organization(IdentifiableEntity):
    def __init__(self, id, name):
        self.name = name
        super().__init__(id)

    def getName(self):
        return self.name


class QueryProcessor(object):
    def __init__(self):
        pass


# classes for the processors
class RelationalProcessor(object):

    def __init__(self):
        self.dbPath = ''  # dbPath: the variable containing the path of the database,
        # initially set as an empty string, that will be updated with the method setDbPath.

    # Methods
    def getDbPath(self):  # it returns the path of the database.
        return self.dbPath

    def setDbPath(self, path):
        self.dbPath = path


class RelationalDataProcessor(RelationalProcessor):

    def __init__(self):
        super().__init__()

    def uploadData(self, path): #method for uploading data. Here, in case the file is not a csv nor a json
        self.path = path        #an exception should be raised
        result = True

        while True:
            try:

                if self.path.endswith('csv'):
                    with open(path, "r", encoding="utf-8") as file:
                        publications = pd.read_csv(file, keep_default_na=False,
                                                   dtype={
                                                       "id": "string",
                                                       "title": "string",
                                                       "type": "string",
                                                       "publication_year": "int",
                                                       "issue": "string",
                                                       "volume": "string",
                                                       "chapter": "string",
                                                       "publication_venue": "string",
                                                       "venue_type": "string",
                                                       "publisher": "string",
                                                       "event": "string"
                                                   })
                        #creating empty data frames to be populated with the information coming from csv file
                        journal_article = pd.DataFrame({
                            "internalId", "issue", "volume", "publication_year", "title", "publication_venue", "id"})

                        book_chapter = pd.DataFrame({
                            "internalId", "chapter_number", "publication_year", "title", "publication_venue", "id"})

                        journal = pd.DataFrame({
                            "internalId", "doi", "title", "publisher"})

                        book = pd.DataFrame({
                            "internalId", "doi", "title", "publisher"})

                        proceedings_paper = pd.DataFrame(
                            {'internalId', 'doi', 'title', 'publication_year', 'publication_venue'})

                        proceedings = pd.DataFrame({
                            "internalId", "title", "event", "id", "publisher"})

                        publications = publications.drop_duplicates()

                        pub_ids = pd.DataFrame(publications['id'])
                        #creating internal Ids
                        publications_internal_id = []

                        for idx, row in pub_ids.iterrows():
                            publications_internal_id.append("publications-" + str(idx))

                        publications['internalId'] = pd.Series(publications_internal_id)

                        # Data Frame for publications

                        publications_df = pd.DataFrame(
                            {'internalId', 'doi', 'title', 'publication_year', 'publication_venue', 'publisher'})
                        publications_df['internalId'] = publications['internalId']
                        publications_df['doi'] = publications['id'].astype('str')
                        publications_df['title'] = publications['title'].astype('str')
                        publications_df['type'] = publications['type'].astype('str')
                        publications_df['publication_year'] = publications['publication_year'].astype('int')
                        publications_df['publication_venue'] = publications['publication_venue'].astype('str')
                        publications_df['publisher'] = publications['publisher'].astype('str')

                        # Data Frame for journal article

                        journal_article['internalId'] = publications[publications['type'] == "journal-article"][
                            'internalId'].astype('str')
                        journal_article['doi'] = publications[publications['type'] == "journal-article"]['id'].astype(
                            'str')
                        journal_article['issue'] = publications[publications['type'] == "journal-article"][
                            'issue'].astype('str')
                        journal_article['volume'] = publications[publications['type'] == "journal-article"][
                            'volume'].astype('str')
                        journal_article['publication_year'] = publications[publications['type'] == "journal-article"][
                            'publication_year'].astype('str')
                        journal_article['publication_venue'] = publications[publications['type'] == "journal-article"][
                            'publication_venue'].astype('str')
                        journal_article['title'] = publications[publications['type'] == "journal-article"][
                            'title'].astype('str')

                        # Data Frame for Journal

                        journal['internalId'] = publications[publications['venue_type'] == "journal"][
                            'internalId'].astype('str')
                        journal['doi'] = publications[publications['venue_type'] == "journal"]['id'].astype(
                            'str')
                        journal['title'] = publications[publications['venue_type'] == "journal"][
                            'issue'].astype('str')
                        journal['publisher'] = publications[publications['venue_type'] == "journal"][
                            'volume'].astype('str')

                        # Data Frame for book chapter

                        book_chapter['internalId'] = publications[publications['type'] == "book-chapter"][
                            'internalId'].astype('str')
                        book_chapter['doi'] = publications[publications['type'] == "book-chapter"]['id'].astype('str')
                        book_chapter['chapter'] = publications[publications['type'] == "book-chapter"][
                            'chapter'].astype('str')
                        book_chapter['publication_year'] = publications[publications['type'] == "book-chapter"][
                            'publication_year'].astype('str')
                        book_chapter['publication_venue'] = publications[publications['type'] == "book-chapter"][
                            'publication_venue'].astype('str')
                        book_chapter['title'] = publications[publications['type'] == "book-chapter"][
                            'title'].astype('str')

                        # Data Frame for book

                        book['internalId'] = publications[publications['venue_type'] == "book"][
                            'internalId'].astype('str')
                        book['doi'] = publications[publications['venue_type'] == "book"]['id'].astype(
                            'str')
                        book['title'] = publications[publications['venue_type'] == "book"][
                            'issue'].astype('str')
                        book['publisher'] = publications[publications['venue_type'] == "book"][
                            'volume'].astype('str')

                        # Data Frame for Proceedings Paper

                        proceedings_paper['internalId'] = publications['internalId'].astype('str')
                        proceedings_paper['doi'] = publications['id'].astype('str')
                        proceedings_paper['title'] = publications['title'].astype('str')
                        proceedings_paper['publisher'] = publications['publisher'].astype('str')
                        proceedings_paper['event'] = publications['event'].astype('str')
                        proceedings_paper['publication_venue'] = publications['publication_venue'].astype('str')
                        proceedings_paper['publication_year'] = publications['publication_year'].astype('str')

                        # Data Frame for Proceedings 

                        proceedings['internalId'] = publications['internalId'].astype('str')
                        proceedings['doi'] = publications['id'].astype('str')
                        proceedings['title'] = publications['title'].astype('str')
                        proceedings['publisher'] = publications['publisher'].astype('str')
                        proceedings['event'] = publications['event'].astype('str')
                        proceedings['publication_venue'] = publications['publication_venue'].astype('str')

                    with connect(self.dbPath) as con:
                        publications_df.to_sql("Publications", con, if_exists="append", index=False)
                        journal_article.to_sql("JournalArticle", con, if_exists="append", index=False)
                        book_chapter.to_sql("BookChapter", con, if_exists="append", index=False)
                        journal.to_sql("Journal", con, if_exists="append", index=False)
                        book.to_sql("Book", con, if_exists="append", index=False)
                        proceedings_paper.to_sql("ProceedingsPaper", con, if_exists="append", index=False)
                        proceedings.to_sql("Proceedings", con, if_exists="append", index=False)

                    con.commit()

                elif self.path.endswith('.json'):

                    with open(path, "r", encoding="utf-8") as file:
                        venue = json.load(file)

                        # DataFrame for authors being populated
                        authors_df = pd.DataFrame({
                            "doi_authors": pd.Series(dtype="str"),
                            "family": pd.Series(dtype="str"),
                            "given": pd.Series(dtype="str"),
                            "orcid": pd.Series(dtype="str")
                        })

                        family = []
                        given = []
                        orcid = []
                        doi_authors = []

                        authors = venue['authors']
                        for key in authors:
                            for value in authors[key]:
                                doi_authors.append(key)
                                family.append(value['family'])
                                given.append(value['given'])
                                orcid.append(value['orcid'])

                        authors_df['doi_authors'] = doi_authors
                        authors_df['family'] = family
                        authors_df['given'] = given
                        authors_df['orcid'] = orcid
                        authors_df = authors_df.drop_duplicates()

                        # Data Frame for internal ID Venue

                        venues_id_df = pd.DataFrame({
                            "doi_venues_id": pd.Series(dtype="str"),
                            "issn_isbn": pd.Series(dtype="str"),
                        })
                        doi_venues_id = []
                        issn_isbn = []

                        venues_id = venue["venues_id"]
                        for key in venues_id:
                            for value in venues_id[key]:
                                doi_venues_id.append(key)
                                issn_isbn.append(value)

                        venues_id_df["doi_venues_id"] = doi_venues_id
                        venues_id_df["issn_isbn"] = pd.Series(issn_isbn)

                        venues_id_df = venues_id_df.drop_duplicates()

                        venue_int = []
                        for idx, row in venues_id_df.iterrows():
                            venue_int.append("venue-" + str(idx))

                        venues_id_df["internalId"] = venue_int

                        # Data Frame for references
                        references_df = pd.DataFrame({
                            "idCited": pd.Series(dtype="str"),
                            "idCites": pd.Series(dtype="str"),
                            "no": pd.Series(dtype="int64")
                        })

                        id_ref = []
                        id_ref_doi = []

                        references = venue["references"]

                        for key in references:
                            for value in references[key]:
                                id_ref.append(key)
                                id_ref_doi.append(value)

                        references_df["idCited"] = id_ref
                        references_df["idCites"] = id_ref_doi
                        references_df["no"] = references_df.index

                        references_df = references_df.drop_duplicates()

                        # Data Frame for publishers

                        publishers_df = pd.DataFrame({
                            "id_pub": pd.Series(dtype="str"),
                            "name": pd.Series(dtype="str")
                        })

                        doi_pub = []
                        name = []
                        id_pub = []

                        publishers = venue["publishers"]

                        for key in publishers:
                            for value in publishers[key]:
                                doi_pub.append(key)
                                id_pub.append(publishers[key]["id"])
                                name.append(publishers[key]["name"])

                        publishers_df["doi"] = doi_pub
                        publishers_df["id_pub"] = id_pub
                        publishers_df["name"] = name
                        publishers_df["doi_venue"] = venues_id_df["doi_venues_id"]
                        publishers_df = publishers_df.drop_duplicates()

                        publishers_df["internalId"] = venues_id_df['internalId']

                        authors_df["internalId"] = venues_id_df['internalId']

                        references_df["internalId"] = venues_id_df['internalId']

                        # Creating the tables into the database

                    with connect(self.dbPath) as con:

                        authors_df.to_sql("Authors", con, if_exists="append", index=False)
                        venues_id_df.to_sql("Venues", con, if_exists="append", index=False)
                        references_df.to_sql("ReferencesTable", con, if_exists="append", index=False)
                        publishers_df.to_sql("Publishers", con, if_exists="append", index=False)

                    con.commit()

                else:
                    result = False

            except ValueError:
                print("Oops! This doesn't seem a valid file.")
                result = False

            return result
   



Now we can configure the two Query Processors and add the specific queries for the case:
Search Publication by Author's name.

The primary key to be used in order to match the sources is **doi**, which is the common value of the entities: Publications and Authors.

Our aim is to get a list of Publications that has author's names matching the input field.
In particular we need to add methods to the query processors classes.

Classes RelationalQueryProcessor and TriplestoreQueryProcessor:

>## ***searchByAuthor(name : str) : DataFrame**
It returns a data frame with all the publications (i.e. the rows) that have been authored by people having the input string included either in their family name or given name. The match must be done lowercase and can be also partial, e.g. the input string “jo" will match with “Jo", “John", “Johnathan" and “Guido".

Classes for the GenericQueryProcessor:

>## **searchByAuthor(name : str) : list[Publication]**
It returns a list of Publication objects referring to all the publications (i.e. the rows) that have been authored by people having the input string included either in their family name or given name. The match must be done lowercase and can be also partial, e.g. the input string “jo" will match with “Jo", “John", “Johnathan" and “Guido".



> **getPublicationsByAuthorName**: It returns a list of Publication objects referring to all the publications that have been authored by the people having their name matching (in lowercase), even partially, with the name specified as input (e.g. "doe").

In [43]:
def load_author_data_from_json(self, json_file_path):
    with open(json_file_path, 'r') as f:
        author_data = json.load(f)
        print("DEBUG: Loaded JSON data:", author_data)  # Debug statement

    # Initialize author list
    author_list = []

    # Process each record in the JSON
    for record in author_data:
        if isinstance(record, str):
            # If record is a string, parse it
            record = json.loads(record)

        # Extract DOI and author information
        doi = record.get("doi", "")
        authors = record.get("authors", {})
        author_list.append({
            "publication_id": doi,
            "first_name": authors.get("given", ""),
            "last_name": authors.get("family", ""),
            "ORCID": authors.get("orcid", "")
        })

    # Convert to DataFrame
    author_df = pd.DataFrame(author_list)
    print("DEBUG: Prepared author DataFrame:", author_df.head())  # Debug statement

    # Load author



In [40]:
class RelationalQueryProcessor:
    def __init__(self, db_path):
        self.db_path = db_path

    def create_tables(self):
        conn = sqlite3.connect(self.db_path)
        cur = conn.cursor()

        # Create authors table
        cur.execute("""
            CREATE TABLE IF NOT EXISTS authors (
                publication_id TEXT,
                first_name TEXT,
                last_name TEXT,
                ORCID TEXT,
                FOREIGN KEY (publication_id) REFERENCES publications(publication_id)
            )
        """)
        conn.commit()
        conn.close()

    def load_author_data_from_json(self, json_file_path):
        with open(json_file, 'r') as f:
            author_data = json.load(f)

        # Extract author information from JSON 
        author_list = []
        for doi, author_list in author_data.items(): 
            for author in author_list:
                author_list.append({
                    "publication_id": doi,
                    "first_name": author.get("given", ""),
                    "last_name": author.get("family", ""),
                    "ORCID": author.get("orcid", "") 
                })
        author_df = pd.DataFrame(author_list)
        
        # Load author data to database
        author_df.to_sql("authors", self.db_path, index=False, if_exists='append')

    def searchByAuthorName(self, name):
        conn = sqlite3.connect(self.db_path)
        cur = conn.cursor()
        query = """
        SELECT p.publication_id, p.title, a.first_name, a.last_name
        FROM publications p
        JOIN authors a ON p.publication_id = a.publication_id
        WHERE LOWER(a.first_name) LIKE ? OR LOWER(a.last_name) LIKE ?
        """
        cur.execute(query, (f"%{name.lower()}%", f"%{name.lower()}%"))
        results = cur.fetchall()
        conn.close()
        return pd.DataFrame(results, columns=["publication_id", "title", "first_name", "last_name"])
    

class TriplestoreQueryProcessor:
    def __init__(self):
        self.graph = Graph()

    def load_rdf(self, publication_data, author_data):
        """
        Load publication and author data into an RDF graph.
        """
        
    # Method:
    store = f()
    def uploadData(self, path): # it enables to upload the collection of data specified in the input file path (either in CSV or JSON) into the database.
        dsd_graph = Graph()
    store.open((endpoint, endpoint))
    for triple in dsd_graph.triples((None, None, None)):
        store.add(triple)
    store.close()
       

# classes of resources
    JournalArticle = URIRef("https://schema.org/ScholarlyArticle")
    BookChapter = URIRef("https://schema.org/Chapter")
    ProceedingsPaper = URIRef("http://purl.org/spar/fabio/ProceedingsPaper")
    Journal = URIRef("https://schema.org/Periodical")
    Book = URIRef("https://schema.org/Book")
    Proceedings = URIRef("http://purl.org/spar/fabio/AcademicProceedings")
    Organization = URIRef("https://schema.org/Organization")
    IdentifiableEntity = URIRef("https://schema.org/identifier")
    Publication = URIRef("https://schema.org/publication")
    Venue = URIRef("https://schema.org/VenueMap")
    Person = URIRef("https://schema.org/Person")

        # attributes related to classes
    publicationYear = URIRef("https://schema.org/datePublished")
    title = URIRef("http://purl.org/dc/terms/title")
    issue = URIRef("https://schema.org/issueNumber")
    volume = URIRef("https://schema.org/volumeNumber")
    doi = URIRef("https://schema.org/identifier")
    identifier = URIRef("https://schema.org/identifier")
    name = URIRef("https://schema.org/name")
    event = URIRef("https://schema.org/Event")
    chapterNumber = URIRef("https://github.com/lelax/D_Sign_Data/blob/main/URIRef/chapterNumber")
    givenName = URIRef("https://schema.org/givenName")
    familyName = URIRef("https://schema.org/familyName")
    orcid = URIRef("https://schema.org/orcid")
    proceedingpapers = URIRef("https://schema.org/proceedingpapers")
    doiPublisher = URIRef("https://schema.org/doiPublisher")
    doiId = URIRef("https://schema.org/doiId")

        # relations among classes
    publicationVenue = URIRef("https://schema.org/isPartOf")
    publisher = URIRef("https://schema.org/publishedBy")
    author = URIRef("http://purl.org/saws/ontology#isWrittenBy")
    citation = URIRef("https://schema.org/citation")

        #literal
    a_string = Literal("a string ")
    a_number = Literal(4)
    a_boolean = Literal(True)

    base_url = "https://github.com/lelax/New_Design_Data/"

    def searchByAuthorName(self, name):
        query = """
        SELECT ?publication ?publicationTitle
        WHERE {
            ?publication dc:title ?publicationTitle.
            ?publication dc:creator ?author.
            ?author foaf:firstName ?firstName.
            ?author foaf:familyName ?lastName.
            FILTER(
                REGEX(?firstName, "{name}", "i") || 
                REGEX(?lastName, "{name}", "i")
            )
        }
        """.format(name=name)
        results = self.graph.query(query)
        # Process results and return in desired format (e.g., DataFrame)
        return results 

# Testing code
# Load data from CSV and JSON (replace with your file paths)
publications_df = pd.read_csv("import/relational_publications.csv")
author_data = json.load(open("import/relational_other_data.json")) 

# Create and populate database (RelationalProcessor)
relational_processor = RelationalQueryProcessor("publications.db")  

# Create RDF graph (TriplestoreQueryProcessor)
rdf_processor = TriplestoreQueryProcessor()
rdf_processor.load_rdf(publications_df, author_data) 

# Search for publications by author name
author_name = "input here"
relational_results = relational_processor.searchByAuthorName(author_name)
rdf_results = rdf_processor.searchByAuthorName(author_name)



TypeError: '_io.TextIOWrapper' object is not callable

In [24]:
import sqlite3
import pandas as pd
import json
from rdflib import Graph, Literal, URIRef, Namespace

# Relational Query Processor
class RelationalQueryProcessor:
    def __init__(self, db_path):
        self.db_path = db_path

    def create_tables(self):
        conn = sqlite3.connect(self.db_path)
        cur = conn.cursor()

        # Create publications table
        cur.execute("""
            CREATE TABLE IF NOT EXISTS publications (
                publication_id TEXT PRIMARY KEY,
                title TEXT
            )
        """)

        # Create authors table
        cur.execute("""
            CREATE TABLE IF NOT EXISTS authors (
                publication_id TEXT,
                first_name TEXT,
                last_name TEXT,
                ORCID TEXT,
                FOREIGN KEY (publication_id) REFERENCES publications(publication_id)
            )
        """)
        conn.commit()
        conn.close()

    def load_author_data_from_json(self, json_file_path):
        with open(json_file_path, 'r') as f:
            author_data = json.load(f)

        # Extract author information from JSON
        author_list = []
        for record in author_data:
            doi = record.get("doi")
            authors = record.get("authors", {})
            author_list.append({
                "publication_id": doi,
                "first_name": authors.get("given", ""),
                "last_name": authors.get("family", ""),
                "ORCID": authors.get("orcid", "")
            })

        author_df = pd.DataFrame(author_list)

        # Load author data to database
        conn = sqlite3.connect(self.db_path)
        author_df.to_sql("authors", conn, index=False, if_exists='append')
        conn.close()

    def searchByAuthorName(self, name):
        conn = sqlite3.connect(self.db_path)
        cur = conn.cursor()
        query = """
        SELECT p.publication_id, p.title, a.first_name, a.last_name
        FROM publications p
        JOIN authors a ON p.publication_id = a.publication_id
        WHERE LOWER(a.first_name) LIKE ? OR LOWER(a.last_name) LIKE ?
        """
        cur.execute(query, (f"%{name.lower()}%", f"%{name.lower()}%"))
        results = cur.fetchall()
        conn.close()
        return pd.DataFrame(results, columns=["publication_id", "title", "first_name", "last_name"])


# Triplestore Query Processor
class TriplestoreQueryProcessor:
    def __init__(self):
        self.graph = Graph()

    def load_rdf(self, publications, authors):
        base = Namespace("http://example.org/")
        self.graph.bind("ex", base)

        for index, row in publications.iterrows():
            pub_uri = base[row["id"]]
            self.graph.add((pub_uri, base.title, Literal(row["title"])))

        for author in authors:
            pub_uri = base[author["doi"]]
            author_uri = base[author["family"] + "_" + author["given"]]
            self.graph.add((pub_uri, base.author, author_uri))
            self.graph.add((author_uri, base.familyName, Literal(author["family"])))
            self.graph.add((author_uri, base.givenName, Literal(author["given"])))
            if "orcid" in author:
                self.graph.add((author_uri, base.orcid, Literal(author["orcid"])))

    def searchByAuthorName(self, name):
        query = f"""
        PREFIX ex: <http://example.org/>
        SELECT ?publication ?publicationTitle
        WHERE {{
            ?publication ex:title ?publicationTitle.
            ?publication ex:author ?author.
            ?author ex:givenName ?firstName.
            ?author ex:familyName ?lastName.
            FILTER(
                REGEX(?firstName, "{name}", "i") || 
                REGEX(?lastName, "{name}", "i")
            )
        }}
        """
        results = self.graph.query(query)
        return [{"publication": str(row.publication), "title": str(row.publicationTitle)} for row in results]


# Testing Code
if __name__ == "__main__":
    # Load data from CSV and JSON
    publications_df = pd.read_csv("import/relational_publications.csv")
    author_data = json.load(open("import/relational_other_data.json"))

    # Relational Processor
    relational_processor = RelationalQueryProcessor("publications.db")
    relational_processor.create_tables()
    relational_processor.load_author_data_from_json("import/relational_other_data.json")

    # RDF Processor
    rdf_processor = TriplestoreQueryProcessor()
    rdf_processor.load_rdf(publications_df, author_data)

    # Search by author name
    author_name = "Smith"  # Example input
    relational_results = relational_processor.searchByAuthorName(author_name)
    print("Relational DB Results:")
    print(relational_results)

    rdf_results = rdf_processor.searchByAuthorName(author_name)
    print("RDF Graph Results:")
    print(rdf_results)


AttributeError: 'str' object has no attribute 'get'

In [25]:
with open(json_file_path, 'r') as f:
    author_data = json.load(f)
    print("DEBUG: Loaded JSON data:", author_data)


DEBUG: Loaded JSON data: {'authors': {'doi:10.1016/j.websem.2021.100655': [{'family': 'Espinoza-Arias', 'given': 'Paola', 'orcid': '0000-0002-3938-2064'}, {'family': 'Garijo', 'given': 'Daniel', 'orcid': '0000-0003-0454-7145'}, {'family': 'Corcho', 'given': 'Oscar', 'orcid': '0000-0002-9260-0753'}], 'doi:10.1007/s10115-017-1100-y': [{'family': 'Diefenbach', 'given': 'Dennis', 'orcid': '0000-0002-0046-2219'}], 'doi:10.1016/j.websem.2014.03.003': [{'family': 'Groth', 'given': 'Paul', 'orcid': '0000-0003-0183-6910'}, {'family': 'Gray', 'given': 'Alasdair J.G.', 'orcid': '0000-0002-5711-4872'}, {'family': 'Harland', 'given': 'Lee', 'orcid': '0000-0003-0461-0028'}], 'doi:10.1093/nar/gkz997': [{'family': 'Shefchek', 'given': 'Kent A', 'orcid': '0000-0001-6439-2224'}, {'family': 'Vasilevsky', 'given': 'Nicole', 'orcid': '0000-0001-5208-3432'}, {'family': 'Balhoff', 'given': 'James P', 'orcid': '0000-0002-8688-6599'}, {'family': 'Jupp', 'given': 'Simon', 'orcid': '0000-0002-0643-3144'}, {'fami