In [None]:
pip install semanticscholar

In [None]:
pip install pandas

In [1]:
from semanticscholar import SemanticScholar
import csv
import pandas as pd
import random
import json


In [2]:
# Search queries to fetch papers for; the fetch cell iterates over this list.
# Only "machine learning" is active -- the other topics are commented out.
topics = [
    "machine learning"
    # , "artificial intelligence", "computer vision",
    # "quantum mechanics", "cancer treatment", "COVID-19", 
    # "ancient civilizations", "economics", "psychology", "marketing",
    # "music", "film", "literature", "education"
]


In [3]:
# Shared Semantic Scholar client, created with default arguments (no API key is passed).
sch = SemanticScholar()

In [4]:

# Accumulates the raw records returned for every topic in one flat list.
all_papers = []

for topic in topics:
    try:
        print(f"Fetching papers for: {topic}")
        # Bulk search; only the raw payload exposed by the result object is kept.
        all_papers += sch.search_paper(query=topic, bulk=True).raw_data
    except Exception as e:
        # Best effort: report the failing topic and carry on with the next one.
        print(f"Error fetching papers for {topic}: {e}")



Fetching papers for: machine learning


In [5]:
# Number of raw paper records collected by the fetch cell.
len(all_papers)

1000

In [6]:
# Write the collected records to disk as indented UTF-8 JSON.
with open("semantic_scholar_combined_results.json", "w", encoding="utf-8") as json_out:
    json.dump(all_papers, json_out, indent=4)

In [7]:
# Read the saved records back from disk, replacing the in-memory list.
with open("semantic_scholar_combined_results.json", "r", encoding="utf-8") as json_in:
    all_papers = json.load(json_in)

In [8]:
# Alias: the export cell below iterates over `response`.
response = all_papers

In [9]:
# Build the relational CSV exports (papers, authors, fields of study, journals,
# conferences, conference editions) from the raw paper records in `response`,
# generate synthetic earlier-year copies of conference/journal papers, and merge
# original + synthetic papers into papers.csv.
csv_filename = "papers_og.csv"

papers_data = []  # Synthetic (earlier-year) copies of conference/journal papers
author_paper_relationship = []  # List to store paper-author relationship
paper_field_relationship = []  # List to store paper-field relationship

filtered_responses = []  # Raw records that passed the completeness filter
author_data = []  # List to store author names and IDs
field_data = {}  # Dictionary to store unique fields and their IDs
field_id_counter = 1  # Counter for assigning unique field IDs


Journals_data = []
Conferences_data = []
Conf_editions = []

# Column order of papers_synth.csv. Passing it explicitly keeps the header row even
# when no synthetic paper was generated, so the read_csv further down cannot fail
# on an empty file.
SYNTHETIC_PAPER_COLUMNS = [
    "Paper_ID", "DOI", "Title", "Abstract", "Venue", "publicationType",
    "venue_id", "journal_name", "pages", "volume", "Edition_id", "Year",
    "publicationDate", "Authors", "Author_Ids", "URL", "s2FieldsOfStudys",
    "Corresponding_Author_ID"
]


def _is_missing(value):
    """Return True when a scalar field is absent: None, "" or the "Unknown" placeholder."""
    return value is None or value == "Unknown" or value == ""


# Open CSV file for writing
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Header row of papers_og.csv
    writer.writerow([
        "Paper_ID", "DOI", "Title", "Abstract", "Venue", "publicationType", 
        "publication_type_2", "venue_id", "journal_name", "pages", "volume", "Edition_id", "Year", 
        "publicationDate", "Citations", "References", "Authors", "Author_Ids", 
        "URL", "s2FieldsOfStudys", "Corresponding_Author_ID"
    ])

    # Write data rows
    for paper in response:
        publication_venue = paper.get("publicationVenue") or {}
        journal = paper.get("journal") or {}

        paper_id = paper.get("paperId", "Unknown")
        # externalIds may be absent or null; treat both as "no DOI".
        doi = (paper.get("externalIds") or {}).get("DOI", "Unknown")
        title = paper.get("title", "Unknown")
        abstract = paper.get("abstract", "")
        venue = paper.get("venue", "Unknown")
        publication_type = publication_venue.get("type", "Unknown")
        venue_id = publication_venue.get("id", "Unknown")
        publication_type_2 = paper.get("publicationTypes", [])
        journal_name = journal.get("name", "Unknown")
        pages = journal.get("pages", "Unknown")
        volume = journal.get("volume", "Unknown")

        year = paper.get("year", "2025")
        publicationDate = paper.get("publicationDate", "Unknown")
        citation_count = paper.get("citationCount", "Unknown")
        reference_count = paper.get("referenceCount", "Unknown")
        url = paper.get("url", "Unknown")

        # Fields of study: a set for the relationship tables, a sorted string for the CSV.
        fields_of_study = {
            s2FieldsOfStudy.get('category', "Unknown")
            for s2FieldsOfStudy in (paper.get("s2FieldsOfStudy") or [])
        }
        s2FieldsOfStudys = ', '.join(sorted(fields_of_study))

        # Extract authors and author IDs
        paper_authors = paper.get("authors") or []
        authors = [author.get('name', 'Unknown') for author in paper_authors]
        author_ids = [author.get('authorId', 'Unknown') for author in paper_authors]

        # Completeness filter. None counts as missing, so a null year can never
        # reach int(year) below.
        if any(_is_missing(value) for value in [doi, title, venue_id, publication_type, journal_name, year]):
            continue
        # Authors are checked element-wise: comparing the lists themselves against
        # "Unknown" can never match.
        if not authors or any(_is_missing(value) for value in authors + author_ids):
            continue

        authors_str = ", ".join(f"{name}" for name in authors)
        author_ids_str = ", ".join(f"{identifier}" for identifier in author_ids)

        # Store authors and their IDs in the list for later use
        for author_name, author_id in zip(authors, author_ids):
            author_data.append({"Author_Name": author_name, "Author_ID": author_id})

        # Mark the first author as the corresponding author
        corresponding_author_id = author_ids[0]

        # Create the paper-author relationship for each author
        for author_id in author_ids:
            is_corresponding = (author_id == corresponding_author_id)
            author_paper_relationship.append({"DOI": doi, "Author_ID": author_id, "Corresponding": is_corresponding})

        # Assign unique IDs to fields of study (sorted so the IDs are reproducible
        # from run to run) and create the paper-field relationship.
        for field in sorted(fields_of_study):
            if field not in field_data:
                field_data[field] = field_id_counter
                field_id_counter += 1
            paper_field_relationship.append({"DOI": doi, "Field_ID": field_data[field]})

        if publication_type == "conference":
            Conferences_data.append({"ID": venue_id, "Name": venue, "url": url})
            # Edition of the real paper; the synthetic loop below uses its own
            # variable so this value is what gets written for the original row.
            paper_edition = f"{year}{venue_id}"
            Conf_editions.append({
                    "Edition_ID": paper_edition,
                    "Venue_ID": f"{venue_id}",
                    "Conference_Edition_Name": f"{year} {venue}",
                    "Year": year
            })
            # Generate synthetic editions (and paper copies) for the 4 preceding years
            for i in range(1, 5):
                edition_year = int(year) - i
                synthetic_edition = f"{edition_year}{venue_id}"

                Conf_editions.append({
                    "Edition_ID": synthetic_edition,
                    "Venue_ID": f"{venue_id}",
                    "Conference_Edition_Name": f"{edition_year} {venue}",
                    "Year": edition_year
                })
                synthetic_paper_id = f"{edition_year}{paper_id}"
                synthetic_doi = f"{edition_year}{doi}"
                synthetic_title = f"{edition_year} {title}"

                papers_data.append({
                    "Paper_ID":synthetic_paper_id, "DOI":synthetic_doi, "Title":synthetic_title, "Abstract":abstract,
                    "Venue":venue, "publicationType":publication_type, 
                    "venue_id":venue_id, "journal_name":journal_name, "pages":pages, "volume":volume, 
                    "Edition_id":synthetic_edition, "Year":edition_year, "publicationDate":publicationDate, 
                    "Authors":authors_str, "Author_Ids":author_ids_str, "URL":url, 
                    "s2FieldsOfStudys":s2FieldsOfStudys, "Corresponding_Author_ID":corresponding_author_id
                })
                # The synthetic copy keeps the authors and fields of the real paper
                for author_id in author_ids:
                    author_paper_relationship.append({"DOI": synthetic_doi, "Author_ID": author_id, "Corresponding": author_id == corresponding_author_id})
                for field in sorted(fields_of_study):
                    paper_field_relationship.append({"DOI": synthetic_doi, "Field_ID": field_data[field]})

        elif publication_type == "journal":
            Journals_data.append({
                "ID":venue_id, 
                "Name":publication_venue.get("name", "Unknown"), 
                "issn":publication_venue.get("issn", "Unknown"), 
                "url":publication_venue.get("url", "Unknown")
            })
            paper_edition = "N/A"
            # Generate synthetic copies for the 2 preceding years
            for i in range(1, 3):
                journal_year = int(year) - i
                synthetic_paper_id = f"{journal_year}{paper_id}"
                synthetic_doi = f"{journal_year}{doi}"
                synthetic_title = f"{journal_year} {title}"

                papers_data.append({
                    "Paper_ID":synthetic_paper_id, "DOI":synthetic_doi, "Title":synthetic_title, "Abstract":abstract,
                    "Venue":venue, "publicationType":publication_type, 
                    "venue_id":venue_id, "journal_name":journal_name, "pages":pages, "volume":volume, 
                    "Edition_id":paper_edition, "Year":journal_year, "publicationDate":publicationDate, 
                    "Authors":authors_str, "Author_Ids":author_ids_str, "URL":url, 
                    "s2FieldsOfStudys":s2FieldsOfStudys, "Corresponding_Author_ID":corresponding_author_id
                })
                for author_id in author_ids:
                    author_paper_relationship.append({"DOI": synthetic_doi, "Author_ID": author_id, "Corresponding": author_id == corresponding_author_id})
                for field in sorted(fields_of_study):
                    paper_field_relationship.append({"DOI": synthetic_doi, "Field_ID": field_data[field]})
        else:
            paper_edition = "unknown"

        filtered_responses.append(paper)
        writer.writerow([paper_id, doi, title, abstract, venue, publication_type, publication_type_2, venue_id, journal_name, pages, volume, paper_edition, year, publicationDate, citation_count, reference_count, authors_str, author_ids_str, url, s2FieldsOfStudys, corresponding_author_id])

# De-duplicated tables built from the accumulated rows
author_df = pd.DataFrame(author_data).drop_duplicates()
author_paper_df = pd.DataFrame(author_paper_relationship).drop_duplicates()
field_df = pd.DataFrame(list(field_data.items()), columns=["Field_Name", "Field_ID"]).drop_duplicates()
paper_field_df = pd.DataFrame(paper_field_relationship).drop_duplicates()
editions_df = pd.DataFrame(Conf_editions).drop_duplicates()
conferences_df = pd.DataFrame(Conferences_data).drop_duplicates()
journals_df = pd.DataFrame(Journals_data).drop_duplicates()

papers_data_df = pd.DataFrame(papers_data, columns=SYNTHETIC_PAPER_COLUMNS).drop_duplicates()

papers_data_df.to_csv("papers_synth.csv", index=False)
# Save to CSV
journals_df.to_csv("journals.csv", index=False)
editions_df.to_csv("conference_editions.csv", index=False)
conferences_df.to_csv("conferences.csv", index=False)
author_df.to_csv("authors.csv", index=False)
author_paper_df.to_csv("author_paper_relationship.csv", index=False)
field_df.to_csv("fields_of_study.csv", index=False)
paper_field_df.to_csv("paper_field_relationship.csv", index=False)

print(f"CSV file '{csv_filename}' has been created successfully.")
print("Author data has been saved to 'authors.csv'.")
print("Paper-Author relationship data has been saved to 'author_paper_relationship.csv'.")
print("Field data has been saved to 'fields_of_study.csv'.")
print("Paper-Field relationship data has been saved to 'paper_field_relationship.csv'.")

# Re-read both paper files and stack them (synthetic rows first). Columns that
# exist only in papers_og.csv are left empty for the synthetic rows.
df1 = pd.read_csv("papers_synth.csv")
df2 = pd.read_csv("papers_og.csv")

# Merge (concatenate) them
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged CSV
merged_df.to_csv("papers.csv", index=False)

print("CSV files merged successfully!")

CSV file 'papers_og.csv' has been created successfully.
Author data has been saved to 'authors.csv'.
Paper-Author relationship data has been saved to 'author_paper_relationship.csv'.
Field data has been saved to 'fields_of_study.csv'.
Paper-Field relationship data has been saved to 'paper_field_relationship.csv'.
CSV files merged successfully!


In [10]:
# Create synthetic citation edges: every paper published in one of two chosen
# journals cites each of (up to) five randomly picked papers from outside them.

# Load the dataset
df = pd.read_csv("papers.csv")

# Define the journal names (Replace with actual journal names)
journal_1 = "Algorithms"
journal_2 = "Applied Sciences"

# Filter papers from the two journals
journal_papers = df[df["journal_name"].isin([journal_1, journal_2])]

# Select up to 5 random papers that are NOT from these journals. Capping the
# sample size avoids the ValueError pandas raises when fewer than 5 rows exist.
candidate_papers = df[~df["DOI"].isin(journal_papers["DOI"])]
random_papers = candidate_papers.sample(n=min(5, len(candidate_papers)), random_state=42)

# Assign references: each journal paper (Paper_DOI) references every randomly
# selected paper (Reference_DOI).
references = [
    {"Paper_DOI": journal_doi, "Reference_DOI": ref_doi}
    for ref_doi in random_papers["DOI"]
    for journal_doi in journal_papers["DOI"]
]

# Convert to DataFrame. Explicit columns keep the header row even when no edge
# was produced, so the file can still be read back with read_csv.
references_df = pd.DataFrame(references, columns=["Paper_DOI", "Reference_DOI"])

# Save as CSV
references_df.to_csv("references_synth.csv", index=False)

print("References CSV created successfully!")


References CSV created successfully!


In [11]:
# paper=sch.get_paper('10.1109/ICCNS58795.2023.10193141')
# # paper=sch.get_paper('00000c33779acab142af6c7a6dae8b36fac0805d')
# print(json.dumps(paper.raw_data, indent=4))

For each paper's references, also retrieve the metadata needed to store the referenced papers themselves.

In [12]:
# For (up to) the first 500 exported papers, fetch the paper from Semantic Scholar,
# keep a random-sized prefix of its reference list, and export:
#   * references_og.csv      -- (Paper_DOI, Reference_DOI) citation edges
#   * referenced_papers.csv  -- metadata of every referenced paper
#   * ref_* CSVs             -- authors / fields of the referenced papers
# Journals, conferences, editions and fields found on the way are appended to the
# collections created by the paper-export cell and their CSVs are rewritten.

# Initialize lists for storing relationships and field data
ref_author_paper_relationship = []  # List to store paper-author relationship
ref_paper_field_relationship = []  # List to store paper-field relationship
ref_author_data = []  # List to store author names and IDs

paper_ids_arr = [
    (paper.get("externalIds") or {}).get("DOI")
    for paper in filtered_responses
    if (paper.get("externalIds") or {}).get("DOI")
]

# Local, seeded generator so the number of references kept per paper is
# reproducible across runs without touching the global `random` state.
reference_rng = random.Random(42)

# Open CSV files for writing (UTF-8, matching the paper export)
with open("references_og.csv", "w", newline="", encoding="utf-8") as ref_file, \
        open("referenced_papers.csv", "w", newline="", encoding="utf-8") as details_file:
    ref_writer = csv.writer(ref_file)
    details_writer = csv.writer(details_file)

    # Write headers for both CSV files
    ref_writer.writerow(["Paper_DOI", "Reference_DOI"])
    details_writer.writerow([
       "Paper_ID", "DOI", "Title", "Abstract", "Venue", "publicationType", 
        "publication_type_2", "venue_id", "journal_name", "pages", "volume", "Edition_id", "Year", 
        "publicationDate", "Citations", "References", "Authors", "Author_Ids", 
        "URL", "s2FieldsOfStudys", "Corresponding_Author_ID"
    ])

    # Iterate over each DOI from the paper list (citing papers) / try with first 500 now
    for doi in paper_ids_arr[:500]:
        try:
            paper = sch.get_paper(doi)  # Fetch paper details
        except Exception as e:
            # One failing lookup must not abort the remaining papers.
            print(f"Error fetching references for {doi}: {e}")
            continue

        if not (paper and hasattr(paper, "references")):  # Ensure valid response
            continue

        paper_references = paper.references or []
        ref_count = getattr(paper, "referenceCount", None)
        if not isinstance(ref_count, int):
            # No usable count reported; fall back to what was actually returned.
            ref_count = len(paper_references)

        # Decide how many references to keep for this paper
        if ref_count < 1:
            n = 0  # No references available
        elif ref_count < 5:
            n = reference_rng.randint(1, ref_count)
        elif ref_count < 10:  # 5 <= ref_count < 10
            n = reference_rng.randint(5, 6)
        else:  # ref_count >= 10
            n = reference_rng.randint(7, 10)

        top_references = paper_references[:n]  # Get top n references

        for reference in top_references:
            ref_doi = reference.externalIds["DOI"] if reference.externalIds and "DOI" in reference.externalIds else "Unknown"
            if ref_doi == "Unknown":
                continue  # Skip if DOI is unknown

            ref_publication_venue = getattr(reference, "publicationVenue", None)
            ref_journal = getattr(reference, "journal", None)

            # Extract reference details
            ref_paper_id = getattr(reference, "paperId", "Unknown")
            ref_title = getattr(reference, "title", "Unknown")
            ref_abstract = getattr(reference, "abstract", "")
            ref_venue = getattr(reference, "venue", "Unknown")
            ref_publication_type = getattr(ref_publication_venue, "type", "Unknown")
            ref_venue_id = getattr(ref_publication_venue, "id", "Unknown")

            ref_publication_type_2 = getattr(reference, "publicationTypes", [])
            ref_journal_name = getattr(ref_journal, "name", "Unknown")
            ref_pages = getattr(ref_journal, "pages", "Unknown")
            ref_volume = getattr(ref_journal, "volume", "Unknown")
            ref_year = getattr(reference, "year", "Unknown")
            ref_publication_date = getattr(reference, "publicationDate", "Unknown")
            ref_citation_count = getattr(reference, "citationCount", "Unknown")
            ref_reference_count = getattr(reference, "referenceCount", "Unknown")
            ref_url = getattr(reference, "url", "Unknown")

            # Fields of study: the attribute may be None, so fall back to an empty list.
            fields_of_study = {
                s2fs.get('category', "Unknown")
                for s2fs in (getattr(reference, "s2FieldsOfStudy", None) or [])
            }
            ref_s2_fields_of_study = ', '.join(sorted(fields_of_study))

            # Format authors (the attribute may be None as well)
            ref_author_objects = getattr(reference, "authors", None) or []
            ref_authors = [str(getattr(author, 'name', 'Unknown')) for author in ref_author_objects]
            ref_authors_ids = [str(getattr(author, 'authorId', 'Unknown')) for author in ref_author_objects]

            # The first listed author is treated as the corresponding author
            ref_corresponding_author_id = ref_authors_ids[0] if ref_authors_ids else "Unknown"

            # Handle authors and relationships
            for author_name, author_id in zip(ref_authors, ref_authors_ids):
                ref_author_data.append({"Author_Name": author_name, "Author_ID": author_id})
                ref_author_paper_relationship.append({"DOI": ref_doi, "Author_ID": author_id, "Corresponding": (author_id == ref_corresponding_author_id)})

            # Assign unique IDs to new fields (sorted for reproducible IDs) and
            # create the paper-field relationship.
            for field in sorted(fields_of_study):
                if field not in field_data:
                    field_data[field] = field_id_counter
                    field_id_counter += 1
                ref_paper_field_relationship.append({"DOI": ref_doi, "Field_ID": field_data[field]})

            # Venue rows are keyed by the reference's own venue id and name, not
            # by leftovers from the paper-export loop.
            if ref_publication_type == "journal":
                Journals_data.append({
                    "ID":ref_venue_id, 
                    "Name":getattr(ref_publication_venue, "name", "Unknown"),
                    "issn":getattr(ref_publication_venue, "issn", "Unknown"),
                    "url":getattr(ref_publication_venue, "url", "Unknown")
                    })
                paper_edition = "N/A"
            elif ref_publication_type == "conference":
                Conferences_data.append({
                    "ID":ref_venue_id, 
                    "Name":getattr(ref_publication_venue, "name", "Unknown"),
                    "url":getattr(ref_publication_venue, "url", "Unknown")
                    })
                paper_edition=f"{ref_year}{ref_venue_id}"
                Conf_editions.append({
                        "Edition_ID": paper_edition,
                        "Venue_ID": ref_venue_id,
                        "Conference_Edition_Name": f"{ref_year} {ref_venue}",
                        "Year": ref_year
                })
            else:
                paper_edition = "Unknown"

            # Citation edge: `doi` cites `ref_doi`
            ref_writer.writerow([doi, ref_doi])

            # Write the reference data to CSV
            details_writer.writerow([
                ref_paper_id, ref_doi, ref_title, ref_abstract, ref_venue, ref_publication_type, 
                ref_publication_type_2, ref_venue_id, ref_journal_name, ref_pages, ref_volume, paper_edition, ref_year, 
                ref_publication_date, ref_citation_count, ref_reference_count, 
                ", ".join(ref_authors), ", ".join(ref_authors_ids), ref_url, ref_s2_fields_of_study, 
                 ref_corresponding_author_id 
            ])

# Create DataFrames for authors, paper-author, fields, and paper-field relationships
author_df = pd.DataFrame(ref_author_data).drop_duplicates()
author_paper_df = pd.DataFrame(ref_author_paper_relationship).drop_duplicates()
field_df = pd.DataFrame(list(field_data.items()), columns=["Field_Name", "Field_ID"]).drop_duplicates()
paper_field_df = pd.DataFrame(ref_paper_field_relationship).drop_duplicates()
editions_df = pd.DataFrame(Conf_editions).drop_duplicates()
conferences_df = pd.DataFrame(Conferences_data).drop_duplicates()
journals_df = pd.DataFrame(Journals_data).drop_duplicates()
# Venue tables are rewritten because they now also contain the references' venues
journals_df.to_csv("journals.csv", index=False)
editions_df.to_csv("conference_editions.csv", index=False)
conferences_df.to_csv("conferences.csv", index=False)
# Save the DataFrames to CSV files
author_df.to_csv("ref_authors.csv", index=False)
author_paper_df.to_csv("ref_author_paper_relationship.csv", index=False)
field_df.to_csv("fields_of_study.csv", index=False)
paper_field_df.to_csv("ref_paper_field_relationship.csv", index=False)

print("Data has been written to 'referenced_papers.csv' and other CSV files.")
print("Author data has been saved to 'ref_authors.csv'.")
print("Paper-Author relationship data has been saved to 'ref_author_paper_relationship.csv'.")
print("Field data has been saved to 'fields_of_study.csv'.")
print("Paper-Field relationship data has been saved to 'ref_paper_field_relationship.csv'.")


Data has been written to 'referenced_papers.csv' and other CSV files.
Author data has been saved to 'ref_authors.csv'.
Paper-Author relationship data has been saved to 'ref_author_paper_relationship.csv'.
Field data has been saved to 'fields_of_study.csv'.
Paper-Field relationship data has been saved to 'ref_paper_field_relationship.csv'.


In [13]:
# Stack the API-derived citation edges (first) and the synthetic ones (second)
# into a single references.csv with a fresh 0..n-1 index.
df1, df2 = (pd.read_csv(path) for path in ("references_og.csv", "references_synth.csv"))

merged_df = pd.concat([df1, df2], ignore_index=True)
merged_df.to_csv("references.csv", index=False)

print("CSV files merged successfully!")

CSV files merged successfully!
