This file generates fake data for the initial graph using Faker. This is then stored in csv file ready to be uploaded to neo4j.

* Nodes:
    * Author: { Id:ID, name, email }
    * Keyword: { Id:ID, keyword }
    * Conference_Workshop: { Id:ID, name, type, city, edition:INT, date, year:INT, month:INT, day:INT }
    * Journal: { Id:ID, name, volume:INT, year:INT }
    * Paper: { Id:ID, title, abstract, date, year:INT, month:INT, day:INT, pages:INT, doi, type }
* Relationships:
    * WRITTEN_BY: (:Author)-[:WRITTEN_BY]->(:Paper)
    * CORRESPONDING_AUTHOR: (:Author)-[:CORRESPONDING_AUTHOR]->(:Paper)
    * PUBLISHED_IN: (:Paper)-[:PUBLISHED_IN]->(:Journal|:Conference|:Workshop)
    * CITES: (:Paper)-[:CITES]->(:Paper)
    * HAS_KEYWORD: (:Paper)-[:HAS_KEYWORD]->(:Keyword)
    * REVIEWED_BY: (:Review)-[:REVIEWED_BY]->(:Author)

In [98]:
import csv
import random
from faker import Faker
from datetime import datetime
import calendar
from tqdm import tqdm

In [99]:
fake = Faker()

# Define the number of each entity to generate
NUM_AUTHORS = 100
NUM_PAPERS = 5000
NUM_CONFERENCES = 5
NUM_JOURNALS = 5
NUM_KEYWORDS = 1000

START_YEAR = 2010
END_YEAR = 2023
START_DATE = datetime(START_YEAR, 1, 1)
END_DATE = datetime(END_YEAR, 12, 31)

In [100]:
fake = Faker()

# Define the number of each entity to generate
NUM_AUTHORS = 100
NUM_PAPERS = 5000
NUM_CONFERENCES = 5
NUM_JOURNALS = 2
NUM_KEYWORDS = 500

START_YEAR = 2015
END_YEAR = 2023
START_DATE = datetime(START_YEAR, 1, 1)
END_DATE = datetime(END_YEAR, 12, 31)

In [101]:
# Generate Authors
authors = []
for _ in range(NUM_AUTHORS):
    authors.append({
        'Id:ID': fake.uuid4(),
        'name': fake.name(),
        'email': fake.email()
    })

# Generate Keywords
keywords = []
for _ in range(NUM_KEYWORDS):
    keywords.append({
        'Id:ID': fake.uuid4(),
        'keyword': fake.word()
    })

print("keywords generated")

# Generate Conferences and Workshops 
conference_workshop = []
for _ in range(NUM_CONFERENCES):
    type = random.choice(['Conference', 'Workshop'])
    name = f"{fake.company()} {type}"
    city = fake.city()
    month = random.randint(1, 12)
    edition = random.randint(1, 100)
    
    # Create editions held in the same city and month
    for year in range(START_YEAR, END_YEAR):
        # Get the number of days in the month
        num_days = calendar.monthrange(year, month)[1]
        day = random.randint(1, num_days)
        
        # Create the full date
        full_date = datetime(year, month, day).strftime('%Y-%m-%d')
        
        conference_workshop.append({
            'Id:ID': fake.uuid4(),
            'name': name,
            'type': type,
            'city': city,
            'edition:INT': edition,
            'date': full_date,
            'year:INT': year,
            'month:INT': month,
            'day:INT': day
        })
        edition += 1

print("conferences and workshops generated")

# Create journals
journals = []
for _ in range(NUM_JOURNALS):
    name= f"{fake.company()} Journal"
    volume = random.randint(1, 100) # Starting volume

    # Create volumes (1-5 per year)
    volumes_per_year = random.randint(1, 5)
    for year in range(START_YEAR, END_YEAR):
        for yearly_volume in range(volumes_per_year):
            journals.append({
                'Id:ID': fake.uuid4(),
                'name': name,
                'volume:INT': volume,
                'year:INT': year
            })
            volume += 1

print("journals generated")

# Generate Papers
papers = []
written_by = []
corresponding_author = []
published_in = []
journal_author_map = {}
conference_author_map = {}
for _ in tqdm(range(NUM_PAPERS), desc="Generating Papers", unit="paper"):
    paper_id = fake.uuid4()
    num_authors = random.randint(1, 5)
    authors_for_paper = random.sample(authors, num_authors)
    date = fake.date_time_between_dates(START_DATE, END_DATE)
    year = date.year
    month = date.month
    day = date.day
    paper = {
        'Id:ID': paper_id,
        'title': fake.sentence(nb_words=6),
        'abstract': fake.text(),
        'date': date,
        'year:INT': year,
        'month:INT': month,
        'day:INT': day,
        'pages:INT': fake.random_int(min=1, max=20),
        'doi': fake.uuid4(),
        'type': random.choice(['Conference', 'Workshop', 'Journal']),
    }
    
    # Assign authors
    for author in authors_for_paper:
        written_by.append({
            ':START_ID': author['Id:ID'],
            ':END_ID': paper_id
        })

    # Assign corresponding author
    corresponding_author.append({
        ':START_ID': random.choice(authors_for_paper)['Id:ID'],
        ':END_ID': paper_id
    })
    
    # Assign publication 
    if paper['type'] in ['Conference', 'Workshop']:
        candidates = [c for c in conference_workshop if c['type'] == paper['type']]
    else:
        candidates = journals

    # Assign paper to conference/workshop or journal
    publication_id = random.choice(candidates)
    published_in.append({
        ':START_ID': paper_id,
        ':END_ID': publication_id['Id:ID']
    })

    # Create journal/conference author maps to speed up review assignment
    if paper['type'] in ['Conference', 'Workshop']:
        if publication_id['Id:ID'] not in conference_author_map:
            conference_author_map[publication_id['Id:ID']] = set()
        conference_author_map[publication_id['Id:ID']].update([a['Id:ID'] for a in authors_for_paper])
    else:
        if publication_id['Id:ID'] not in journal_author_map:
            journal_author_map[publication_id['Id:ID']] = set()
        journal_author_map[publication_id['Id:ID']].update([a['Id:ID'] for a in authors_for_paper]) 

    
    papers.append(paper)

print("papers generated")

# Generate Citations (Paper to Paper relationships)
cites = []
for paper in papers:
    num_citations = random.randint(0, 15)
    cited_papers = random.sample(papers, num_citations)
    for cited_paper in cited_papers:
        # Ensure papers can't cite themselves or papers publised after them
        if paper['Id:ID'] != cited_paper['Id:ID'] and paper['date'] > cited_paper['date']:
            cites.append({
                ':START_ID': paper['Id:ID'],
                ':END_ID': cited_paper['Id:ID']
            })

print("citations generated")

# Generate Paper Keywords
has_keyword = []
for paper in papers:
    num_keywords = random.randint(1, 20)
    has_keyword.extend([{
        ':START_ID': paper['Id:ID'],
        ':END_ID': keyword['Id:ID']
    } for keyword in random.sample(keywords, num_keywords)])

print("paper keywords generated")

keywords generated
conferences and workshops generated
journals generated


Generating Papers: 100%|██████████| 5000/5000 [00:00<00:00, 12529.95paper/s]


papers generated
citations generated
paper keywords generated


In [102]:
# Assign Reviewers
reviewed_paper = []
reviews = []
for paper in tqdm(papers, desc="Assigning Reviewers", unit="paper"):
    paper_id = paper['Id:ID']
    # Get authors for this paper
    authors_for_paper = set(relation[':START_ID'] for relation in written_by if relation[':END_ID'] == paper_id)
    
    # Get the journal or conference this paper is published in
    paper_journal = next((pub[':END_ID'] for pub in published_in if pub[':START_ID'] == paper_id and pub[':END_ID'] in journal_author_map), None)
    paper_conference = next((pub[':END_ID'] for pub in published_in if pub[':START_ID'] == paper_id and pub[':END_ID'] in conference_author_map), None)
    
    eligible_reviewers = set()
    if paper_journal:
        eligible_reviewers = journal_author_map[paper_journal] - authors_for_paper
    elif paper_conference:
        eligible_reviewers = conference_author_map[paper_conference] - authors_for_paper

    if not eligible_reviewers:
        print("no eligible reviewers")
        continue

    # Convert eligible reviewers to a list
    eligible_reviewers = list(eligible_reviewers)

    # Assign 3 reviewers to each paper
    reviewers = random.sample(eligible_reviewers, min(3, len(eligible_reviewers)))
    for reviewer_id in reviewers:
        
        # Assign reviewer to paper
        reviewed_paper.append({
            ':START_ID': reviewer_id,
            ':END_ID': paper_id
        })

print("reviews generated")

Assigning Reviewers: 100%|██████████| 5000/5000 [00:05<00:00, 951.68paper/s] 

reviews generated





In [103]:
def output_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for item in data:
            writer.writerow(item)

# Output nodes and relationships to CSV
output_to_csv(authors, 'data/authors.csv')
output_to_csv(keywords, 'data/keywords.csv')
output_to_csv(conference_workshop, 'data/conference_workshop.csv')
output_to_csv(journals, 'data/journals.csv')
output_to_csv(papers, 'data/papers.csv')
output_to_csv(written_by, 'data/r_written_by.csv')
output_to_csv(corresponding_author, 'data/r_corresponding_author.csv')
output_to_csv(published_in, 'data/r_published_in.csv')
output_to_csv(cites, 'data/r_cites.csv')
output_to_csv(has_keyword, 'data/r_has_keyword.csv')
output_to_csv(reviewed_paper, 'data/r_reviewed_paper.csv')

print("CSV files generated.")

CSV files generated.
