### Importing the necessary libraries, modules and files

In [1]:
import json
import pandas as pd
import datetime
import random
# Seed Python's `random` module before any of the random values below are drawn.
random.seed(10)

In [2]:
# Add "Database" to a paper's fieldsOfStudy when its title contains the word
# "database" (case-insensitive), then save the result as data1/dataset2.json.
with open(file = 'data1/dataset.json', encoding = 'utf-8') as inputfile:
    df = json.load(inputfile)

# NOTE: df2 is the same list object as df (no copy is made), so the tagging
# below edits the freshly loaded records in place.
df2 = df
for record in df2:
    details = record['_data']
    paperTitle = details['title'] or ''  # tolerate a null title instead of crashing on .lower()
    if 'database' in paperTitle.lower():
        if details['fieldsOfStudy']:
            # Do not add the tag a second time if the paper already carries it
            if 'Database' not in details['fieldsOfStudy']:
                details['fieldsOfStudy'].append('Database')
        else:
            # fieldsOfStudy is null or empty: start a new list
            details['fieldsOfStudy'] = ['Database']

# Open the output only once the data is ready, so a failure during the tagging
# above cannot leave behind an empty, truncated dataset2.json.
with open(file = 'data1/dataset2.json', mode = 'w', encoding = 'utf-8') as outputfile:
    json.dump(df2, outputfile)

In [3]:
# Read the augmented dataset (dataset2.json) back into `df`.
with open('data1/dataset2.json', encoding='utf-8') as datasetFile:
    df = json.loads(datasetFile.read())

In [4]:
# Collect the distinct conferences, journals, authors and domains found in the
# dataset. Each name list has a parallel "Urls" list holding the source ids.
conferences = []
conferenceUrls = []

journals = []
journalUrls = []

authors = []
authorUrls = []

domains = []

# Companion sets give constant-time "already seen?" checks; the lists above
# keep the first-seen order.
seenConferences = set()
seenJournals = set()
seenAuthorUrls = set()
seenDomains = set()

for record in df:
    details = record['_data']

    venue = details['publicationVenue']
    if venue and 'type' in venue:
        # Venues are keyed by name, matching how papers look up their venue later.
        if venue['type'] == 'journal':
            if venue['name'] not in seenJournals:
                seenJournals.add(venue['name'])
                journals.append(venue['name'])
                journalUrls.append(venue['id'])
        elif venue['type'] == 'conference':
            if venue['name'] not in seenConferences:
                seenConferences.add(venue['name'])
                conferences.append(venue['name'])
                conferenceUrls.append(venue['id'])

    for author in details['authors'] or []:
        # Authors are keyed by their id string, not by name: the author-paper
        # relation is later joined on authorUrl, so de-duplicating by name would
        # drop every paper link of a second author who shares a name.
        # NOTE(review): a null authorId becomes the string 'None', so all such
        # authors share one entry - confirm whether the dataset has null ids.
        authorUrl = str(author['authorId'])
        if authorUrl not in seenAuthorUrls:
            seenAuthorUrls.add(authorUrl)
            authors.append(author['name'])
            authorUrls.append(authorUrl)

    for domain in details['fieldsOfStudy'] or []:
        if domain not in seenDomains:
            seenDomains.add(domain)
            domains.append(domain)

#### Proceedings

In [5]:
# Synthesize one proceedings record per conference and save it.
conferenceCount = len(conferences)

# Proceedings ids: 'cp' stands for conferenceProceedings
ids = ['cp' + str(n) for n in range(conferenceCount)]

# Proceedings names: 'proceeding' followed by the running number
proceedingChoices = ['proceeding' + str(n) for n in range(conferenceCount)]

# Proceedings year: a random year from 2001 to 2024 (both inclusive)
proceedingYears = [random.randint(2001, 2024) for _ in range(conferenceCount)]

proceedings_df = pd.DataFrame(ids, columns = ['proceedingId']).assign(
    proceedingName = proceedingChoices,
    proceedingYear = proceedingYears,
)
proceedings_df.to_csv('data1/proceedings.csv', index = False)

#### Conferences

In [6]:
# Conference ids ('c0', 'c1', ...) that can be added to the URIs in the graph
ids = ['c' + str(position) for position, _ in enumerate(conferences)]
proceedings = []

conferences_df = pd.DataFrame(data = ids, columns = ['conferenceId'])

In [7]:
# Conference subclasses: each conference is randomly labelled as a workshop
# (weight 0.25) or a main conference (weight 0.75).
cTypes = ['workshop','mainConference']
conferenceTypes = random.choices(
    cTypes,
    weights = [0.25, 0.75],
    k = len(conferences_df),
)

In [8]:
# Attach url, title, type and the matching proceedings id to every conference.
# The proceedings ids are a Series, so they are aligned to the conferences by row index.
conferences_df = conferences_df.assign(
    conferenceUrl = conferenceUrls,
    conferenceTitle = conferences,
    conferenceType = conferenceTypes,
    conferenceProceedingIds = proceedings_df['proceedingId'],
)
conferences_df.to_csv('data1/conferences.csv', index = False)

#### ConferenceProceedings

In [9]:
# Join every conference with its proceedings record; the right-hand key column
# duplicates conferenceProceedingIds, so it is dropped after the join.
conferenceProceedings_df = pd.merge(
    conferences_df,
    proceedings_df,
    how = 'inner',
    left_on = 'conferenceProceedingIds',
    right_on = 'proceedingId',
).drop(columns = 'proceedingId')
conferenceProceedings_df.to_csv('data1/conferenceProceedings.csv', index = False)

#### Volumes

In [10]:
# Synthesize one volume record per journal and save it.
journalCount = len(journals)

# Volume ids: 'jv' stands for journalVolume
ids = ['jv' + str(n) for n in range(journalCount)]

# Volume attributes: a name and a random year from 2001 to 2024 (both inclusive)
# NOTE(review): volume names use the prefix 'journal' - confirm this is intended.
volumeChoices = ['journal' + str(n) for n in range(journalCount)]
volumeYears = [random.randint(2001, 2024) for _ in range(journalCount)]

volumes_df = pd.DataFrame(ids, columns = ['volumeId']).assign(
    volumeName = volumeChoices,
    volumeYear = volumeYears,
)
volumes_df.to_csv('data1/volumes.csv', index = False)

#### Journals

In [11]:
# Journal ids ('j0', 'j1', ...) that can be added to the URIs in the graph
ids = ['j' + str(position) for position, _ in enumerate(journals)]
volumes = []

# Attach url, title and the matching volume id to every journal.
# The volume ids are a Series, so they are aligned to the journals by row index.
journals_df = pd.DataFrame(ids, columns = ['journalId']).assign(
    journalUrl = journalUrls,
    journalTitle = journals,
    journalVolumeIds = volumes_df['volumeId'],
)
journals_df.to_csv('data1/journals.csv', index = False)

#### JournalVolumes

In [12]:
# Join every journal with its volume record; the right-hand key column
# duplicates journalVolumeIds, so it is dropped after the join.
journalVolumes_df = pd.merge(
    journals_df,
    volumes_df,
    how = 'inner',
    left_on = 'journalVolumeIds',
    right_on = 'volumeId',
).drop(columns = 'volumeId')
journalVolumes_df.to_csv('data1/journalVolumes.csv', index = False)

#### Authors

In [13]:
# Author ids ('a0', 'a1', ...) that can be added to the URIs in the graph
ids = ['a' + str(position) for position, _ in enumerate(authors)]

authors_df = pd.DataFrame(ids, columns = ['authorId']).assign(
    authorUrl = authorUrls,
    authorName = authors,
)
authors_df.to_csv('data1/authors.csv', index = False)

#### Domain

In [14]:
# Domain ids ('d0', 'd1', ...) that can be added to the URIs in the graph
ids = ['d' + str(position) for position, _ in enumerate(domains)]

domain_df = pd.DataFrame(ids, columns = ['domainId']).assign(domainName = domains)
domain_df.to_csv('data1/domains.csv', index = False)

### Collecting information about papers

In [15]:
# Build one row per paper:
#   [generated paper id, source paperId, venue id, venue title, title, abstract]
# Papers whose venue is missing, has no 'type', or is neither a journal nor a
# conference get the string 'None' for both venue fields.

# Title -> id lookups, built once instead of filtering the venue DataFrames for
# every paper. Venue titles are unique because the venue lists were
# de-duplicated by name when they were collected.
journalIdByTitle = dict(zip(journals_df['journalTitle'], journals_df['journalId']))
conferenceIdByTitle = dict(zip(conferences_df['conferenceTitle'], conferences_df['conferenceId']))

papers = []

for paperUrlNum, record in enumerate(df):
    details = record['_data']

    venue = details['publicationVenue']
    venueType = venue['type'] if venue and 'type' in venue else None

    if venueType == 'journal':
        venueTitle = venue['name']
        venueId = journalIdByTitle[venueTitle]
    elif venueType == 'conference':
        venueTitle = venue['name']
        venueId = conferenceIdByTitle[venueTitle]
    else:
        venueId = 'None'     # conferenceJournalId
        venueTitle = 'None'  # conferenceJournalTitle

    papers.append([
        'p' + str(paperUrlNum),
        details['paperId'],
        venueId,
        venueTitle,
        details['title'],
        details['abstract'],
    ])

In [16]:
# Column labels, in the same order as the values stored in each paper row
paperColumns = [
    'paperId',
    'paperUrl',
    'conferenceJournalId',
    'conferenceJournalTitle',
    'paperTitle',
    'paperAbstract',
]

papers_df = pd.DataFrame(data = papers, columns = paperColumns)

In [17]:
# Paper subclasses:
#   - papers without a conference/journal (venue id 'None') become posters
#   - every other paper gets a type drawn at random from pTypes
pTypes = ['fullPaper']
paperTypes = [
    'poster' if venueId == 'None' else random.choice(pTypes)
    for venueId in papers_df['conferenceJournalId']
]
papers_df['paperType'] = paperTypes

In [18]:
def _pickConferenceIdForPoster(row):
    """Return a randomly chosen conference id for a poster row, else the row's own venue id."""
    if row['paperType'] != 'poster':
        return row['conferenceJournalId']
    return random.choice(conferences_df.conferenceId)


def _conferenceTitleForPoster(row):
    """Return the title of the conference assigned to a poster row, else the row's own venue title."""
    if row['paperType'] != 'poster':
        return row['conferenceJournalTitle']
    assignedConference = conferences_df[conferences_df['conferenceId'] == row['conferenceJournalId']]
    return assignedConference['conferenceTitle'].item()


# Posters have no venue of their own: give each one a random conference first,
# then copy that conference's title.
papers_df['conferenceJournalId'] = papers_df.apply(_pickConferenceIdForPoster, axis = 1)
papers_df['conferenceJournalTitle'] = papers_df.apply(_conferenceTitleForPoster, axis = 1)

In [19]:
# Look up, for every paper, the proceedings id (conference venue) or the
# volume id (journal venue) of the venue it is assigned to.
procVols = []
for venueId in papers_df['conferenceJournalId']:
    isConference = venueId[0] == 'c'  # ids starting with 'c' are looked up as conferences
    if isConference:
        venueRow = conferences_df[conferences_df['conferenceId'] == venueId]
        procVols.append(venueRow['conferenceProceedingIds'].item())
    else:
        venueRow = journals_df[journals_df['journalId'] == venueId]
        procVols.append(venueRow['journalVolumeIds'].item())
papers_df['proceedingsVolumesIds'] = procVols

Since some papers have a null abstract, we impute a default value of "Abstract content" for them.

In [20]:
# Impute missing abstracts with the placeholder text.
# fillna replaces both None and NaN, and works on the whole column at once
# instead of comparing each row to None inside a row-wise apply.
papers_df['paperAbstract'] = papers_df['paperAbstract'].fillna('Abstract content')

For papers of type poster, conferenceJournalId is imputed with a conference chosen at random from the list of conferences, and conferenceJournalTitle with that conference's title.

In [21]:
# Save the papers table (header row included, no index column)
papers_df.to_csv('data1/papers.csv', index = False)

### Creating papers domain relation

In [22]:
# Flatten the dataset into parallel lists: one (domain name, paper id) pair
# for every field of study listed on a paper.
domain_ids = []
pIds = []

for paperIndex, record in enumerate(df):
    fieldsOfStudy = record['_data']['fieldsOfStudy']
    if not fieldsOfStudy:
        continue  # paper has no fields of study
    paperKey = 'p' + str(paperIndex)
    for fieldName in fieldsOfStudy:
        domain_ids.append(fieldName)
        pIds.append(paperKey)

domainsPapers_df = pd.DataFrame(domain_ids, columns = ['domainName'])
domainsPapers_df['paperId'] = pIds

In [23]:
# Enrich each (domain name, paper id) pair with the domain's id and with the
# paper's attributes, then save the result.
domainsPapers_df = (
    domainsPapers_df
    .merge(domain_df, how = 'inner', on = 'domainName')
    .merge(papers_df, how = 'inner', on = 'paperId')
)
domainsPapers_df.to_csv('data1/domainsPapers.csv', index = False)

In [24]:
# Keep only the venue id, proceedings/volume id and domain id of every pair
venueDomainColumns = ['conferenceJournalId', 'proceedingsVolumesIds', 'domainId']
domainsConferencesJournals_df = domainsPapers_df.loc[:, venueDomainColumns]
domainsConferencesJournals_df.to_csv('data1/domainsProceedingsVolumes.csv', index = False)

### Creating authors papers relation

In [25]:
# Flatten the dataset into parallel lists: one (author url id, paper id) pair
# for every author listed on a paper.
aUrlIds = []
pIds = []

for paperIndex, record in enumerate(df):
    paperAuthorsList = record['_data']['authors']
    if not paperAuthorsList:
        continue  # paper has no authors
    paperKey = 'p' + str(paperIndex)
    for author in paperAuthorsList:
        aUrlIds.append(str(author['authorId']))
        pIds.append(paperKey)

authorsPapers_df = pd.DataFrame(aUrlIds, columns = ['authorUrl'])
authorsPapers_df['paperId'] = pIds

In [26]:
# Enrich each (author url id, paper id) pair with the author's attributes and
# with the paper's attributes, then save the result.
authorsPapers_df = (
    authorsPapers_df
    .merge(authors_df, how = 'inner', on = 'authorUrl')
    .merge(papers_df, how = 'inner', on = 'paperId')
)
authorsPapers_df.to_csv('data1/authorsPapers.csv', index = False)

### Creating Reviews Data

In [27]:
# One synthetic review per paper; review ids ('r0', 'r1', ...) can be added to
# the URIs of the graph.
ids = ['r' + str(position) for position in range(len(papers_df))]

decisionChoices = ['accepted','rejected']

reviews_df = pd.DataFrame(ids, columns = ['reviewId'])
reviews_df['paperId'] = papers_df['paperId']
reviews_df['reviewText'] = ['Review content' for _ in range(len(reviews_df))]
# Decisions are drawn at random with weights 0.8 (accepted) and 0.2 (rejected)
reviews_df['reviewDecision'] = random.choices(
    decisionChoices,
    weights = [0.8, 0.2],
    k = len(papers_df),
)
# 1 for an accepted paper, 0 otherwise
reviews_df['reviewDecisionBoolean'] = [
    1 if decision == 'accepted' else 0
    for decision in reviews_df['reviewDecision']
]
reviews_df['paperTitle'] = papers_df['paperTitle']
reviews_df.to_csv('data1/reviews.csv', index = False)

### Creating Reviewer Data

In [28]:
# Every distinct author id that appears in the author-paper relation
all_authors = authorsPapers_df['authorId'].unique().tolist()

# One row per paper, holding the list of that paper's author ids
paperAuthors = (
    authorsPapers_df
    .groupby('paperId', as_index = False, sort = False)['authorId']
    .agg(list)
)

In [29]:
# Assign two randomly chosen reviewers to every paper. A reviewer must not be
# one of the paper's own authors.
totalPapers = len(papers_df)
set_authors = set(all_authors)

# paperId -> set of that paper's author ids. paperAuthors['authorId'] holds one
# list of ids per paper, so it has to be unpacked: testing an id against the
# column's values directly compares a string with a list and never matches,
# which would leave the paper's own authors eligible as reviewers.
authorsByPaper = {
    pId: set(aIds)
    for pId, aIds in zip(paperAuthors['paperId'], paperAuthors['authorId'])
}

reviewersPerPaper = 2

ids = []
submissions = []
reviewers = []

for curr_pId in papers_df['paperId']:
    # Papers without any recorded author have nobody to exclude
    curr_authors = authorsByPaper.get(curr_pId, set())

    # Iterate the all_authors list, not the set: the iteration order of a set
    # of strings can differ between interpreter runs, which would make the
    # seeded random.choice below pick different reviewers on every run.
    availableReviewers = [x for x in all_authors if x not in curr_authors]

    # NOTE(review): the draws are independent, so the same author can be
    # picked twice for one paper - confirm whether that is acceptable.
    for _ in range(reviewersPerPaper):
        ids.append('r' + str(len(ids)))
        submissions.append(curr_pId)
        reviewers.append(random.choice(availableReviewers))

In [30]:
# Each row states that the reviewer (authorId) reviews the submission (paperId)
reviewers_df = pd.DataFrame(ids, columns = ['rId']).assign(
    paperId = submissions,
    authorId = reviewers,
)

In [31]:
# Enrich each reviewer assignment with the reviewer's author attributes and
# with the attributes of the reviewed paper.
reviewers_df = (
    reviewers_df
    .merge(authors_df, how = 'inner', on = 'authorId')
    .merge(papers_df, how = 'inner', on = 'paperId')
)

In [32]:
# Save the reviewer assignments (header row included, no index column)
reviewers_df.to_csv('data1/reviewers.csv', index = False)

### Creating chairs data

In [33]:
# Conference id of every paper whose venue id starts with 'c'
# (one entry per paper, so a conference can appear several times)
isConferencePaper = papers_df['conferenceJournalId'].str.slice(stop = 1).eq('c')
confPapers = papers_df.loc[isConferencePaper, 'conferenceJournalId'].tolist()

confPapers_df = pd.DataFrame(confPapers, columns = ['conferenceId'])

# Chair ids: 'chair0', 'chair1', ...
ids = ['chair' + str(position) for position in range(len(confPapers_df))]
confPapers_df['chairId'] = ids

# Each chair is an author drawn at random (with replacement) from all authors
confPapers_df['authorId'] = random.choices(
    authors_df['authorId'].tolist(),
    k = len(confPapers_df),
)

It is assumed that any author from our database can chair any conference, irrespective of the number of papers written, etc.

In [34]:
# Save the chair assignments (header row included, no index column)
confPapers_df.to_csv('data1/chairs.csv', index = False)

This implies that the author with id x chairs (i.e. handles) the conference y, under the chair id c.

### Creating editors data

In [35]:
# Journal id of every paper whose venue id starts with 'j'
# (one entry per paper, so a journal can appear several times)
isJournalPaper = papers_df['conferenceJournalId'].str.slice(stop = 1).eq('j')
jourPapers = papers_df.loc[isJournalPaper, 'conferenceJournalId'].tolist()

jourPapers_df = pd.DataFrame(jourPapers, columns = ['journalId'])

# Editor ids: 'editor0', 'editor1', ...
ids = ['editor' + str(position) for position in range(len(jourPapers_df))]
jourPapers_df['editorId'] = ids

# Each editor is an author drawn at random (with replacement) from all authors
jourPapers_df['authorId'] = random.choices(
    authors_df['authorId'].tolist(),
    k = len(jourPapers_df),
)
jourPapers_df.to_csv('data1/editors.csv', index = False)

It is assumed that any author from our database can be the editor of any journal, irrespective of the number of papers written, etc.

This implies that the author with id x is the editor of (i.e. handles) the journal y, under the editor id e.

In [36]:
# Save the editor assignments (header row included, no index column)
jourPapers_df.to_csv('data1/editors.csv', index = False)