# Preprocessing Data for SDM Lab-2 Knowledge Graphs

In [6]:
import pandas as pd

### Author

In [7]:
def modify_csv(file_path, output_path):
    """Add sequential author identifiers to an authors CSV.

    Reads ``file_path``, prepends an ``authorID`` column holding ``a0``,
    ``a1``, ... in row order, renames the ``ID`` column to ``authorRef``
    and writes the table to ``output_path`` without the pandas index.
    """
    authors = pd.read_csv(file_path)
    author_ids = [f'a{row}' for row in range(len(authors))]
    authors.insert(0, 'authorID', author_ids)
    authors = authors.rename(columns={'ID': 'authorRef'})
    authors.to_csv(output_path, index=False)

# Run the author step: read the raw authors file and write the version with
# generated authorID values.
input_author_path = 'authors.csv'
output_author_path = 'authors1.csv'

modify_csv(input_author_path,output_author_path)

### Conferences

In [15]:
def modify_dataset(input_file_path, output_file_path):
    """Add conference and proceeding identifiers to a conference CSV.

    The first data row of the input is discarded. A ``conferenceID``
    column (``c0``, ``c1``, ...) is placed first, ``ID`` is renamed to
    ``conferenceRef`` and a ``conProIds`` column (``cp0``, ``cp1``, ...)
    is placed directly after ``edition``. The result is written to
    ``output_file_path`` without the pandas index.
    """
    # NOTE(review): why the first data row is dropped is not visible here.
    conferences = pd.read_csv(input_file_path).iloc[1:].reset_index(drop=True)
    row_count = len(conferences)

    conferences.insert(0, 'conferenceID', [f'c{n}' for n in range(row_count)])
    conferences = conferences.rename(columns={'ID': 'conferenceRef'})

    after_edition = conferences.columns.get_loc('edition') + 1
    conferences.insert(after_edition, 'conProIds', [f'cp{n}' for n in range(row_count)])
    conferences.to_csv(output_file_path, index=False)

# Run the conference step on the raw conference export.
input_file_path = 'conference_semantic-2.csv'
output_file_path = 'conference.csv'
modify_dataset(input_file_path, output_file_path)

### Proceedings

In [16]:
def modify_proceeding(input_file_path, output_file_path):
    """Build the proceedings table from a raw proceedings CSV.

    The first data row is discarded and only ``year`` and ``name`` are
    kept. ``proceedingID`` (``cp0``, ...) and ``proceedingTitle``
    (``proceeding0``, ...) are inserted between them, giving the column
    order ``year, proceedingID, proceedingTitle, name``.
    """
    raw = pd.read_csv(input_file_path)
    proceedings = raw.iloc[1:].reset_index(drop=True)[['year', 'name']]
    numbering = range(len(proceedings))

    # Both new columns go in front of 'name', in ID-then-title order.
    name_pos = proceedings.columns.get_loc('name')
    proceedings.insert(name_pos, 'proceedingID', [f'cp{k}' for k in numbering])
    proceedings.insert(name_pos + 1, 'proceedingTitle', [f'proceeding{k}' for k in numbering])
    proceedings.to_csv(output_file_path, index=False)
    
# Run the proceedings step on the raw proceedings export.
input_file_path = 'proceedings0.csv'
output_file_path = 'proceeding.csv'
modify_proceeding(input_file_path, output_file_path)

### Conference-Proceedings

In [19]:
def conference_proceeding(input_file_path, output_file_path):
    """Attach a generated ``proceedingTitle`` column to a CSV.

    Each row receives ``proceeding<row number>``; the table is then
    written to ``output_file_path`` without the pandas index.
    """
    table = pd.read_csv(input_file_path)
    titles = [f'proceeding{k}' for k in range(len(table))]
    table = table.assign(proceedingTitle=titles)
    table.to_csv(output_file_path, index=False)
    
# Run the conference-proceedings step.
input_file_path = 'confPro-semantic.csv'
output_file_path = 'conferenceProceedings.csv'
conference_proceeding(input_file_path, output_file_path)

### Journal

In [22]:
def journal(input_file_path, output_file_path):
    """Build the journal table from a paper-published-in-journal CSV.

    Drops ``END_ID``, renames ``START_ID`` to ``journalRef``, puts a
    generated ``journalID`` column (``j0``, ...) first and a generated
    ``journalVID`` column (``jv0``, ...) directly after ``year``.
    """
    journals = (
        pd.read_csv(input_file_path)
        .drop(columns='END_ID')
        .rename(columns={'START_ID': 'journalRef'})
    )
    total = len(journals)

    journals.insert(0, 'journalID', [f'j{k}' for k in range(total)])
    after_year = journals.columns.get_loc('year') + 1
    journals.insert(after_year, 'journalVID', [f'jv{k}' for k in range(total)])
    journals.to_csv(output_file_path, index=False)

# Run the journal step on the paper-published-in-journal export.
input_file_path = 'paper_published_in_journal-2.csv'
output_file_path = 'journal.csv'
journal(input_file_path, output_file_path)

### Journal-Volume

In [29]:
def journal_volumes(input_file_path, output_file_path):
    """Replace the ``volume`` column with a generated ``volumeName``.

    ``volume`` is dropped and ``volumeName`` (``journal0``, ``journal1``,
    ...) is inserted directly after ``year``.
    """
    volumes_table = pd.read_csv(input_file_path).drop(columns='volume')
    slot = volumes_table.columns.get_loc('year') + 1
    labels = [f'journal{k}' for k in range(len(volumes_table))]
    volumes_table.insert(slot, 'volumeName', labels)
    volumes_table.to_csv(output_file_path, index=False)

# Run the journal-volume step.
input_file_path = 'journalVolumes-semantic.csv'
output_file_path = 'journal_volumes.csv'
journal_volumes(input_file_path, output_file_path)

### Volumes

In [35]:
def volumes(input_file_path, output_file_path):
    """Derive the volumes table from the journal-volumes CSV.

    Removes the ``journalID``, ``journalRef`` and ``journal`` columns and
    renames ``journalVID`` to ``volumeID``.
    """
    frame = pd.read_csv(input_file_path)
    trimmed = frame.drop(columns=['journalID', 'journalRef', 'journal'])
    trimmed = trimmed.rename(columns={'journalVID': 'volumeID'})
    trimmed.to_csv(output_file_path, index=False)

# Run the volumes step on the file written by the journal-volume step.
input_file_path = 'journal_volumes.csv'
output_file_path = 'volumes.csv'
volumes(input_file_path, output_file_path)

## Chair

In [36]:
import os

In [43]:
def chair(conference_path, authors_path, output_path):
    """Pair conferences with authors to create a chair table.

    Both inputs are sampled without replacement down to the size of the
    smaller one (``random_state=1`` for each) and paired row by row.
    The output has the columns ``conferenceID``, ``authorID`` and
    ``chairID`` (``chair0``, ``chair1``, ...).

    Raises:
        ValueError: if ``conferenceID`` or ``authorID`` is missing from
            the respective input file.
    """
    conference_df = pd.read_csv(conference_path)
    authors_df = pd.read_csv(authors_path)

    required = (
        (conference_df, 'conferenceID',
         "The conference.csv file does not contain the 'conferenceID' column."),
        (authors_df, 'authorID',
         "The authors1.csv file does not contain the 'authorID' column."),
    )
    for frame, column, message in required:
        if column not in frame.columns:
            raise ValueError(message)

    pair_count = min(len(conference_df), len(authors_df))

    def draw(series):
        # Fixed seed keeps the pairing reproducible between runs.
        return series.sample(pair_count, random_state=1).reset_index(drop=True)

    assignments = pd.DataFrame({
        'conferenceID': draw(conference_df['conferenceID']),
        'authorID': draw(authors_df['authorID']),
        'chairID': [f'chair{k}' for k in range(pair_count)],
    })
    assignments.to_csv(output_path, index=False)

# Input/output locations for the chair assignment step.
conference_csv_path = 'conference.csv'
authors_csv_path = 'authors1.csv'
output_csv_path = 'chair.csv'

# Create the output directory only when the output path has a directory
# component; for a bare file name os.path.dirname returns '' and nothing
# is created.
output_dir = os.path.dirname(output_csv_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

chair(conference_csv_path, authors_csv_path, output_csv_path)

## Editor

In [47]:
def editor(journal_path, authors_path, output_path):
    """Pair journals with authors to create an editor table.

    Both inputs are sampled without replacement down to the size of the
    smaller one (``random_state=1`` for each) and paired row by row.
    The output has the columns ``journalID``, ``authorID`` and
    ``editorID`` (``editor0``, ``editor1``, ...).

    Raises:
        ValueError: if ``journalID`` or ``authorID`` is missing from the
            respective input file.
    """
    journal_df = pd.read_csv(journal_path)
    authors_df = pd.read_csv(authors_path)

    if 'journalID' not in journal_df.columns:
        raise ValueError("The journal.csv file does not contain the 'journalID' column.")
    if 'authorID' not in authors_df.columns:
        raise ValueError("The authors1.csv file does not contain the 'authorID' column.")

    min_length = min(len(journal_df), len(authors_df))
    journal_ids = journal_df['journalID'].sample(min_length, random_state=1).reset_index(drop=True)
    author_ids = authors_df['authorID'].sample(min_length, random_state=1).reset_index(drop=True)

    new_df = pd.DataFrame({
        'journalID': journal_ids,
        'authorID': author_ids,
        # Editor identifiers get their own prefix so they cannot collide
        # with the 'chair<i>' identifiers written to chair.csv.
        'editorID': [f'editor{i}' for i in range(min_length)],
    })

    new_df.to_csv(output_path, index=False)

# Input/output locations for the editor assignment step.
journal_csv_path = 'journal.csv'
authors_csv_path = 'authors1.csv'
output_csv_path = 'editor.csv'

# Create the output directory only when the output path has a directory
# component; for a bare file name os.path.dirname returns ''.
output_dir = os.path.dirname(output_csv_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

editor(journal_csv_path, authors_csv_path, output_csv_path)

## Papers

In [49]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def drop_columns(dataframe, columns):
    """Return a copy of ``dataframe`` without the given ``columns``."""
    return dataframe.drop(labels=columns, axis='columns')

def extract_combine_ids(conference_df, journal_df,):
    """Interleave conference and journal identifiers into one column.

    The ``conferenceID`` and ``journalID`` columns are stacked and then
    stably sorted by their original row labels, so rows that had the
    same label in the two inputs end up next to each other (conference
    first). Returns a single-column DataFrame named ``conJourID``.
    """
    stacked = pd.concat([conference_df['conferenceID'], journal_df['journalID']])
    interleaved = stacked.sort_index(kind='merge')
    return pd.DataFrame({'conJourID': interleaved.reset_index(drop=True)})

def save_data(dataframe, file_path):
    """Write ``dataframe`` to ``file_path`` as CSV without the index."""
    dataframe.to_csv(path_or_buf=file_path, index=False)
    
def main():
    """Write ``papers.csv``: trimmed papers plus interleaved venue IDs.

    Loads the papers, conference and journal CSVs, removes the
    ``pages``, ``doi`` and ``link`` columns from the papers and appends
    the ``conJourID`` column column-wise before saving.
    """
    papers_table = load_data('papers_semantic.csv')
    conference_table = load_data('conference.csv')
    journal_table = load_data('journal.csv')

    trimmed_papers = drop_columns(papers_table, ['pages', 'doi', 'link'])
    venue_ids = extract_combine_ids(conference_table, journal_table)

    # Column-wise concat aligns on row labels; a shorter side is padded
    # with missing values.
    save_data(pd.concat([trimmed_papers, venue_ids], axis=1), 'papers.csv')

# Run the papers.csv step only when this code is executed directly (not
# when it is imported as a module).
if __name__ == "__main__":
    main()

In [52]:
import numpy as np

In [73]:
def combine_paper_data(papers_file, conference_file, journal_file, output_file):
    """Assign a random conference or journal to every paper.

    The papers CSV loses its ``pages``, ``doi`` and ``link`` columns.
    Conference (``conferenceID``/``name``) and journal
    (``journalID``/``journal``) rows are pooled as (ID, title) pairs,
    shuffled with NumPy's global random state and handed out to the
    papers in order. A generated ``paperID`` column (``p0``, ``p1``, ...)
    is placed first and the result is written to ``output_file``.

    Each ID stays with its own title. If there are more papers than
    venues, the surplus papers get missing values in both venue columns.

    If an input file is missing, empty or unparsable, the error is
    printed and the function returns ``None`` without writing anything.
    """
    try:
        papers_semantic_df = pd.read_csv(papers_file)
        conference_df = pd.read_csv(conference_file)
        journal_df = pd.read_csv(journal_file)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error: {e}")
        return

    papers_semantic_df = (
        papers_semantic_df.drop(columns=["pages", "doi", "link"]).reset_index(drop=True)
    )

    # Bring both venue kinds to a common two-column layout so that an ID
    # and its title travel together through the shuffle.
    venue_columns = ["confjorID", "confjorTitle"]
    conference_venues = conference_df[["conferenceID", "name"]].copy()
    conference_venues.columns = venue_columns
    journal_venues = journal_df[["journalID", "journal"]].copy()
    journal_venues.columns = venue_columns
    venues = pd.concat([conference_venues, journal_venues], ignore_index=True)

    # One permutation applied to whole rows keeps ID/title pairs intact.
    order = np.random.permutation(len(venues))[:len(papers_semantic_df)]
    assigned_venues = venues.iloc[order].reset_index(drop=True)

    combined_df = pd.concat([papers_semantic_df, assigned_venues], axis=1)
    combined_df.insert(0, "paperID", [f"p{i}" for i in range(len(combined_df))])
    combined_df.to_csv(output_file, index=False)

# Run the paper/venue combination step; its output feeds the later
# author-paper, publications and topic-paper steps.
papers_file = 'papers_semantic.csv'
conference_file = 'conference.csv'
journal_file = 'journal.csv'
output_file = 'papers_combined.csv'

combine_paper_data(papers_file, conference_file, journal_file, output_file)

## Author-Paper

In [76]:
# Inputs for the author-paper step: authors with generated IDs and the
# papers with their assigned venue.
authors_df = pd.read_csv('authors1.csv')
papers_combined_df = pd.read_csv('papers_combined.csv')

def distribute_paper_data_evenly(paper_ids_df, chunk_size=100):
    """Repeat the first paper of every chunk across that chunk's rows.

    The rows of ``paper_ids_df`` are walked in chunks of ``chunk_size``.
    Every position inside a chunk receives the values of the chunk's
    first row, so the result has as many entries as the input has rows.

    Returns:
        A 6-tuple of lists, in this order: ``paperID``, ``ID``,
        ``title``, ``abstract``, ``confjorID``, ``confjorTitle``.
    """
    fields = ('paperID', 'ID', 'title', 'abstract', 'confjorID', 'confjorTitle')
    columns = [paper_ids_df[name].tolist() for name in fields]
    total = len(columns[0])

    # Source row for every output position: the chunk start, repeated once
    # per row of the chunk (the last chunk may be shorter).
    picks = [
        start
        for start in range(0, total, chunk_size)
        for _ in range(min(chunk_size, total - start))
    ]

    return tuple([values[k] for k in picks] for values in columns)

# Shuffle the papers reproducibly, then build the per-row paper assignment.
shuffled_paper_id_df = papers_combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
(arranged_paper_ids, arranged_ids, arranged_titles, arranged_abstracts,
 arranged_confjor_ids, arranged_confjor_titles) = distribute_paper_data_evenly(shuffled_paper_id_df)

num_authors = len(authors_df)
if num_authors and not arranged_paper_ids:
    # Repeating an empty list can never produce num_authors entries.
    raise ValueError("papers_combined.csv has no rows to assign to the authors in authors1.csv")


def _cycle_to_length(values, length):
    """Repeat ``values`` cyclically and cut the result to ``length`` items."""
    if not values:
        return []
    repeats = -(-length // len(values))  # ceiling division
    return (values * repeats)[:length]


# Stretch (by cycling) or cut every column to exactly one entry per author.
arranged_paper_ids = _cycle_to_length(arranged_paper_ids, num_authors)
arranged_ids = _cycle_to_length(arranged_ids, num_authors)
arranged_titles = _cycle_to_length(arranged_titles, num_authors)
arranged_abstracts = _cycle_to_length(arranged_abstracts, num_authors)
arranged_confjor_ids = _cycle_to_length(arranged_confjor_ids, num_authors)
arranged_confjor_titles = _cycle_to_length(arranged_confjor_titles, num_authors)

authors_with_chunks = authors_df.copy()
authors_with_chunks['paperID'] = arranged_paper_ids
authors_with_chunks['ID'] = arranged_ids
authors_with_chunks['title'] = arranged_titles
authors_with_chunks['abstract'] = arranged_abstracts
authors_with_chunks['confjorID'] = arranged_confjor_ids
authors_with_chunks['confjorTitle'] = arranged_confjor_titles

# NOTE(review): the Supervisor step in this file reads 'authorsPapers.csv'
# (no trailing underscore) - confirm which file name is intended.
authors_with_chunks.to_csv('authorsPapers_.csv', index=False)

## Publications

In [79]:
def publications(input_file_path, output_file_path):
    """Derive the publications table from the combined papers CSV.

    Removes the ``ID``, ``confjorID`` and ``confjorTitle`` columns and
    puts a generated ``publicationID`` column (``pub0``, ...) first.
    """
    papers = pd.read_csv(input_file_path)
    kept = papers.drop(columns=["ID", "confjorID", "confjorTitle"])
    kept.insert(0, 'publicationID', [f'pub{n}' for n in range(len(kept))])
    kept.to_csv(output_file_path, index=False)

# Run the publications step on the combined papers file.
input_file_path = 'papers_combined.csv'
output_file_path = 'publications.csv'

publications(input_file_path, output_file_path)

## Topic

In [85]:
import pandas as pd

def topic(input_file_path, output_file_path):
    """Build the topic table from a keywords CSV.

    Drops ``domain`` and ``ID``, puts a generated ``topicID`` column
    (``t0``, ``t1``, ...) first and renames ``name`` to ``topicName``.
    """
    keywords = pd.read_csv(input_file_path).drop(columns=["domain", "ID"])
    topic_ids = [f't{n}' for n in range(len(keywords))]
    keywords.insert(0, 'topicID', topic_ids)
    keywords.rename(columns={'name': 'topicName'}, inplace=True)
    keywords.to_csv(output_file_path, index=False)

# Run the topic step on the raw keywords export.
input_file_path = 'keywords_semantic-2.csv'
output_file_path = 'topic.csv'

topic(input_file_path, output_file_path)

## Topic-Paper

In [91]:
# Attach one randomly drawn topic to every paper and write topicsPapers.csv.
topic_df = pd.read_csv('topic.csv')
papers_combined_df = pd.read_csv('papers_combined.csv')

paper_count = len(papers_combined_df)
if paper_count > len(topic_df):
    raise ValueError("Not enough rows in topic.csv to match papers_combined.csv")

# Topics are drawn with replacement, so one topic may be used several times.
sampled_topic_df = topic_df.sample(n=paper_count, replace=True, random_state=42).reset_index(drop=True)
papers_combined_with_topics = pd.concat([papers_combined_df, sampled_topic_df], axis=1)

# Move the last two columns to the front, keeping the rest in order.
columns = papers_combined_with_topics.columns.tolist()
new_order = [*columns[-2:], *columns[:-2]]
reordered_df = papers_combined_with_topics[new_order]

output_file = 'topicsPapers.csv'
reordered_df.to_csv(output_file, index=False)

## Topics-Proceedings-Volumes

In [93]:
# Attach one randomly drawn topic to every conference row.
# NOTE: conference.csv is both the input and the output of this step, so
# running it a second time appends the topic columns again.
topic_df = pd.read_csv('topic.csv')
tpv_df = pd.read_csv('conference.csv')

if len(topic_df) < len(tpv_df):
    # The comparison is against conference.csv, so the message names it.
    raise ValueError("Not enough rows in topic.csv to match conference.csv")

# Topics are drawn with replacement, so one topic may be used several times.
sampled_topic_df = topic_df.sample(n=len(tpv_df), replace=True, random_state=42).reset_index(drop=True)
tpv_topics = pd.concat([tpv_df, sampled_topic_df], axis=1)

# Move the last two columns to the front, keeping the rest in order.
columns = tpv_topics.columns.tolist()
new_order = columns[-2:] + columns[:-2]
reordered_df = tpv_topics[new_order]

output_file = 'conference.csv'
reordered_df.to_csv(output_file, index=False)

In [102]:
# Build 'jcIDs': journalID on even rows, conferenceID on odd rows.
# NOTE(review): no step visible in this file adds a 'journalID' column to
# conference.csv - confirm where it comes from.
file_path = 'conference.csv'
df = pd.read_csv(file_path)

combined_ids = [
    df.loc[row, 'journalID' if row % 2 == 0 else 'conferenceID']
    for row in range(len(df))
]
df['jcIDs'] = combined_ids

df.to_csv('tpv.csv', index=False)

In [104]:
# Build 'vpIDs': journalVID on even rows, conProIds on odd rows.
file_path = 'tpv.csv'
df = pd.read_csv(file_path)

combined_ids = [
    df.loc[row, 'journalVID' if row % 2 == 0 else 'conProIds']
    for row in range(len(df))
]
df['vpIDs'] = combined_ids

df.to_csv('vp.csv', index=False)

In [106]:
# Remove the source ID columns now that jcIDs/vpIDs replace them.
source_id_columns = ["journalID", "conferenceID", "journalVID", "conProIds"]
df = pd.read_csv('vp.csv').drop(columns=source_id_columns)
df.to_csv('topicsProceedingsVolumes.csv', index=False)

## Reviews

In [107]:
# Give every publication a review decision: every fourth row (0, 4, 8, ...)
# is rejected, all others are accepted.
file_path = 'publications.csv'
df = pd.read_csv(file_path)

review_decision_boolean = [0 if row % 4 == 0 else 1 for row in range(len(df))]
review_decision = ['accepted' if flag else 'rejected' for flag in review_decision_boolean]

df['reviewDecision'] = review_decision
df['reviewDecisionBoolean'] = review_decision_boolean

df.to_csv('reviews.csv', index=False)

In [110]:
# Rewrite reviews.csv in place: drop the abstract and add a reviewID
# column (r0, r1, ...) in front.
df = pd.read_csv('reviews.csv').drop(columns='abstract')
review_ids = [f'r{row}' for row in range(len(df))]
df.insert(0, 'reviewID', review_ids)
df.to_csv('reviews.csv', index=False)

## Reviewers

In [113]:
# Attach one randomly drawn author to every reviewer row and rewrite
# reviewers.csv in place.
# Despite their names, topic_df / sampled_topic_df hold AUTHOR rows here;
# the names are kept unchanged for compatibility with the notebook.
topic_df = pd.read_csv('authors1.csv')
reviewers_df = pd.read_csv('reviewers.csv')

if len(topic_df) < len(reviewers_df):
    # The files compared are authors1.csv and reviewers.csv.
    raise ValueError("Not enough rows in authors1.csv to match reviewers.csv")

# Authors are drawn with replacement, so one author may appear several times.
sampled_topic_df = topic_df.sample(n=len(reviewers_df), replace=True, random_state=42).reset_index(drop=True)
reviewers_a = pd.concat([reviewers_df, sampled_topic_df], axis=1)

# Move the last two columns to the front, keeping the rest in order.
# NOTE(review): this brings exactly the author columns forward only if
# authors1.csv has two columns - confirm.
columns = reviewers_a.columns.tolist()
new_order = columns[-2:] + columns[:-2]
reordered_df = reviewers_a[new_order]

output_file = 'reviewers.csv'
reordered_df.to_csv(output_file, index=False)

In [116]:
# Randomly reassign the paper columns among the reviewer rows; the three
# columns move together, all other columns keep their row.
df = pd.read_csv('reviewers.csv')
shuffled_indices = np.random.permutation(df.index)

paper_columns = ['publicationID', 'paperID', 'title']
df_shuffled = df.copy()
# Plain arrays are assigned so pandas does not re-align the rows by label.
df_shuffled[paper_columns] = df[paper_columns].iloc[shuffled_indices].to_numpy()
df_shuffled.to_csv('reviewers.csv', index=False)

## Supervisor

In [122]:
# Assign a chair (and that chair's conference) to every author-paper row:
# rows are taken in groups of ten and the groups cycle through the chairs.
# NOTE(review): the author-paper step in this file writes
# 'authorsPapers_.csv' (with a trailing underscore) - confirm that
# 'authorsPapers.csv' is the intended input.
chair_file_path = 'chair.csv'
authors_papers_file_path = 'authorsPapers.csv'

df_chair = pd.read_csv(chair_file_path)
df_authors_papers = pd.read_csv(authors_papers_file_path)

num_chairs = len(df_chair)
chair_positions = [(row // 10) % num_chairs for row in range(len(df_authors_papers))]

conference_ids = [df_chair.loc[pos, 'conferenceID'] for pos in chair_positions]
chair_ids = [f'chair{pos}' for pos in chair_positions]

df_authors_papers['conferenceID'] = conference_ids
df_authors_papers['chairID'] = chair_ids

df_authors_papers.to_csv('supervisors.csv', index=False)