https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema

In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import pickle
import os

CONFERENCE_SERIES = "mag/ConferenceSeries.txt"
CONFERENCE_SERIES_HEADER = [
    "ConferenceSeriesId",
    "Rank",
    "NormalizedName",
    "DisplayName",
    "PaperCount",
    "CitationCount",
    "CreatedDate",
]
CONFERENCE_INSTANCES = "mag/ConferenceInstances.txt"
CONFERENCE_INSTANCES_HEADER = [
    "ConferenceInstanceId",
    "NormalizedName",
    "DisplayName",
    "ConferenceSeriesId",
    "Location",
    "OfficialUrl",
    "StartDate",
    "EndDate",
    "AbstractRegistrationDate",
    "SubmissionDeadlineDate",
    "NotificationDueDate",
    "FinalVersionDueDate",
    "PageCount",
    "CitationCount",
    "Latitude",
    "Longitude",
    "CreatedDate",
]
PAPERS = "mag/Papers.txt"
PAPERS_HEADER = [
    "PaperId",
    "Rank",
    "Doi",
    "DocType",
    "PaperTitle",
    "OriginalTitle",
    "BookTitle",
    "Year",
    "Date",
    "Publisher",
    "JournalId",
    "ConferenceSeriesId",
    "ConferenceInstanceId",
    "Volumne",
    "Issue",
    "FirstPage",
    "LastPage",
    "ReferenceCount",
    "CitationCount",
    "EstimatedCitation",
    "OriginalVenue",
    "FamilyId",
    "CreatedDate",
]
PAPER_AUTHOR_AFFILIATIONS = "mag/PaperAuthorAffiliations.txt"
PAPER_AUTHOR_AFFILIATIONS_HEADER = [
    "PaperId",
    "AuthorId",
    "AffiliationId",
    "AuthorSequenceNumber",
    "OriginalAuthor",
    "OritinalAffiliation",
]
AUTHOR = "mag/Authors.txt"
AUTHOR_HEADER = [
    "AuthorId",
    "Rank",
    "NormalizedName",
    "DisplayName",
    "LastKnownAffiliationId",
    "PaperCount",
    "CitationCount",
    "CreateDate",
]
PAPER_FIELDS_OF_STUDY = "mag/PaperFieldsOfStudy.txt"
PAPER_FIELDS_OF_STUDY_HEADER = ["PaperId", "FieldOfStudyId", "Score"]
FIELDS_OF_STUDY = "mag/FieldsOfStudy.txt"
FIELDS_OF_STUDY_HEADER = [
    "FieldOfStudyId",
    "Rank",
    "NormalizedName",
    "DisplayName",
    "MainType",
    "Level",
    "PaperCount",
    "CitationCount",
    "CreateDate",
]

In [2]:
def extract_conferences(interested_range, filename=None):
    with open(CONFERENCE_SERIES) as fp:
        conference_series_df = pd.read_csv(
            fp, sep="\t", header=None, names=CONFERENCE_SERIES_HEADER
        )

    with open(CONFERENCE_INSTANCES) as fp:
        conference_instances_df = pd.read_csv(
            fp, sep="\t", header=None, names=CONFERENCE_INSTANCES_HEADER
        )

    kdd_series = conference_series_df[conference_series_df.NormalizedName == "KDD"]
    kdd_instances = conference_instances_df[
        (
            conference_instances_df.ConferenceSeriesId
            == kdd_series.ConferenceSeriesId.iloc[0]
        )
        & (conference_instances_df.DisplayName.isin(interested_range))
    ]
    if filename:
        kdd_instances.to_csv(filename)
    return kdd_instances

In [3]:
def extract_papers(kdd_instances, num_loops=-1):
    chunksize = 10 ** 6
    count = 0
    paperCounter = Counter({})
    relevantPapers = pd.DataFrame(columns=PAPERS_HEADER)
    for chunk in pd.read_csv(
        PAPERS,
        chunksize=chunksize,
        sep="\t",
        header=None,
        names=PAPERS_HEADER,
        low_memory=False,
    ):
        count += 1
        print(f"Processing {chunksize * (count - 1)} - {chunksize * count}")
        paperCounter += Counter(
            dict(
                pd.to_numeric(chunk.Year, errors="coerce")
                .astype("Int32")
                .value_counts()
            )
        )
        relevantPapers = relevantPapers.append(
            chunk[
                chunk.ConferenceInstanceId.isin(
                    list(kdd_instances.ConferenceInstanceId)
                )
            ],
            ignore_index=True,
        )
        if count == num_loops:
            break
    all_paper_count = (
        pd.DataFrame.from_dict(paperCounter, orient="index")
        .reset_index()
        .rename(columns={"index": "year", 0: "count"})
    )
    all_paper_count.to_csv(r"results/all_paper_count.csv")
    relevantPapers.to_csv(r"results/kdd_papers.csv")

In [4]:
def extract_paper_author_affiliation(kdd_papers, num_loops=-1):
    chunksize = 10 ** 7
    count = 0
    relavantPaperAuthorAffiliations = pd.DataFrame(
        columns=PAPER_AUTHOR_AFFILIATIONS_HEADER
    )
    for chunk in pd.read_csv(
        PAPER_AUTHOR_AFFILIATIONS,
        chunksize=chunksize,
        sep="\t",
        header=None,
        names=PAPER_AUTHOR_AFFILIATIONS_HEADER,
    ):
        count += 1
        print(f"Processing {chunksize * (count - 1)} - {chunksize * count}")
        relavantPaperAuthorAffiliations = relavantPaperAuthorAffiliations.append(
            chunk[chunk.PaperId.isin(list(kdd_papers.PaperId))], ignore_index=True
        )
        if count == num_loops:
            break
    relavantPaperAuthorAffiliations.to_csv(r"results/kdd_paper_author_affiliations.csv")

In [5]:
def extract_author(kdd_paper_author_affiliation, num_loops=-1):
    chunksize = 10 ** 6
    count = 0
    relevantAuthors = pd.DataFrame(columns=AUTHOR_HEADER)
    for chunk in pd.read_csv(
        AUTHOR, chunksize=chunksize, sep="\t", header=None, names=AUTHOR_HEADER
    ):
        count += 1
        print(f"Processing {chunksize * (count - 1)} - {chunksize * count}")
        relevantAuthors = relevantAuthors.append(
            chunk[chunk.AuthorId.isin(list(kdd_paper_author_affiliation.AuthorId))],
            ignore_index=True,
        )
        if count == num_loops:
            break
    relevantAuthors.to_csv(r"results/kdd_authors.csv")

In [6]:
def extract_paper_author_affiliation_for_authors(kdd_authors, num_loops=-1):
    chunksize = 10 ** 7
    count = 0
    relavantPaperAuthorAffiliations = pd.DataFrame(
        columns=PAPER_AUTHOR_AFFILIATIONS_HEADER
    )
    for chunk in pd.read_csv(
        PAPER_AUTHOR_AFFILIATIONS,
        chunksize=chunksize,
        sep="\t",
        header=None,
        names=PAPER_AUTHOR_AFFILIATIONS_HEADER,
    ):
        count += 1
        print(f"Processing {chunksize * (count - 1)} - {chunksize * count}")
        relavantPaperAuthorAffiliations = relavantPaperAuthorAffiliations.append(
            chunk[chunk.AuthorId.isin(list(kdd_authors.AuthorId))], ignore_index=True
        )
        if count == num_loops:
            break
    relavantPaperAuthorAffiliations.to_csv(
        r"results/kdd_paper_author_affiliations_for_authors.csv"
    )

In [7]:
def extract_paper_fields_of_study(paper_list, num_loops=-1):
    chunksize = 10 ** 7
    count = 0
    relevantPaperFieldsOfStudy = pd.DataFrame(columns=PAPER_FIELDS_OF_STUDY_HEADER)
    for chunk in pd.read_csv(
        PAPER_FIELDS_OF_STUDY,
        chunksize=chunksize,
        sep="\t",
        header=None,
        names=PAPER_FIELDS_OF_STUDY_HEADER,
    ):
        count += 1
        print(f"Processing {chunksize * (count - 1)} - {chunksize * count}")
        relevantPaperFieldsOfStudy = relevantPaperFieldsOfStudy.append(
            chunk[chunk.PaperId.isin(paper_list)], ignore_index=True
        )
        if count == num_loops:
            break
    relevantPaperFieldsOfStudy.to_csv(r"results/paper_fields_of_study.csv")

In [8]:
def save_dataframe_to_file(dataframe, filepath):
    os.remove(filepath)
    with open(filepath, "a") as fd:
        fd.write(f'{",".join(map(str, dataframe.columns))}')
        for index in tqdm(range(len(dataframe))):
            fd.write(f'\n{",".join(map(str, dataframe.iloc[index]))}')

In [9]:
def convert_fields_of_study_to_feature_vector(paper_fields_of_study):
    table_dict = {}
    for paperId in tqdm(list(set(paper_fields_of_study.PaperId))[0:10000]):
        dataframe = paper_fields_of_study[paper_fields_of_study.PaperId == paperId]
        table = pd.pivot_table(
            dataframe,
            values="Score",
            index=["PaperId"],
            columns=["FieldOfStudyId"],
            aggfunc=np.sum,
        )
        table_dict.update(table.T.to_dict())
    dataframe = pd.DataFrame(table_dict).fillna(0).T.round(3)

    with open(FIELDS_OF_STUDY) as fp:
        fields_of_study_df = pd.read_csv(
            fp, sep="\t", header=None, names=FIELDS_OF_STUDY_HEADER
        )

        level2_fields = sorted(
            list(
                fields_of_study_df[
                    (fields_of_study_df.FieldOfStudyId.isin(dataframe.columns))
                    & (fields_of_study_df.Level == 2)
                ].FieldOfStudyId
            )
        )
        dataframe = dataframe[level2_fields]
        dataframe.columns = [
            fields_of_study_df[fields_of_study_df.FieldOfStudyId == id]
            .iloc[0]
            .NormalizedName
            for id in dataframe.columns
        ]
    dataframe.to_csv(r"results/paper_features.csv")
    return level2_fields

In [10]:
def generate_author_features(
    paper_author_affiliation_for_authors, paper_fields_of_study, field_order
):
    dictionary = {}
    for authorId in tqdm(set(paper_author_affiliation_for_authors.AuthorId)):
        paperIds = list(
            paper_author_affiliation_for_authors[
                paper_author_affiliation_for_authors.AuthorId == authorId
            ].PaperId
        )
        fields = (
            paper_fields_of_study[paper_fields_of_study.PaperId.isin(paperIds)]
            .filter(["FieldOfStudyId", "Score"])
            .groupby("FieldOfStudyId")
            .mean()
        )
        fields_dictionary = dict(zip(fields.index, fields.Score.values))
        dictionary[authorId] = fields_dictionary

    dataframe = pd.DataFrame(dictionary).fillna(0).T.round(3)
    dataframe = dataframe[field_order]
    with open(FIELDS_OF_STUDY) as fp:
        fields_of_study_df = pd.read_csv(
            fp, sep="\t", header=None, names=FIELDS_OF_STUDY_HEADER
        )

        dataframe.columns = [
            fields_of_study_df[fields_of_study_df.FieldOfStudyId == id]
            .iloc[0]
            .NormalizedName
            for id in dataframe.columns
        ]
    dataframe.to_csv(r"results/author_features.csv")

In [11]:
def generate_author_features_using_maximum(
    paper_author_affiliation_for_authors, paper_fields_of_study, field_order
):
    dictionary = {}
    for authorId in tqdm(set(paper_author_affiliation_for_authors.AuthorId)):
        paperIds = list(
            paper_author_affiliation_for_authors[
                paper_author_affiliation_for_authors.AuthorId == authorId
            ].PaperId
        )
        fields = (
            paper_fields_of_study[paper_fields_of_study.PaperId.isin(paperIds)]
            .filter(["FieldOfStudyId", "Score"])
            .groupby("FieldOfStudyId")
            .max()
        )
        fields_dictionary = dict(zip(fields.index, fields.Score.values))
        dictionary[authorId] = fields_dictionary

    dataframe = pd.DataFrame(dictionary).fillna(0).T.round(3)
    dataframe = dataframe[field_order]
    with open(FIELDS_OF_STUDY) as fp:
        fields_of_study_df = pd.read_csv(
            fp, sep="\t", header=None, names=FIELDS_OF_STUDY_HEADER
        )

        dataframe.columns = [
            fields_of_study_df[fields_of_study_df.FieldOfStudyId == id]
            .iloc[0]
            .NormalizedName
            for id in dataframe.columns
        ]
    dataframe.to_csv(r"results/author_features_max.csv")

In [12]:
# This code block extract the features for simulation

# extract_conferences([
#         "KDD 2019",
#         "KDD 2018",
#         "KDD 2017",
#         "KDD 2016",
#         "KDD 2015",
#         "KDD 2014",
#         "KDD 2013",
#         "KDD 2012",
#         "KDD 2011",
#         "KDD 2010",
#     ], "results/kdd_instances.csv")
# kdd_instances = pd.read_csv(r"results/kdd_instances.csv", index_col=0)
# extract_papers(kdd_instances)

kdd_papers = pd.read_csv(r"results/kdd_papers.csv", index_col=0)
kdd_papers[kdd_papers.Year > 2013]

extract_paper_author_affiliation(kdd_papers)

kdd_paper_author_affiliation = pd.read_csv(
    r"results/kdd_paper_author_affiliations.csv", index_col=0
)
extract_author(kdd_paper_author_affiliation)
kdd_authors = pd.read_csv(r"results/kdd_authors.csv", index_col=0)
extract_paper_author_affiliation_for_authors(kdd_authors)
paper_author_affiliation_for_authors = pd.read_csv(
    r"results/kdd_paper_author_affiliations_for_authors.csv", index_col=0
)
extract_paper_fields_of_study(list(paper_author_affiliation_for_authors.PaperId))
paper_fields_of_study = pd.read_csv(r"results/paper_fields_of_study.csv", index_col=0)
kdd_paper_fields_of_study = paper_fields_of_study[
    paper_fields_of_study.PaperId.isin(kdd_papers.PaperId)
]
fields = convert_fields_of_study_to_feature_vector(kdd_paper_fields_of_study)

paper_fields_of_study = paper_fields_of_study[
    paper_fields_of_study.FieldOfStudyId.isin(fields)
]
paper_fields_of_study = pd.read_csv(r"results/paper_fields_of_study.csv", index_col=0)
generate_author_features(
    paper_author_affiliation_for_authors, paper_fields_of_study, fields
)
generate_author_features_using_maximum(
    paper_author_affiliation_for_authors, paper_fields_of_study, fields
)

Processing 0 - 10000000
Processing 10000000 - 20000000
Processing 20000000 - 30000000
Processing 30000000 - 40000000
Processing 40000000 - 50000000
Processing 50000000 - 60000000
Processing 60000000 - 70000000
Processing 70000000 - 80000000
Processing 80000000 - 90000000
Processing 90000000 - 100000000
Processing 100000000 - 110000000
Processing 110000000 - 120000000
Processing 120000000 - 130000000
Processing 130000000 - 140000000
Processing 140000000 - 150000000
Processing 150000000 - 160000000
Processing 160000000 - 170000000
Processing 170000000 - 180000000
Processing 180000000 - 190000000
Processing 190000000 - 200000000
Processing 200000000 - 210000000
Processing 210000000 - 220000000
Processing 220000000 - 230000000
Processing 230000000 - 240000000
Processing 240000000 - 250000000
Processing 250000000 - 260000000
Processing 260000000 - 270000000
Processing 270000000 - 280000000
Processing 280000000 - 290000000
Processing 290000000 - 300000000
Processing 300000000 - 310000000
Pro

  mask |= (ar1 == a)
100%|██████████| 2779/2779 [00:34<00:00, 81.41it/s]
100%|██████████| 7152/7152 [13:58<00:00,  8.53it/s]
100%|██████████| 7152/7152 [14:00<00:00,  8.51it/s]
