In [223]:
import pandas as pd

---

In [224]:
default_df = pd.read_csv("default_citation_metadata.csv", header=0, index_col=0)

In [225]:
broken_citations = pd.read_csv("clear_citation_metadata.csv", header=0, sep=";", index_col=0)

In [226]:
citations = pd.concat([broken_citations.iloc[:, :2], default_df.iloc[:, 2:]], axis=1)

---

In [227]:
citations = citations.dropna(subset=['authors']);

In [229]:
citations.replace("High-Energy Physics - Theory", "High Energy Physics - Theory", inplace=True)
citations["subjects"].fillna("High Energy Physics - Theory", inplace=True)
citations.replace(r"Probability Theory", "Statistics and Probability", regex=True, inplace=True)
citations.replace(r"Quantum Algebra", "Quantum", regex=True, inplace=True)

In [230]:
citations

Unnamed: 0,paper_id,authors,title,abstract,subjects,comments
0,9810034,"Anastasia Doikou, Rafael I. Nepomechie",Parity and Charge Conjugation Symmetries and S...,We formulate the notion of parity for the peri...,"High Energy Physics - Theory, Exactly Solvable...",20
1,201111,G. Papadopoulos,KT and HKT Geometries in Strings and in Black ...,Some selected applications of KT and HKT geome...,"High Energy Physics - Theory, Differential Geo...",26
2,9210111,A. P. Balachandran,"Gauge Symmetries,Topology and Quantisation",The following two loosely connected sets of to...,High Energy Physics - Theory,74
3,9303104,P. Berglund,Dimensionally Reduced Landau-Ginzburg Orbifold...,"It is observed that a large class of $(2,2)$ s...",High Energy Physics - Theory,10
4,9509068,"Tae Seong Kim, Won Ho Kye, Jae Kwan Kim",The Dynamical Behaviors in (2+1)-Dimensional G...,We analyze (2+1)-dimensional Gross-Neveu model...,High Energy Physics - Theory,19
...,...,...,...,...,...,...
29550,210293,Dmitri Antonov,Finite-temperature properties of the supersymm...,The finite-temperature properties of supersymm...,High Energy Physics - Theory,8
29551,9310065,Aurelian Isar,Wigner distribution function for the harmonic ...,Time evolution of the expectation values of va...,High Energy Physics - Theory,17
29552,103002,Kazuto Oshima,Critical Coupling in (1+1)-Dimensional Light-F...,Spontaneous symmetry breaking in (1+1)-dimensi...,High Energy Physics - Theory,21
29553,9406137,"Cesar Gomez, Henri Ruegg, Philippe Zaugg",Lattice Poincare as a quantum deformed algebra,We propose a definition of a Poincar\'e algebr...,High Energy Physics - Theory,10


---

In [231]:
def list_from_string(string):
    if isinstance(string, str):
        return string.split(", ")
    else:
        return None

def flatlist(t):
    flat_list = []
    for sublist in t:
        if isinstance(sublist, list):
            for item in sublist:
                flat_list.append(item)
        else:
            flat_list.append(sublist)
    return flat_list

def flatset(t):
    flat_list = set()
    for sublist in t:
        if isinstance(sublist, set):
            for item in sublist:
                flat_list.add(item)
        else:
            flat_list.add(sublist)
    return flat_list

def mapper(data):
    data2id, id2data = dict(), dict()
    for ind, element in enumerate(data):
        data2id[element] = ind
        id2data[ind] = element
    return data2id, id2data

In [232]:
class Node:
    def __init__(self):
        self.author_id = 0
        self.interests = set()
    def __dict__(self):
        return {"author_id": self.author_id, "interests": self.interests}
    def __repr__(self):
        return str(self.__dict__())

In [233]:
class Edge:
    def __init__(self):
        self.from_id = 0
        self.to_id = 0
        self.article_ids = set()
    def __dict__(self):
        return {"from_id": self.from_id, "to_id": self.to_id, "article_ids": self.article_ids}
    def __repr__(self):
        return str(self.__dict__())

---

In [234]:
# We need to get all authors and change them to ids
authors = flatlist(list(map(list_from_string, citations['authors'].tolist())))
author2id, id2author = mapper(list(set(authors)))

In [235]:
from tqdm.notebook import tqdm
edges = []
for _, article in tqdm(citations.iterrows()):
    article_authors = article["authors"]
    article_authors = list_from_string(article_authors)
    if len(article_authors) > 1:
        for from_author_idx in range(len(article_authors)):
            for to_author in (article_authors[from_author_idx+1:]):
                
                from_id = author2id[article_authors[from_author_idx]]
                to_id = author2id[to_author]

                unique_check = True

                for edge in edges:
                    if edge.from_id == from_id and edge.to_id == to_id or (edge.from_id == to_id and edge.to_id == from_id):
                        unique_check = False
                        edge.article_ids.add(article["paper_id"])

                if unique_check:
                    edge = Edge()
                    edge.from_id = from_id
                    edge.to_id = to_id
                    edge.article_ids.add(article["paper_id"])
                    edges.append(edge)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [236]:
dict_edges = [edge.__dict__() for edge in edges]

In [237]:
extended_edge_list = pd.DataFrame(dict_edges)

In [238]:
extended_edge_list

Unnamed: 0,from_id,to_id,article_ids
0,8950,7794,"{9705187, 9808012, 9803118, 9708145, 9810034, ..."
1,9308,3382,{9509068}
2,9308,3432,{9509068}
3,3382,3432,{9509068}
4,9105,673,"{211203, 212139, 207127}"
...,...,...,...
24016,9922,9388,{6021}
24017,2700,3331,{9708101}
24018,755,12203,{9406137}
24019,755,4805,{9406137}


In [239]:
extended_edge_list.to_csv("extended_edge_list.csv")

---

In [244]:
subjects_tagged = pd.read_csv("subjects_tagged.csv", index_col=1)
subjects_tagged.drop(columns=["Unnamed: 0"], inplace=True)
subjects_tagged = subjects_tagged.to_dict("index")
for subject in subjects_tagged:
    subjects_tagged[subject] = subjects_tagged[subject]["keywords"].lower().split("/")
subject_list = subjects_tagged
len(subjects_tagged)

51

In [245]:
subjects = flatlist(list(map(list_from_string, citations['subjects'].unique().tolist())))
subject2id, id2subject = mapper(list(set(subjects)))

In [266]:
from tqdm.notebook import tqdm
nodes = []
for author_id in tqdm(id2author.keys()):
    node = Node()
    node.author_id = author_id
    connections = extended_edge_list.loc[(extended_edge_list['from_id'] == author_id) | (extended_edge_list['to_id'] == author_id)]
    if not connections.empty:
        get_all_articles = list(flatset(connections["article_ids"].tolist()))
        all_articles = citations[citations["paper_id"].isin(get_all_articles)]
        all_abstracts = all_articles["abstract"].dropna().tolist()
        all_abstracts = " ".join(all_abstracts).lower()
        article_abstract_subjects = []
        for subject in subject_list:
            for keyword in subject_list[subject]:
                if keyword in all_abstracts:
                    article_abstract_subjects.append(subject)
        article_subjects = flatlist(map(list_from_string, all_articles["subjects"].dropna().tolist()))
        all_subjects = set(article_abstract_subjects + article_subjects)
        if all_subjects:
            for subject in all_subjects:
                node.interests.add(subject2id[subject])
    else:
        author_name = id2author[author_id]
        all_articles = citations[citations['authors'].str.contains(author_name)]
        all_abstracts = all_articles["abstract"].dropna().tolist()
        all_abstracts = " ".join(all_abstracts).lower()
        article_abstract_subjects = []
        for subject in subject_list:
            for keyword in subject_list[subject]:
                if keyword in all_abstracts:
                    article_abstract_subjects.append(subject)
        article_subjects = flatlist(map(list_from_string, all_articles["subjects"].dropna().tolist()))
        all_subjects = set(article_abstract_subjects + article_subjects)
        if all_subjects:
            for subject in all_subjects:
                node.interests.add(subject2id[subject])
    nodes.append(node)

HBox(children=(FloatProgress(value=0.0, max=12925.0), HTML(value='')))




In [267]:
dict_nodes = [node.__dict__() for node in nodes]

In [268]:
author_features_list = pd.DataFrame(dict_nodes)

In [None]:
author_features_list

In [270]:
author_features_list.to_csv("author_feature_list.csv")

---

In [259]:
id2author_df = pd.DataFrame(data=id2author.items(), columns=["id", "author"])
id2author_df = id2author_df.set_index("id")
id2author_df.to_csv("id_2_author.csv")

In [260]:
id2author_df

Unnamed: 0_level_0,author
id,Unnamed: 1_level_1
0,Ulrich Theis
1,S. Stramaglia
2,E. T. Akhmedov
3,Nematollah Riazi
4,Yuri Malyuta
...,...
12920,Michael Goodband
12921,P. M. Saffin
12922,A. Tomasiello
12923,J. A. Teschner


---

In [261]:
id2subjects_df = pd.DataFrame(data=id2subject.items(), columns=["id", "subject"])
id2subjects_df = id2subjects_df.set_index("id")
id2subjects_df.to_csv("id_2_subjects.csv")

In [None]:
id2subjects_df