In [1]:
__author__ = "Jon Ball"
__version__ = "June 2024"

In [2]:
import re
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from numpy import array # needed for eval()

In [3]:
with open("data/issns_active_2022.txt", "r") as infile:
    issnsActive = list(line.strip() for line in infile.readlines())
issnsActive = {issn: 1 for issn in issnsActive if issn}
print(len(issnsActive))

60709


In [4]:
def get_edges(row):
    # for each unique publication in the dimensions data
    pub_id = row["publication_id"]
    # if the ISSN indicates a 2022 active OJS context
    if row["issn"] in issnsActive or row["eissn"] in issnsActive:
        # get publication_id for each document it references
        ref_ids =  list(set([s.strip("'") for s in row["reference_ids"].strip("[]").split()])) if "[]" not in row["reference_ids"] else []
        # get publication_id for each document that cites it
        cite_ids = list(set([d["id"] for d in [eval(s) for s in re.split(r"\.*\n+\.*", row["citing_ids"].strip("[]"))]])) if "[]" not in row["citing_ids"] else []
        # define edges in the citation graph
        V = [(pub_id, ref_id) for ref_id in ref_ids if ref_id]
        V += [(cite_id, pub_id) for cite_id in cite_ids if cite_id]
        return V
    else:
        return None

In [5]:
def get_issns(row):
    # if an ISSN is provided
    issn = row["issn"]
    issn = issn.strip() if isinstance(issn, str) else ""
    # if an E-ISSN is provided
    eissn = row["eissn"]
    eissn = eissn.strip() if isinstance(eissn, str) else ""
    # join ISSN and E-ISSN if both are present, else take either
    if issn and eissn:
        issns = f"{issn}\n{eissn}"
    elif issn:
        issns = issn
    elif eissn:
        issns = eissn
    else:
        issns = None
    return issns


In [6]:
def get_id_issn_map(row):
    # for each unique publication in the dimensions data
    pub_id = row["publication_id"]
    # return a mapping of publication_id to ISSN(s)
    issns = row["issns"]
    return pub_id, issns

## references_of_all_pubs.csv - 2022 active ISSNs

In [7]:
ref_df = pd.read_csv("data/references_of_all_pubs.csv", header=None)
ref_df.columns = ["index", "publication_id", "reference_ids", "citing_ids", "doi", "issn", "eissn", "type", "date", "category_for", "times_cited", "research_org_cities", "ur_id", "source", "research_org_country_names", "altmetrics", "title", "abstract", "concepts", "idk"]
ref_df.head()

Unnamed: 0,index,publication_id,reference_ids,citing_ids,doi,issn,eissn,type,date,category_for,times_cited,research_org_cities,ur_id,source,research_org_country_names,altmetrics,title,abstract,concepts,idk
0,0,pub.1120627557,[],"[{'id': 'pub.1158208917', 'year': 2023}]",10.35673/ajmpi.v4i1.215,2406-8802,2685-550X,article,2019-07-31,"{'first_level': {'codes': array(['43', '50'], ...","{'times_cited': 1, 'recent_citations': 1, 'fie...",[],['ur.011573102374.79'],"{'id': 'jour.1377569', 'title': 'Al-Adalah Jur...",[],"{'id': None, 'score': None}",{'preferred': 'PRAKSIS POLITIK NABI MUHAMMAD S...,"{'preferred': ""This paper examines the praxis ...","[{'concept': 'political action', 'relevance': ...",[]
1,1,pub.1016805418,['pub.1060450759' 'pub.1060527901' 'pub.106052...,"[{'id': 'pub.1138140594', 'year': 2021}\n {'id...",10.1038/294139a0,0028-0836,1476-4687,article,1981-11-01,"{'first_level': {'codes': array(['34', '51'], ...","{'times_cited': 120, 'recent_citations': 4, 'f...",['grid.482271.a'],['ur.0770644175.43' 'ur.01133403503.58' 'ur.01...,"{'id': 'jour.1018957', 'title': 'Nature', 'iss...",['United Kingdom'],"{'id': None, 'score': None}",{'preferred': 'Near-edge X-ray absorption spec...,{'preferred': 'The measurement of X-ray absorp...,[{'concept': 'X-ray absorption fine structure'...,[]
2,2,pub.1144573636,['pub.1029003997' 'pub.1134779582' 'pub.113470...,"[{'id': 'pub.1162918892', 'year': 2023}\n {'id...",10.3390/w14020151,,2073-4441,article,2022-01-07,"{'first_level': {'codes': array(['37', '40'], ...","{'times_cited': 26, 'recent_citations': 26, 'f...",[],['ur.011257637763.81' 'ur.016141117352.37' 'ur...,"{'id': 'jour.1398516', 'title': 'Water', 'issn...",[],"{'id': None, 'score': None}",{'preferred': 'Development of a Distributed Ma...,{'preferred': 'The article is devoted to the p...,"[{'concept': 'mathematical model', 'relevance'...",[]
3,3,pub.1142327767,['pub.1001503523' 'pub.1002062994' 'pub.100313...,"[{'id': 'pub.1150797038', 'year': 2022}\n {'id...",10.1016/j.seta.2021.101661,2213-1388,2213-1396,article,2022-02-01,"{'first_level': {'codes': array(['33'], dtype=...","{'times_cited': 32, 'recent_citations': 32, 'f...",['grid.1004.5' 'grid.5037.1'],['ur.011201747573.76' 'ur.01166110143.28'],"{'id': 'jour.1144532', 'title': 'Sustainable E...",['Australia' 'Sweden'],"{'id': '119980489', 'score': 8}",{'preferred': 'A comparative review on the app...,{'preferred': 'Radiant low-temperature heating...,"[{'concept': 'high-temperature cooling', 'rele...",[]
4,4,pub.1037712872,['pub.1069651355' 'pub.1058573268' 'pub.100770...,"[{'id': 'pub.1134429328', 'year': 2021}\n {'id...",10.1016/0304-3878(81)90004-3,0304-3878,1872-6089,article,1981-08-01,"{'first_level': {'codes': array(['38', '44'], ...","{'times_cited': 800, 'recent_citations': 82, '...",['grid.17635.36'],['ur.010256274757.27' 'ur.012552575265.02'],"{'id': 'jour.1122871', 'title': 'Journal of De...",['United States'],"{'id': '64389583', 'score': 15}",{'preferred': 'The measurement and sources of ...,{'preferred': 'Production function models are ...,"[{'concept': 'firm attributes', 'relevance': 0...",[]


In [8]:
ref_df["issns"] = ref_df.apply(get_issns, axis=1)

In [9]:
ref_df["issns"]

0           2406-8802\n2685-550X
1           0028-0836\n1476-4687
2                      2073-4441
3           2213-1388\n2213-1396
4           0304-3878\n1872-6089
                    ...         
11556604    0047-2891\n1573-6601
11556605    0269-9931\n1464-0600
11556606    1079-5014\n1758-5368
11556607    0731-7107\n1545-228X
11556608    1554-351X\n1532-5970
Name: issns, Length: 11556609, dtype: object

In [10]:
activeDF = ref_df[(ref_df["issn"].isin(issnsActive.keys())) | (ref_df["eissn"].isin(issnsActive.keys()))].copy()
activeDF.count()

index                         1215121
publication_id                1215121
reference_ids                 1215121
citing_ids                    1215121
doi                           1187300
issn                          1077002
eissn                         1119079
type                          1215121
date                          1214844
category_for                   797000
times_cited                   1215107
research_org_cities           1215121
ur_id                         1215121
source                        1215121
research_org_country_names    1215121
altmetrics                    1215121
title                         1215121
abstract                      1005616
concepts                      1215121
idk                           1215121
issns                         1215121
dtype: int64

In [11]:
activeDF["edges"] = activeDF.apply(get_edges, axis=1)
print(activeDF.iloc[0]["edges"])

[('pub.1158208917', 'pub.1120627557')]


In [12]:
edge_list = [e for el in activeDF["edges"].tolist() for e in el]
print(len(edge_list))
print(edge_list[0])

39646526
('pub.1158208917', 'pub.1120627557')


In [13]:
unique_contexts = activeDF[activeDF["issns"].notnull()]["issns"].unique().tolist()
print(len(unique_contexts))

24309


In [14]:
id2issns = ref_df.apply(get_id_issn_map, axis=1).tolist()
print(id2issns[0])

('pub.1120627557', '2406-8802\n2685-550X')


In [15]:
del ref_df

## citations_of_all_pubs.csv - 2022 active ISSNs

In [16]:
cite_df = pd.read_csv("data/citations_of_all_pubs.csv", header=None)
cite_df.columns = ["index", "publication_id", "reference_ids", "citing_ids", "doi", "issn", "eissn", "type", "date", "category_for", "times_cited", "research_org_cities", "ur_id", "source", "research_org_country_names", "altmetrics", "title", "abstract", "concepts"]
cite_df.head()

Unnamed: 0,index,publication_id,reference_ids,citing_ids,doi,issn,eissn,type,date,category_for,times_cited,research_org_cities,ur_id,source,research_org_country_names,altmetrics,title,abstract,concepts
0,0,pub.1011627231,['pub.1000621744' 'pub.1000722723' 'pub.100103...,"[{'id': 'pub.1115166073', 'year': 2016}\n {'id...",10.1152/ajpendo.90306.2008,0193-1849,1522-1555,article,2008-05-20,"{'first_level': {'codes': array(['31', '42'], ...","{'times_cited': 100, 'recent_citations': 6, 'f...",['grid.6451.6'],['ur.07527627403.94' 'ur.01276472107.59'],"{'id': 'jour.1327387', 'title': 'AJP Endocrino...",['Israel'],"{'id': None, 'score': None}",{'preferred': 'Transcriptional regulation of t...,{'preferred': 'The insulin-responsive glucose ...,[]
1,1,pub.1011182182,['pub.1013726283' 'pub.1018900088' 'pub.102673...,"[{'id': 'pub.1102285413', 'year': 2010}\n {'id...",10.1353/apa.0.0007,0360-5949,1533-0699,article,2008-03-01,"{'first_level': {'codes': array(['43', '44', '...","{'times_cited': 7, 'recent_citations': 2, 'fie...",['grid.266515.3'],['ur.011637600145.05'],"{'id': 'jour.1143161', 'title': 'Transactions ...",['United States'],"{'id': '56626804', 'score': 9}",{'preferred': 'Genus quid est?: Roman Scholars...,{'preferred': 'From at least as early as Varro...,[]
2,2,pub.1000357800,['pub.1008741958' 'pub.1040358033' 'pub.105278...,"[{'id': 'pub.1055082006', 'year': 2015}\n {'id...",10.1016/j.cryobiol.2008.09.005,0011-2240,1090-2392,article,2008-09-19,"{'first_level': {'codes': array(['30', '32'], ...","{'times_cited': 29, 'recent_citations': 1, 'fi...",['grid.29980.3a'],['ur.013616340112.49' 'ur.01310020772.09' 'ur....,"{'id': 'jour.1001351', 'title': 'Cryobiology',...",['New Zealand'],"{'id': None, 'score': None}",{'preferred': 'Characterization of a family of...,{'preferred': 'Five genes coding for ice-activ...,[]
3,3,pub.1007126901,['pub.1006612903' 'pub.1010028023' 'pub.100433...,"[{'id': 'pub.1035493063', 'year': 2009}\n {'id...",10.1080/02513625.2008.10557013,0251-3625,2166-8604,article,2008-01-01,"{'first_level': {'codes': array(['33'], dtype=...","{'times_cited': 2, 'recent_citations': 0, 'fie...",[],['ur.016213633215.53' 'ur.016420304310.79'],"{'id': 'jour.1053460', 'title': 'disP - The Pl...",[],"{'id': None, 'score': None}","{'preferred': 'Die Zentralität war schon da!',...",,[]
4,4,pub.1007032109,['pub.1102845096' 'pub.1113140804' 'pub.106505...,"[{'id': 'pub.1046862372', 'year': 2015}\n {'id...",10.1080/14623940701816709,1462-3943,1470-1103,article,2008-02-01,"{'first_level': {'codes': array(['50'], dtype=...","{'times_cited': 3, 'recent_citations': 0, 'fie...",['grid.410319.e'],['ur.01313550775.24'],"{'id': 'jour.1139399', 'title': 'Reflective Pr...",['Canada'],"{'id': None, 'score': None}",{'preferred': 'Performing responsibility: ethi...,"{'preferred': 'I explore, and reflect on, the ...",[]


In [17]:
cite_df["issns"] = cite_df.apply(get_issns, axis=1)

In [18]:
activeDF = cite_df[(cite_df["issn"].isin(issnsActive.keys())) | (cite_df["eissn"].isin(issnsActive.keys()))].copy()
activeDF.count()

index                         992926
publication_id                992926
reference_ids                 992926
citing_ids                    992926
doi                           987473
issn                          869269
eissn                         926256
type                          992926
date                          992414
category_for                  746954
times_cited                   992918
research_org_cities           992926
ur_id                         992926
source                        992926
research_org_country_names    992926
altmetrics                    992926
title                         992926
abstract                      889261
concepts                      992926
issns                         992926
dtype: int64

In [19]:
unique_contexts += activeDF[activeDF["issns"].notnull()]["issns"].unique().tolist()
print(len(unique_contexts))

37575


In [20]:
activeDF["edges"] = activeDF.apply(get_edges, axis=1)
print(activeDF.iloc[0]["edges"])

[('pub.1034132371', 'pub.1002326098'), ('pub.1034132371', 'pub.1045799580'), ('pub.1034132371', 'pub.1052271780'), ('pub.1034132371', 'pub.1026913579'), ('pub.1034132371', 'pub.1003604100'), ('pub.1034132371', 'pub.1022711941'), ('pub.1045952946', 'pub.1034132371'), ('pub.1079673138', 'pub.1034132371'), ('pub.1041770623', 'pub.1034132371'), ('pub.1022062989', 'pub.1034132371'), ('pub.1081150687', 'pub.1034132371'), ('pub.1017412483', 'pub.1034132371'), ('pub.1019258875', 'pub.1034132371'), ('pub.1081321795', 'pub.1034132371'), ('pub.1082366114', 'pub.1034132371'), ('pub.1020675711', 'pub.1034132371'), ('pub.1008448320', 'pub.1034132371'), ('pub.1080561215', 'pub.1034132371'), ('pub.1017860117', 'pub.1034132371'), ('pub.1020250538', 'pub.1034132371'), ('pub.1028685680', 'pub.1034132371'), ('pub.1043886108', 'pub.1034132371'), ('pub.1093077033', 'pub.1034132371'), ('pub.1032937461', 'pub.1034132371'), ('pub.1008432868', 'pub.1034132371'), ('pub.1021686121', 'pub.1034132371'), ('pub.10153

In [21]:
edge_list += [e for el in activeDF["edges"].tolist() for e in el]
print(len(edge_list))
print(edge_list[-1])

78578459
('pub.1140573500', 'pub.1110438123')


In [22]:
id2issns += cite_df.apply(get_id_issn_map, axis=1).tolist()
print(id2issns[-1])

('pub.1032068684', '0001-8791\n1095-9084')


In [23]:
del cite_df

## Map publication_id's to ISSNs in the citation graph

In [24]:
id2issns = dict(id2issns)

In [25]:
issnEdges = defaultdict(list)
for citer_id, citee_id in edge_list:
    try:
        issnEdges[
            id2issns[citer_id]
            ].append(
                id2issns[citee_id]
                )
    except KeyError:
        continue

In [26]:
print(len(issnEdges))

57006


In [27]:
with open("data/issnEdges.json", "w") as outfile:
    json.dump(issnEdges, outfile)

In [28]:
with open("data/unique_contexts.json", "w") as outfile:
    json.dump(unique_contexts, outfile)