In [2]:
import pandas as pd
import json
import lxml
import sys
import os
from lxml import html
from tqdm import tqdm
from collections import defaultdict, Counter

In [3]:
with open ("data/issn2json.json", "r") as f:
    issn2json = json.load(f)
print(f"type: {type(issn2json)}")
print(f"size: {sys.getsizeof(issn2json)}")

<class 'dict'>


In [51]:
seen = set()
for k, v in issn2json.items():
    if type(v) not in seen:
        print(v)
    seen.add(type(v))

{'@graph': [{'@id': 'http://id.loc.gov/vocabulary/countries/no', 'label': 'Norway'}, {'@id': 'organization/ISSNCenter#_r', '@type': 'http://schema.org/Organization'}, {'@id': 'resource/ISSN-L/0032-8847', 'identifiedBy': 'resource/ISSN/0032-8847#ISSN-L'}, {'@id': 'resource/ISSN/0032-8847', 'identifiedBy': ['resource/ISSN/0032-8847#KeyTitle', 'resource/ISSN/0032-8847#ISSN-L'], 'mainTitle': 'Prismet', 'otherPhysicalFormat': 'resource/ISSN/2535-311X', 'title': 'resource/ISSN/0032-8847#KeyTitle', 'format': 'vocabularies/medium#Print', 'isFormatOf': 'resource/ISSN/2535-311X', 'type': 'http://marc21rdf.info/terms/formofmaterial#a', 'isPartOf': 'resource/ISSN-L/0032-8847', 'name': ['Prismet', 'Prismet (Oslo)'], 'publication': 'resource/ISSN/0032-8847#ReferencePublicationEvent', 'url': 'https://www.nb.no/search?q=seriestitleid:"oai:nb.bibsys.no:998520425584702202"&mediatype=tidsskrift'}, {'@id': 'resource/ISSN/0032-8847#ISSN-L', '@type': 'http://id.loc.gov/ontologies/bibframe/IssnL', 'status': 

In [52]:
issn2json = {k: v for k, v in issn2json.items() if not isinstance(v, str)}
print(f"number of json responses: {len(issn2json)}")

number of json responses: 79143


In [53]:
d = next(iter(issn2json.values()))
print(f"keys: {d.keys()}")

keys: dict_keys(['@graph', '@context'])


In [56]:
keys = set()
for d in issn2json.values():
    for k in d.keys():
        keys.add(k)
print(f"keys: {keys}")

keys: {'error', 'exception', 'status', 'message', '@graph', 'timestamp', 'path', '@context'}


In [60]:
for d in issn2json.values():
    if "error" in d.keys():
        print(d)
        break

print(f"number of errors: {len([d for d in issn2json.values() if 'error' in d])}")
print(f"number of exceptions: {len([d for d in issn2json.values() if 'exception' in d])}")

{'timestamp': '2023-09-13T21:07:35.051+0000', 'status': 500, 'error': 'Internal Server Error', 'exception': 'org.marc4j.util.JsonParser$Escape', 'message': 'A string value cannot contain data after its closing quote (this is most likely caused by a missing comma between members); at Input Source: "MarcInput", Line: 1, Column: 1776, Member Name: ind1', 'path': '/restricted-api/rest/rdf/json'}
number of errors: 34
number of exceptions: 34


In [61]:
issn2json = {k: v for k, v in issn2json.items() if "error" not in v}
print(f"number of json responses: {len(issn2json)}")

number of json responses: 79109


In [63]:
print(f"num records: {len(issn2json)}")
valid = [rec for rec in issn2json.values() if isinstance(rec, dict)]
print(f"num json responses: {len(valid)}")
# nested list comprehension that returns the number of dicts in issn2json.values() that have a key 'mainTitle' in one of the dictionaries listed at the key '@graph'
titles = [(k, title) for k, rec in issn2json.items() for title in rec["@graph"] if "mainTitle" in title]
print(f"num titles: {len(titles)}")

num records: 79109
num json responses: 79109
num titles: 78407


In [67]:
issn2title = [(k, title["mainTitle"]) for k, rec in issn2json.items() for title in rec["@graph"] if "mainTitle" in title]
print(f"num titles: {len(issn2title)}")
print(issn2title[:5])

num titles: 78407
[('0032-8847', 'Prismet'), ('2595-9204', 'Revista Summa Sapientiae.'), ('2654-8097', 'Jurnal Akuntansi dan Pasar Modal.'), ('2655-1772', 'Prosiding Asis.'), ('1984-0098', 'Mythos.')]


In [68]:
issn2title = dict(issn2title)
with open("data/issn2title.json", "w") as f:
    json.dump(issn2title, f)

In [74]:
next(iter(issn2title.values()))

'Prismet'

In [76]:
seen = set()
for k, v in issn2title.items():
    if type(v) not in seen:
        print(v)
    seen.add(type(v))

Prismet
['Cifrova platforma: ìnformacìjnì tehnologìï v socìokulʹturnìj sferì.', 'Цифрова платформа: інформаційні технології в соціокультурній сфері.']


In [77]:
#unique titles
unique_titles = set([t[0] if isinstance(t, list) else t for t in issn2title.values()])
print(f"num unique titles: {len(unique_titles)}")

num unique titles: 60231


In [80]:
with open("data/ukr_missing_issns_from_01jul2023_to_11jul2023.csv", "r") as f:
    missing_issns = list(line for line in f.read().split("\n") if line)[1:]
print(f"num missing issns: {len(missing_issns)}")

num missing issns: 282


In [82]:
for issn in missing_issns[:10]:
    if issn in issn2title:
        print(issn, issn2title[issn])

2309-1533 ['Ìнноваційна економіка.', 'Ìnnovacìjna ekonomìka.']
2310-4864 ['Ìнноваційна економіка.', 'Ìnnovacìjna ekonomìka.']
2520-2626 ['Українознавчий альманах.', 'Ukraïnoznavčij alʹmanah.']
2709-8400 ['Технології електронного навчання.', 'Tehnologìï elektronnogo navčannâ.']
2221-3805 Elektrotehnìčnì ta komp'ûternì sistemi.
2221-3937 ["Elektrotehnìčnì ta komp'ûternì sistemi.", "Електротехнічні та комп'ютерні системи."]
2312-3125 ['Автоматизация технологических и бизнес-процессов.', 'Avtomatizaciâ tehnologičeskih i biznes-processov.']
2312-931X ['Автоматизация технологических и бизнес-процессов.', 'Avtomatizaciâ tehnologičeskih i biznes-processov.']
2312-847X ['Економіка харчової промисловості.', 'Ekonomìka harčovoï promislovostì.']
2411-4111 ['Ekonomìka harčovoï promislovostì.', 'Економіка харчової промисловості.']
