STEP 1: Use CrossRef API to extract the JSONs with the metadata of the Articles of DH Journals

In [0]:
#Files necessary to perform the script:
#http://doi.org/10.5281/zenodo.3406564
#Necessary Libraries: csv, urllib, json

import csv
from csv import DictReader
import urllib

def _fetch_crossref_json(url):
    """Download *url* and return its body parsed as JSON.

    The body is decoded with the charset announced in the response
    headers, falling back to UTF-8 when none is given.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` is a
            subclass, so HTTP error statuses end up here too).
    """
    # Imported locally: the surrounding cell only runs ``import urllib``,
    # which brings in neither ``json`` nor, reliably, ``urllib.request``.
    import json
    import urllib.request

    with urllib.request.urlopen(url) as response:
        charset = response.info().get_param('charset') or 'utf-8'
        return json.loads(response.read().decode(charset))


def jsondump(title, issn, filepath=""):
    """Save the CrossRef metadata of one journal as JSON file(s).

    The works of the journal identified by *issn* are requested from the
    CrossRef API 1000 at a time. The first page is written to
    ``<filepath><title>.json``; every further page is written to
    ``<filepath><title> part N.json`` with N starting at 2. Any "/" in
    *title* is replaced by "-" so that it can be used as a file name.

    Args:
        title: Journal title, used to build the output file name(s).
        issn: ISSN inserted into the CrossRef query URL.
        filepath: Prefix prepended verbatim to the file name(s) (include
            the trailing separator when it is a folder). Empty means the
            current working directory.

    If the first request fails with ``URLError`` (for instance because
    CrossRef does not know the ISSN), a placeholder file containing
    ``{"publication": "null"}`` is written instead. If a later page fails,
    paging stops and the pages already saved are kept.
    """
    import json
    import urllib.error

    page_size = 1000  # the CrossRef API caps the number of rows per query at 1000
    query = 'http://api.crossref.org/journals/' + issn + '/works?rows=1000'
    basename = filepath + title.replace("/", "-")

    # Download before opening any file, so that a failed request can never
    # leave a half-written or doubly-written JSON file behind.
    try:
        first_page = _fetch_crossref_json(query)
    except urllib.error.URLError:
        # Remove this block's file writing (keep the return) if you prefer
        # not to create any file for journals that cannot be retrieved.
        with open(basename + ".json", 'w') as jsonfile:
            json.dump({"publication": "null"}, jsonfile)
        return

    with open(basename + ".json", 'w') as jsonfile:
        json.dump(first_page, jsonfile)

    # Some journals have more than 1000 publications: keep requesting pages
    # until every result reported by the first response has been covered.
    # NOTE(review): CrossRef may refuse very deep offsets; such a refusal
    # would surface as a URLError below and simply stop the paging.
    total_results = first_page['message']['total-results']
    for offset in range(page_size, total_results, page_size):
        part_number = offset // page_size + 1
        try:
            page = _fetch_crossref_json(query + '&offset=' + str(offset))
        except urllib.error.URLError:
            print("could not download part " + str(part_number) + " of "
                  + title + ", the saved results are incomplete")
            break
        with open(basename + " part " + str(part_number) + ".json", 'w') as jsonfile:
            json.dump(page, jsonfile)

dhjournalpath = 'dh_journals.csv' #put here the path to the csv containing the DH Journals
filepath = "" #put here the path to the folder where you want the results to be saved, if left blank it'll be created in the same folder as the script
# Only journals whose 'DH LEVEL' column holds one of these values are downloaded.
wanted_levels = ("Exclusively", "Significantly")
# newline='' is how the csv module documentation asks files to be opened, so
# that line breaks embedded in quoted fields are parsed correctly.
with open(dhjournalpath, 'r', newline='') as csvfile:
  for row in DictReader(csvfile):
    if row['DH LEVEL'] in wanted_levels:
      # One jsondump call per selected journal: title names the file, E_ISSN drives the query.
      jsondump(row['TITLE'], row['E_ISSN'], filepath)

STEP 2: Creation of Auxiliary Dictionaries

In [0]:
#This script was used to create a dictionary with only DOI and DATES, in order
#to speed up the process of the next script

#Be aware that if you kept the lines in the script above that write a placeholder file when a journal is not found,
#you should manually remove the files which have {"publication": "null"} as their content

import json
import os

directory = '' #insert your path here, this is the path where the json from the script above (jsondump) have been created
doidate = dict()
# Build the DOI -> year lookup from every .json file in `directory`.
# A blank path means the current folder (os.listdir('') would raise), which
# matches how the download script treats a blank output path.
for filename in os.listdir(directory or os.curdir):
    if not filename.endswith(".json"): #ideally the folder only holds the json files created with the previous script
      continue
    # os.path.join works whether or not `directory` ends with a separator.
    with open(os.path.join(directory, filename), "r") as jsonfile:
      data = json.load(jsonfile)
    print("loading " + filename)
    # Files without a "message" entry (for example the {"publication": "null"}
    # placeholders written for journals that were not found) hold no articles.
    if not isinstance(data, dict) or "message" not in data:
      print("skipping " + filename + " (no CrossRef results inside)")
      continue
    for element in data["message"]["items"]:
      # First component of the first "date-parts" entry of the "created" field.
      doidate[element["DOI"]] = element["created"]["date-parts"][0][0]

# These are the tadirah techniques in the DH Course Registry.
tadirah_techlist = [
    'Information Retrieval',
    'Encoding',
    'Text Mining',
    'Linked Open Data',
    'Searching',
    'Mapping',
    'Georeferencing',
    'Preservation Metadata',
    'Scanning',
    'Topic Modeling',
    'Named Entity Recognition',
    'Machine Learning',
    'Browsing',
    'POS-Tagging',
    'Concordancing',
    'Brainstorming',
    'Pattern Recognition',
    'Cluster Analysis',
    'Collocation Analysis',
    'Open Archival Information Systems',
    'Photography',
    'Versioning',
    'Gamification',
    'Web Crawling',
    'Commenting',
    'Distance Measures',
    'Sentiment Analysis',
    'Technology Preservation',
    'Durable Persistent Media',
    'Debugging',
    'Principal Component Analysis',
    'Sequence Alignment',
    'Emulation',
    'Replication',
    'Bit Stream Preservation',
    'Migration',
]

# Prepare the dictionary that the matching script will populate: for every
# technique, one entry per year from 2002 to 2020 holding a match counter and
# the list of matching DOIs (a fresh list for each year of each technique).
finaldict = {}
for tech in tadirah_techlist:
    finaldict[tech] = {
        year: {"number": 0, "doilist": []}
        for year in range(2002, 2021)
    }

STEP 3: Matching Tadirah Techniques with Keywords of DH Publications

In [0]:
#For this script you will need the CSVs generated with Microsoft Academic, plus both dictionaries built in Step 2: the one with DOIs and dates (doidate) and the one with dates and techniques (finaldict)
import os
import csv
directory = '' #place the directory of the MA csvs
# Scan every .csv in `directory`; a blank path means the current folder
# (os.listdir('') would raise).
for filename in os.listdir(directory or os.curdir):
    if not filename.endswith(".csv"): #ideally this folder holds no csv files other than the ones extracted from MA
      continue
    # os.path.join works whether or not `directory` ends with a separator;
    # newline='' is how the csv module asks files to be opened.
    with open(os.path.join(directory, filename), "r", newline="") as csvfile:
      csvz = csv.DictReader(csvfile)
      print("reading " + filename)
      for row in csvz:
        # The csv generated with Microsoft Academic might have some issues
        # (missing columns, DOIs that were never downloaded). Such rows are
        # skipped explicitly instead of hiding every error behind a bare except.
        doi = row.get("DOI")
        raw_keywords = row.get("F.FN")
        year = doidate.get(doi)
        if year is None or not raw_keywords:
          continue
        # Normalised once per row rather than once per technique.
        keywords = raw_keywords.lower().replace("-", " ")
        for item in finaldict:
          bucket = finaldict[item].get(year)
          if bucket is None:
            break #the year is outside the range prepared in finaldict, nothing to count for this row
          if item.lower().replace("-", " ") in keywords: #this is specific for POS-TAGGING that becomes pos tagging
            bucket["number"] += 1
            bucket["doilist"].append(doi)
            print("Found tadirah term " + item + " in article " + doi)

with open("tadirahdhmatch.json", "w+") as jsonoutput: #you can change tadirahdhmatch.json with any filename and path you want, as it stands the file will be generated in the same folder as the script
  json.dump(finaldict, jsonoutput)
