In [1]:
import os, json, time, string
from collections import OrderedDict
from habanero import Crossref

def normalize_str(str_input):
    for punctuation in string.punctuation:
        str_input = str_input.replace(string.punctuation, ' ')
        
    while '  ' in str_input:
        str_input = str_input.replace('  ', ' ')
        
    return str_input.strip().lower()

INPUT_FILE = os.path.join('..', '..', 'raw_data', 'osf.json')
AUTHOR_FILE = os.path.join('..', '..', 'raw_data', 'osf_authors.json')

CROSSREF_FILE = os.path.join('..', '..', 'raw_data', 'CrossRef.json')

CURRENT_FILE = os.path.join('..', '..', 'data', 'crossref_publication_search_results_new_new_new.json')
OUTPUT_FILE = os.path.join('..', '..', 'data', 'osf_title_author_search_results.json')

if not os.path.exists(OUTPUT_FILE):
	open(OUTPUT_FILE, 'w').close()

osf_records = OrderedDict()

with open(INPUT_FILE, 'r') as f:
	for line in f:
		data = json.loads(line)

		for record in data['data']:
			osf_records[record['id']] = {'title' : record['attributes']['title'], \
				'doi' : record['links']['preprint_doi'].replace('https://doi.org/', '')}

with open(AUTHOR_FILE, 'r') as f:
	for line in f:
		data = json.loads(line)

		try:
			for author in list(data.values())[0][0]['data']:
				if author['attributes']['bibliographic']:
					try:
						first_author = author['embeds']['users']['data']['attributes']
					except:
						first_author = author['embeds']['users']['errors'][0]['meta']

					first_author['unregistered_contributor'] = author['attributes']['unregistered_contributor']

					osf_records[list(data.keys())[0]]['first_author'] = first_author
					break
		except:
			pass

dois = set()
with open(CROSSREF_FILE, 'r') as f:
	for line in f:
		data = json.loads(line)

		# if data['DOI'] not in processed_dois:
		if 'osf.io' in data['DOI'] and data['DOI'].split('/')[2] in osf_records:
			if osf_records[data['DOI'].split('/')[2]]['doi'] == data['DOI']:
				crossref_title = data['title'][0]

				crossref_first_author = ''
				if 'author' in data:
					for author in data['author']:
						if author['sequence'] == 'first':
							if 'given' in author:
								crossref_first_author = author['given']

							if 'family' in author:
								crossref_first_author = crossref_first_author + ' ' + author['family']

							if 'suffix' in author:
								crossref_first_author = crossref_first_author + ' ' + author['suffix']

							# it seems that if name is present, there are no other name fields (e.g. given, family)
							if 'name' in author:
								crossref_first_author = crossref_first_author + ' ' + author['name']
				
				osf_title = osf_records[data['DOI'].split('/')[2]]['title']

				osf_author = ''
				if 'first_author' in osf_records[data['DOI'].split('/')[2]]:
					author = osf_records[data['DOI'].split('/')[2]]['first_author']

					if author['given_name'] != '':
						osf_author = author['given_name']

					if author['middle_names'] != '':
						osf_author = osf_author + ' ' + author['middle_names']

					if author['family_name'] != '':
						osf_author = osf_author + ' ' + author['family_name']

					if 'suffix' in author and author['suffix'] != '':
						osf_author = osf_author + ' ' + author['suffix']

					if osf_author == '':
						osf_author = author['full_name']

				osf_records[data['DOI'].split('/')[2]]['first_author_str'] = osf_author
				
				if normalize_str(osf_title) == normalize_str(crossref_title) and \
					normalize_str(osf_author) == normalize_str(crossref_first_author):
					dois.add(data['DOI'])

print(len(dois))

processed_data = {}
with open(CURRENT_FILE, 'r') as f:
	for line in f:
		data = json.loads(line)
		if data['DOI'] in dois:
			processed_data[data['DOI']] = data

print(len(processed_data.keys()))

28422
28422


In [16]:
done = []
with open(OUTPUT_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)
        done.append(data['DOI'])

In [15]:
cr = Crossref()

with open(OUTPUT_FILE, 'a') as o:
    for id_, record in osf_records.items():
        if record['doi'] not in done:
            if record['doi'] in processed_data:
                result = processed_data[record['doi']]
            else:
                print(record['doi'])
                if record['title'].strip() != '':
                    retries = 0
                    while retries < 10:
                        query_title = normalize_str(record['title'])
                        try:
                            if 'first_author_str' in record and record['first_author_str'].strip() != '':
                                query_author = normalize_str(record['first_author_str'])
                                result = cr.works(query_title=query_title, query_author=query_author, limit = 10)
                            else:
                                result = cr.works(query_title=query_title, limit = 10)

                            break
                        except Exception as e:
                            retries += 1

                            if retries == 1:
                                print('Error for DOI {}'.format(record['doi']))
                            print(e)
                            print('Retry #{}'.format(retries))

                            result = { 'error' : str(e), 'query_title' :  query_title, 'query_author' : query_author}
                            # time.sleep(10 * retries)
                            time.sleep(2)

                    result['DOI'] = record['doi']

                    time.sleep(0.04)
                else:
                    print('Error for DOI {} : empty title'.format(record['doi']))
                    result = { 'error' : 'Empty title', 'DOI' : record['doi'] }

            json.dump(result, o)
            o.write('\n')
            o.flush()

10.31227/osf.io/ypmu5
10.31227/osf.io/u4zfj
10.31227/osf.io/c5kna
10.31227/osf.io/kfvr8
10.31227/osf.io/wxdqe
10.31227/osf.io/yfpxn
10.31227/osf.io/gqjhu
10.31219/osf.io/e4h28
10.31227/osf.io/dgcjh
10.31227/osf.io/amsn7
10.31227/osf.io/d6pmq
10.31227/osf.io/gyev8
10.31235/osf.io/fnt5m
10.31229/osf.io/rz4uq
10.31231/osf.io/hs4jc
10.31234/osf.io/7eydr
10.31224/osf.io/7bvmd
10.31224/osf.io/znuwd
10.31234/osf.io/x8c9n
10.31235/osf.io/bkrn8
10.31230/osf.io/dprv5
10.33543/osf.io/3qp98
10.33543/osf.io/6qm8r
10.31219/osf.io/dntf6
10.31224/osf.io/2n6tb
10.31224/osf.io/5gmj4
10.31231/osf.io/tr463
10.31234/osf.io/j2qmy
10.31234/osf.io/k79es
10.31234/osf.io/67c8s
10.31234/osf.io/mcuw2
10.31224/osf.io/48wr2
10.31219/osf.io/4jwkr
10.31234/osf.io/nd2km
10.31234/osf.io/h62uc
10.31234/osf.io/fnbh3
10.31234/osf.io/scj5d
10.31234/osf.io/wfjra
10.31235/osf.io/bvgz3
10.31221/osf.io/wjvrs
10.31221/osf.io/9t2mv
10.31221/osf.io/xy8r7
10.31234/osf.io/v8xsc
10.31234/osf.io/f6x2s
10.31234/osf.io/mj63e
10.31234/o

10.31219/osf.io/dq6uw
10.31219/osf.io/sedzu
10.31219/osf.io/bwa7q
10.31234/osf.io/k83cz
10.31227/osf.io/tpcvg
10.31227/osf.io/5ksma
10.31227/osf.io/yv3jn
10.31227/osf.io/zb857
10.31219/osf.io/3hy4d
10.31219/osf.io/7dr4y
10.31219/osf.io/fkeqv
10.31219/osf.io/7tjq4
10.31219/osf.io/g4umk
10.31219/osf.io/tk7ew
10.31219/osf.io/6kgj7
10.31219/osf.io/qap5u
10.31219/osf.io/xe78b
10.31219/osf.io/x4ksb
10.31219/osf.io/z67gd
10.31219/osf.io/cquda
10.31219/osf.io/cyaeq
10.31219/osf.io/h8qrj
10.31219/osf.io/hpbng
10.31219/osf.io/8kv7z
10.31219/osf.io/a2sdt
10.31227/osf.io/58e3p
10.31227/osf.io/ntjb6
10.31227/osf.io/ujynb
10.31227/osf.io/stb34
10.31227/osf.io/c9a8w
10.31235/osf.io/z69x2
10.31227/osf.io/s9g7n
10.31227/osf.io/4zv8h
10.31219/osf.io/ekcxz
10.31227/osf.io/vaytd
10.31227/osf.io/m9s5k
10.31227/osf.io/hw4xq
10.31227/osf.io/h9j28
10.31219/osf.io/xrbsg
10.31219/osf.io/hgjc9
10.31227/osf.io/7e69k
10.31219/osf.io/2acve
10.31227/osf.io/e82cv
10.31234/osf.io/9vc2n
10.31227/osf.io/y7cxs
10.31227/o

10.31235/osf.io/3zn8x
10.31234/osf.io/rvykq
10.31235/osf.io/cxvk9
10.31227/osf.io/6crdm
10.31227/osf.io/uy6kr
10.31219/osf.io/6b2dp
10.31230/osf.io/dh2nu
10.31234/osf.io/9fzsv
10.31219/osf.io/nrgz4
10.31234/osf.io/asje7
10.31227/osf.io/xw7hb
10.31227/osf.io/bscnd
10.31230/osf.io/zshkp
10.31219/osf.io/46yxr
10.31230/osf.io/uxc7e
10.31234/osf.io/3v7hx
10.31227/osf.io/rezk6
10.31227/osf.io/46d7g
10.31219/osf.io/fh29j
10.31235/osf.io/pg9zj
10.31230/osf.io/h43af
10.31230/osf.io/sy6pk
10.31227/osf.io/twfzm
10.31227/osf.io/64wes
10.31219/osf.io/buw3p
10.31230/osf.io/4kt5g
10.31230/osf.io/2b6cz
10.31237/osf.io/8t5n4
10.31234/osf.io/4r2tf
10.31227/osf.io/pxrv6
10.31227/osf.io/3uvy9
10.31230/osf.io/5k934
10.31219/osf.io/9emjx
10.31219/osf.io/fjr97
10.31219/osf.io/nt7q9
10.31219/osf.io/7tm8w
10.31219/osf.io/jq6k2
10.31227/osf.io/7z46h
10.31234/osf.io/m43hy
10.31730/osf.io/3mwu5
10.31227/osf.io/j2shc
10.31227/osf.io/jk25h
10.31227/osf.io/z2wq3
10.31227/osf.io/juasn
10.31227/osf.io/3nkug
10.31227/o

10.31235/osf.io/zda8t
10.31235/osf.io/z9fdm
10.31235/osf.io/tpdxc
10.31235/osf.io/zj5bp
10.31235/osf.io/6jeb5
10.31235/osf.io/dbmry
10.31219/osf.io/4rng5
10.31234/osf.io/pcq6a
10.31219/osf.io/xcfdq
10.31234/osf.io/pgztq
10.31220/osf.io/68bpq
10.31234/osf.io/yd4re
10.31234/osf.io/pr8pn
10.31235/osf.io/vz2gd
10.31235/osf.io/vqckw
10.31235/osf.io/5vf8h
10.31235/osf.io/cqe57
10.31235/osf.io/e2vgs
10.31235/osf.io/53y9k
10.31235/osf.io/dz922
10.31235/osf.io/2syyn
10.31235/osf.io/wcg39
10.31235/osf.io/5vesv
10.31235/osf.io/5ta5t
10.31235/osf.io/7f86r
10.31235/osf.io/pyf5f
10.31235/osf.io/mrpr2
10.31235/osf.io/fstfd
10.31235/osf.io/csm6j
10.31235/osf.io/hjzww
10.31234/osf.io/hynwh
10.31235/osf.io/cypme
10.31235/osf.io/3nw8q
10.31234/osf.io/z3tjc
10.31235/osf.io/tsdwy
10.31235/osf.io/m9yqc
10.31219/osf.io/xv2f7
10.31234/osf.io/j2wxk
10.31235/osf.io/nw78x
10.31235/osf.io/8mnwv
10.31234/osf.io/k9mn3


In [4]:
processed_data.keys()

dict_keys(['10.31227/osf.io/8cf79', '10.31234/osf.io/dtevx', '10.31227/osf.io/n5eyb', '10.31219/osf.io/226jb', '10.31234/osf.io/23akm', '10.31235/osf.io/24akw', '10.31222/osf.io/24gyk', '10.31234/osf.io/25pnr', '10.31219/osf.io/25s3e', '10.31219/osf.io/25w7u', '10.31235/osf.io/265ws', '10.31234/osf.io/266vp', '10.31234/osf.io/27b43', '10.31235/osf.io/26zn4', '10.31235/osf.io/27hz8', '10.31235/osf.io/286er', '10.31228/osf.io/2877p', '10.31235/osf.io/28sgr', '10.31219/osf.io/2a3mj', '10.31235/osf.io/2bdmj', '10.31219/osf.io/2dyfn', '10.31235/osf.io/2edg7', '10.31235/osf.io/2ej6f', '10.31219/osf.io/2eznq', '10.31219/osf.io/2fcy9', '10.31219/osf.io/2fbrz', '10.31235/osf.io/2fn99', '10.31235/osf.io/2fpkm', '10.31235/osf.io/2ft8x', '10.31219/osf.io/2gehg', '10.31235/osf.io/2h9t6', '10.31235/osf.io/2j8uv', '10.31234/osf.io/2kcn3', '10.31235/osf.io/2n8ba', '10.31235/osf.io/2nujm', '10.31219/osf.io/2pgxj', '10.31234/osf.io/2q4h8', '10.31219/osf.io/2qpa7', '10.31234/osf.io/2qjd5', '10.31219/osf.

In [9]:
'10.31227/osf.io/ypmu5' in done

False

In [17]:
len(done)

29430

In [None]:
|