# DataCite Metadata and funding information
### Eric Schares, Jan-March 2024

Look at DataCite metdata. How many records have funding acknowledgements, and how many of those are USFF?

---
## Request the data

In [1]:
import requests
import pandas as pd
#import tqdm
import json

In [2]:
# All
#url = 'https://api.datacite.org/dois'

# fundingReferences
#url = 'https://api.datacite.org/dois?query=fundingReferences:*'

# Datasets only
#url = 'https://api.datacite.org/dois?resource-type-id=dataset'

#MAIN - one year at a time
#url = 'https://api.datacite.org/dois?resource-type-id=dataset&published=2020&query=fundingReferences:*&page[size]=10'

# use page[cursor]=1 to start cursor paging, gets more than 10,000 records
url = 'https://api.datacite.org/dois?resource-type-id=dataset&published=2022&affiliation=true&query=fundingReferences:*&page[cursor]=1&page[size]=1000'

## How many records would we get for this year?

In [3]:
api_response = requests.get(url)

In [4]:
api_response

<Response [200]>

In [5]:
parsed_response = api_response.json()
parsed_response['meta']['total']
#parsed_response

172216

## Parse the JSON response and pull out required fields

In [6]:
def get_DataCite_page(url):

    api_response = requests.get(url)
    
    parsed_response = api_response.json()
    #print(f"{parsed_response['meta']['total']} total records")
    print(f"{len(parsed_response['data'])} this round")
    
    for result in parsed_response['data']:    # data, meta, links
        #print('\n')

        if 'id' in result:
            top_id = result['id']
        else:
            top_id = None

        if 'attributes' in result:

            doi = result['attributes']['doi']
            pubyear = result['attributes']['publicationYear']
            #print(doi, pubyear)     

            if 'creators' in result['attributes'] and result['attributes']['creators'] != []:  # if creators appears AND is not empty
                creators = result['attributes']['creators']
            else:
                creators = None
                
            if 'publisher' in result['attributes'] and result['attributes']['publisher'] != []:
                publisher = result['attributes']['publisher']
            else:
                publisher = None
                
            if 'subjects' in result['attributes'] and result['attributes']['subjects'] != []:
                subjects = result['attributes']['subjects']
            else:
                subjects = None
                
            if 'relatedIdentifiers' in result['attributes'] and result['attributes']['relatedIdentifiers'] != []:
                relatedIdentifiers = result['attributes']['relatedIdentifiers']
            else:
                relatedIdentifiers = None
                
            if 'sizes' in result['attributes'] and result['attributes']['sizes'] != []:
                sizes = result['attributes']['sizes'][0]   # only ever one element in the list so we can take [0]
            else:
                sizes = None
                
            if 'formats' in result['attributes'] and result['attributes']['formats'] != []:
                formats = result['attributes']['formats']
            else:
                formats = None

            if 'rightsList' in result['attributes'] and result['attributes']['rightsList'] != []:
                #num_rightsList = len(result['attributes']['rightsList'])
                rightsIdentifier = None    # so if rightsIdentifier isn't in the next list, will still return something

                for rightslist in result['attributes']['rightsList']:
                    if 'rightsIdentifier' in rightslist:
                        rightsIdentifier = rightslist['rightsIdentifier']
            else:
                rightsIdentifier = None

            #print(f"story so far {doi} {pubyear} {creators} {publisher} {rightsIdentifier}")
            
            if 'fundingReferences' in result['attributes']:    # fundingReferences is a LIST, not a dict
                num_funders = len(result['attributes']['fundingReferences'])
                #print(f"{doi} {pubyear} has { num_funders } funders")

                for single_funder in result['attributes']['fundingReferences']:
                    #return result['attributes']['fundingReferences'][i].get(attribute) or default_value
                    
                    if 'funderName' in single_funder:
                        fundername = single_funder['funderName']
                    else:
                        fundername = None
                        
                    if 'funderIdentifier' in single_funder:
                        funderidentifier = single_funder['funderIdentifier']
                    else:
                        funderidentifier = None

                    if 'funderIdentifierType' in single_funder:
                        funderidentifiertype = single_funder['funderIdentifierType']
                    else:
                        funderidentifiertype = None
                        
                    if 'awardTitle' in single_funder:
                        awardTitle = single_funder['awardTitle']
                    else:
                        awardTitle = None
                        
                    if 'awardNumber' in single_funder:
                        awardNumber = single_funder['awardNumber']
                    else:
                        awardNumber = None

                    # Add new complete line of data for every funder
                    information.append([top_id, pubyear, doi, publisher, subjects, relatedIdentifiers, sizes,
                                        formats, rightsIdentifier, creators, num_funders, awardTitle, awardNumber,
                                        fundername, funderidentifier, funderidentifiertype])
            else:
                print(f"No fundingReferences? {result['id']}")  # should never trigger since in the API call we say fundingReferences=*
        else:
            print(f"No attributes {result}")
    
    if(1):  # set to 0 if you only want the first page of results for testing
        if('next' in parsed_response['links']):
            get_DataCite_page(parsed_response['links']['next'])
        else:
            print('Done!')       


# Run it

In [7]:
information = []
api_response = requests.get(url)
api_response

<Response [200]>

In [8]:
parsed_response = api_response.json()
parsed_response['meta']['total']

172216

In [9]:
%%time
# about 100 per second. 4947 took 46s
# 10 seconds per 1000 request, 32,000 takes about 6 minutes
# 16m 55s for 87,334 records = .0116s/record
# 23m 24s for 127,175 records = .0011s/record
# 28m 57s for 172,056 records = .010s/record

get_DataCite_page(url)

1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 this round
1000 thi

In [10]:
len(information)

193488

In [8]:
information

[['10.5878/000247',
  2020,
  '10.5878/000247',
  'Lund University',
  [{'lang': 'en',
    'subject': 'local government elections',
    'valueUri': 'https://elsst.cessda.eu/id/4/9ff130f0-d8d5-4dc4-b619-5fa3d955599c',
    'subjectScheme': 'ELSST',
    'classificationCode': '9ff130f0-d8d5-4dc4-b619-5fa3d955599c'},
   {'lang': 'sv',
    'subject': 'kommunalval',
    'valueUri': 'https://elsst.cessda.eu/id/4/9ff130f0-d8d5-4dc4-b619-5fa3d955599c',
    'subjectScheme': 'ELSST',
    'classificationCode': '9ff130f0-d8d5-4dc4-b619-5fa3d955599c'},
   {'lang': 'en',
    'subject': 'political parties',
    'valueUri': 'https://elsst.cessda.eu/id/4/c1561f24-0cc7-464c-82c9-e8d820060494',
    'subjectScheme': 'ELSST',
    'classificationCode': 'c1561f24-0cc7-464c-82c9-e8d820060494'},
   {'lang': 'sv',
    'subject': 'politiska partier',
    'valueUri': 'https://elsst.cessda.eu/id/4/c1561f24-0cc7-464c-82c9-e8d820060494',
    'subjectScheme': 'ELSST',
    'classificationCode': 'c1561f24-0cc7-464c-82c9-

In [11]:
# convert to a dataframe
df = pd.DataFrame(information, columns=['top_id', 'publicationyear', 'doi', 'publisher', 'subjects', 'relatedIdentifiers', 'sizes',
                                        'formats', 'rightsIdentifier', 'creators', 'num_funders', 'awardTitle', 'awardNumber',
                                        'funder_name', 'funderidentifier', 'funderidentifiertype'])
df.head(5)

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype
0,10.5878/000405,2022,10.5878/000405,University of Gothenburg,"[{'lang': 'en', 'subject': 'internal politics'...","[{'relationType': 'HasVersion', 'relatedIdenti...",197.43 MiB,,,"[{'name': 'Swedish National Data Service, Univ...",1,,,Bank of Sweden Tercentenary Foundation,https://ror.org/02jkbm893,ROR
1,10.17863/cam.13001,2022,10.17863/cam.13001,Apollo - University of Cambridge Repository,"[{'subject': 'molecular dynamics'}, {'subject'...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Larsen, A', 'nameType': 'Personal',...",1,,1198,EPSRC,,
2,10.17863/cam.13242,2022,10.17863/cam.13242,Apollo - University of Cambridge Repository,"[{'subject': 'NMR relaxation and diffusion'}, ...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Sederman, Andy', 'nameType': 'Perso...",2,,EP/F047991/1,EPSRC,,
3,10.17863/cam.13242,2022,10.17863/cam.13242,Apollo - University of Cambridge Repository,"[{'subject': 'NMR relaxation and diffusion'}, ...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Sederman, Andy', 'nameType': 'Perso...",2,,EP/K039318/1,EPSRC,,
4,10.17863/cam.13340,2022,10.17863/cam.13340,Apollo - University of Cambridge Repository,"[{'subject': 'DFT'}, {'subject': 'Terahertz'},...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Ruggiero, Michael', 'nameType': 'Pe...",1,,1198,EPSRC,,


In [12]:
df.shape

(193488, 16)

In [13]:
#df.to_csv('DataCite_2024_datasets_4819.csv', index=False)
df.to_csv('2022.csv', index=False)

---
## Optional: load data here

In [3]:
#df = pd.read_csv('DataCite_example_set.csv')
df = pd.read_csv('2023.csv')
df.head(3)

  df = pd.read_csv('2023.csv')


Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype,ROR_ID,USFF,USFF_2
0,10.18465/mnt_la_faute_sur_mer_2013,2023,10.18465/mnt_la_faute_sur_mer_2013,Observatoire des Sciences de l'Univers Nantes ...,"[{'subject': 'Géomorphologie', 'valueUri': ''}...",,,,,"[{'name': 'Launeau, Patrick', 'nameType': 'Per...",2,,,"Direction Régionale de l'Environnement, de l'A...",,,,,
1,10.18465/mnt_la_faute_sur_mer_2013,2023,10.18465/mnt_la_faute_sur_mer_2013,Observatoire des Sciences de l'Univers Nantes ...,"[{'subject': 'Géomorphologie', 'valueUri': ''}...",,,,,"[{'name': 'Launeau, Patrick', 'nameType': 'Per...",2,,,Conseil Régional des Pays de la Loire,https://doi.org/10.13039/501100013414,Crossref Funder ID,01zsm1k25,False,False
2,10.18465/mnt_jard_sur_mer_2013,2023,10.18465/mnt_jard_sur_mer_2013,Observatoire des Sciences de l'Univers Nantes ...,"[{'subject': 'Géomorphologie', 'valueUri': ''}...",,,,,"[{'name': 'Launeau, Patrick', 'nameType': 'Per...",2,,,"Direction Régionale de l'Environnement, de l'A...",,,,,


In [4]:
df.shape

(365314, 19)

In [3]:
df.sample(5)

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype,ROR_ID,USFF,USFF_2
314846,10.60712/si-id373666.1,2023,10.60712/si-id373666.1,Leibniz Institute DSMZ - German Collection of ...,"[{'subject': 'BACTERIA'}, {'subject': 'Listeri...",,,"['JSON', 'application/json']",cc-by-4.0,"[{'name': 'Reimer, Lorenz C.', 'nameType': 'Pe...",1,NFDI4Microbiota - Nationale Forschungsdateninf...,460129525,Deutsche Forschungsgemeinschaft e.V.,018mejw64,ROR,018mejw64,False,False
303630,10.60712/si-id352531.1,2023,10.60712/si-id352531.1,Leibniz Institute DSMZ - German Collection of ...,"[{'subject': 'EUKARYOTA'}, {'subject': 'Sperma...",,,"['JSON', 'application/json']",cc-by-4.0,"[{'name': 'Reimer, Lorenz C.', 'nameType': 'Pe...",1,NFDI4Microbiota - Nationale Forschungsdateninf...,460129525,Deutsche Forschungsgemeinschaft e.V.,018mejw64,ROR,018mejw64,False,False
363144,10.5061/dryad.z612jm6jr,2023,10.5061/dryad.z612jm6jr,Dryad,[{'subject': 'FOS: Engineering and technology'...,,3521781 bytes,,cc0-1.0,"[{'name': 'Wu, Yanru', 'nameType': 'Personal',...",3,,QN2023029,Science and Technology Project of the Hebei Ed...,,,,,
271348,10.60712/si-id316005.1,2023,10.60712/si-id316005.1,Leibniz Institute DSMZ - German Collection of ...,"[{'subject': 'UNKNOWN'}, {'subject': 'Unidenti...",,,"['JSON', 'application/json']",cc-by-4.0,"[{'name': 'Reimer, Lorenz C.', 'nameType': 'Pe...",1,NFDI4Microbiota - Nationale Forschungsdateninf...,460129525,Deutsche Forschungsgemeinschaft e.V.,018mejw64,ROR,018mejw64,False,False
364949,10.5061/dryad.mw6m90628,2023,10.5061/dryad.mw6m90628,Dryad,"[{'subject': 'Solar energy'}, {'subject': 'ani...","[{'relationType': 'IsCitedBy', 'relatedIdentif...",232754098 bytes,,cc0-1.0,"[{'name': 'Levin, Michael', 'nameType': 'Perso...",4,,CAR-A-6689,UC Davis Agricultural Experiment Station*,,,,,


---

# ROR
## Find a ROR when given another ID type (Crossref, ISNI, etc.). Adds column `ROR_ID`

In [7]:
df['funderidentifiertype'].value_counts()

ROR                   315877
Crossref Funder ID     34291
ISNI                    1899
Other                    368
GRID                      99
Fearless Fund              1
Name: funderidentifiertype, dtype: int64

In [8]:
df['ROR_ID'].value_counts()

018mejw64                    293146
00k4n6c32                      8204
None                           5260
032e6b942                      4343
021nxhr62                      4168
                              ...  
https://ror.org/05cxhm587         1
05ycxzd89                         1
039570836                         1
000dswa46                         1
02pry0c91                         1
Name: ROR_ID, Length: 3280, dtype: int64

- detect ISNI by 16 digits
- detect Crossref funder by 12 digit

In [15]:
def convert_to_ROR(id_number: str, visited_convert_to_ROR: dict[str, str]) -> str:
    """
    Accepts a identifier that is NOT a ROR ID
    Could be an ISNI or Crossref funder ID
    Returns the corresponding ROR ID
    """
    # ISNI: http://isni.org/isni/0000000449071619
    # Crossref funder ID: https://doi.org/10.13039/501100013414
    
    print(id_number)
    
    if (id_number is None) or (id_number == 'nan'):   # you didn't pass anything in, just return nothing
        return None
    
    id_number = str(id_number)
    end_number = id_number.split('/')[-1]
    
    if 'isni' in id_number or len(end_number)==16:  # format it in 4 groups of 4, the way ROR API wants --> 0000 0004 4907 1619
        end_number = end_number[0:4] + ' ' + end_number[4:8] + ' ' + end_number[8:12] + ' ' + end_number[12:16]
        #print('ISNI', end_number)
    if '(' in end_number:
        end_number = end_number.split('(')[0]
    if end_number.startswith('#'):
        end_number = end_number.replace('#', '')
    
    #print(end_number)
    
    # see if we can run this smarter, don't test every string every time but remember what we already tested and found out
    # start with empty cache
    if visited_convert_to_ROR is None:
        visited_convert_to_ROR = {}
    # check it here, if we've done it already we're done
    if end_number in visited_convert_to_ROR:
        print(f'{end_number} Already visited')
        return visited_convert_to_ROR[end_number]
    
    #print(end_number)
    
    # call the ROR API query https://ror.readme.io/docs/map-other-organization-id-types-to-ror
    url = 'https://api.ror.org/organizations?query="' + end_number + '"'
    api_response = requests.get(url)
    parsed_response = api_response.json()
    
    if parsed_response['number_of_results'] == 1:
        print(parsed_response['items'][0]['id'])
        ror_id = parsed_response['items'][0]['id'].split('/')[-1]
        
        # remember we already visited this org id, record its status
        visited_convert_to_ROR[end_number] = ror_id
        return(ror_id)
    
    elif parsed_response['number_of_results'] == 0:
        print('No results')        
        visited_convert_to_ROR[end_number] = "No results"
        return ('No results')
    
    else:
        print('Multiple results')
        visited_convert_to_ROR[end_number] = "Multiple results"
        return ('Multiple results')


In [16]:
df.head(3)

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype
0,10.5878/000405,2022,10.5878/000405,University of Gothenburg,"[{'lang': 'en', 'subject': 'internal politics'...","[{'relationType': 'HasVersion', 'relatedIdenti...",197.43 MiB,,,"[{'name': 'Swedish National Data Service, Univ...",1,,,Bank of Sweden Tercentenary Foundation,https://ror.org/02jkbm893,ROR
1,10.17863/cam.13001,2022,10.17863/cam.13001,Apollo - University of Cambridge Repository,"[{'subject': 'molecular dynamics'}, {'subject'...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Larsen, A', 'nameType': 'Personal',...",1,,1198,EPSRC,,
2,10.17863/cam.13242,2022,10.17863/cam.13242,Apollo - University of Cambridge Repository,"[{'subject': 'NMR relaxation and diffusion'}, ...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Sederman, Andy', 'nameType': 'Perso...",2,,EP/F047991/1,EPSRC,,


### Load dictionary

In [17]:
visited = json.load(open('visited_dict.txt',))

In [18]:
len(visited)

21308

### Run it

In [19]:
%%time
# create new column called 'ROR_ID' and convert other forms to ROR (ISNI and Crossref Funder ID)
# converting 30,000 took about 5min
# 36,000 took 10.5 min
# 38,000 took 

#what about GRID ID?

#df['ROR_ID'] = df['funderidentifier'].map(convert_to_ROR, visited)
df['ROR_ID'] = df['funderidentifier'].apply(convert_to_ROR, visited_convert_to_ROR = visited)

https://ror.org/02jkbm893
02jkbm893 Already visited
None
None
None
None
None
None
None
None
None
None
None
None
https://doi.org/10.13039/100010661
100010661 Already visited
https://doi.org/10.13039/100010661
100010661 Already visited
https://doi.org/10.13039/100010661
100010661 Already visited
None
None
None
None
None
None
None
None
https://doi.org/10.13039/100011102
100011102 Already visited
None
None
None
None
None
None
None
None
None
None
https://doi.org/10.13039/501100000780
501100000780 Already visited
https://doi.org/10.13039/501100000780
501100000780 Already visited
None
None
None
None
None
None
https://ror.org/01104nk81
01104nk81 Already visited
https://doi.org/10.13039/501100001665
501100001665 Already visited
None
None
None
None
None
None
None
https://doi.org/10.13039/501100000780
501100000780 Already visited
None
https://doi.org/10.13039/501100000780
501100000780 Already visited
https://ror.org/0029jxk29
0029jxk29 Already visited
None
None

 Already visited
https://doi.org/1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Save the `visited` dictionary

In [65]:
# save to file
json.dump(visited, open('visited_dict.txt', 'w'))

In [66]:
len(visited)

21329

In [3]:
# read it back in
visited = json.load(open('visited_dict.txt',))
visited

{'00epmv149': '00epmv149',
 '00k4n6c32': '00k4n6c32',
 '501100010956': '04z8jg394',
 '05mmh0f86': '05mmh0f86',
 '01fmd5559': '01fmd5559',
 '02b5d8509': '02b5d8509',
 '02wxr8x18': '02wxr8x18',
 '021nxhr62': '021nxhr62',
 '100000062': '00adh9b73',
 '501100000923': '05mmh0f86',
 '00rbzpz17': '00rbzpz17',
 '501100012190': '00ghqgy32',
 '501100003725': '013aysd81',
 '501100006769': '03y2gwe85',
 '100010661': '00k4n6c32',
 '501100013414': '01zsm1k25',
 '047egay20': '047egay20',
 '05k73zm37': '05k73zm37',
 '01h0zpd94': '01h0zpd94',
 '0439y7842': '0439y7842',
 '02h291k47': '02h291k47',
 '0314h5y94': '0314h5y94',
 '501100023560': 'No results',
 '05psqqq26': '05psqqq26',
 'Native Australian Animals Trust*': 'None',
 'Taxonomy Research & Information Network*': 'None',
 'Deutsche Forschungsgemeinschaft': 'https://ror.org/018mejw64',
 'Svalbard Environmental Protection Fund*': 'None',
 'Research Grant Council, Hong Kong Special Administrative Region*': 'None',
 'Royal Society': 'https://ror.org/03w

## Find a ROR when no ID given at all, use the name string to look up a ROR ID

In [9]:
df.loc[ df['funderidentifiertype'].isnull() ].shape

(12779, 19)

In [24]:
def find_a_ROR_when_nothing_given(name_string:str, visited_string_to_ROR: dict[str, str]) -> str:
    """
    Takes a name string when no funder ID is given
    Uses https://api.ror.org/organizations?affiliation=university+of+wisconsin+madison to look for corresponding ROR ID
    """
    # United States Department of Health and Human Services. National Institutes of Health. National Institute on Aging
   
    #print(name_string)

    if name_string is None:   # you didn't pass anything in, just return nothing
        return None
    if (name_string == 'nan') or (name_string == 'NaN'):
        return None
    
    if isinstance(name_string, float):  #.isnumeric():
        return "float"
    
    if name_string.isnumeric():  # does a string contain only numbers?
        return "numeric"
    
    name_string = str(name_string)
    
    if name_string.startswith('\n'):
        #name_string = name_string.strip()
        name_string = " ".join(name_string.split())
    if '"' in name_string:
        name_string = name_string.replace('"', '\'')
    if '#' in name_string:
        name_string = name_string.replace('#', '')
    
    print(name_string)
    
    
    # see if we can run this smarter, don't test every string every time but remember what we already tested and found out
    # start with empty cache if nothing provided
    if visited_string_to_ROR is None:
        visited_string_to_ROR = {}
    # check it here, if we've done it already we're done
    if name_string in visited_string_to_ROR:
        print(f'Already visited {name_string}')
        return visited_string_to_ROR[name_string]
    
    
    # call the ROR API query https://ror.readme.io/docs/map-other-organization-id-types-to-ror
    url = 'https://api.ror.org/organizations?affiliation=' + name_string# + '"'
    #print(url)
    api_response = requests.get(url)
    parsed_response = api_response.json()
    
    print(f"{name_string}: {parsed_response['number_of_results']} results")
    
    for result in parsed_response['items']:        
        #print(result['organization']['id'])
        if parsed_response['number_of_results'] == 1 or result['chosen'] == True:  # take the org ID if chosen=True OR there's only one result to consider
            print(f"\nchosen=true {result['organization']['id']} ")
            
            #### remember we already visited this one, record its status
            visited_string_to_ROR[name_string] = result['organization']['id']
            
            return(result['organization']['id'])           

    visited_string_to_ROR[name_string] = 'None'
    return('None')

In [216]:
find_a_ROR_when_nothing_given('1043572', visited)

'numeric'

In [10]:
df.loc[ df['funderidentifier'].isnull() ].shape

(13389, 19)

### Run ROR when nothing given - 16m

In [25]:
%%time

# 32,000 took 16m
# 77,164 took a lot, but 3min after everything in the visited dict

df.loc[ df['funderidentifier'].isnull(), 'ROR_ID'] = df.loc[ df['funderidentifier'].isnull(), 'funder_name' ].apply(find_a_ROR_when_nothing_given, visited_string_to_ROR=visited)

EPSRC
Already visited EPSRC
EPSRC
Already visited EPSRC
EPSRC
Already visited EPSRC
EPSRC
Already visited EPSRC
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
EPSRC
Already visited EPSRC
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
EPSRC
Already visited EPSRC
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
EPSRC
Already visited EPSRC
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
European Commission
Already visited European Commission
Engineering and Physical Sciences Research Council
Already visited Engineering and Physical Sciences Research Council
Leverhulme Trust
Already visited Leverhu

NIOZ - COS: 5 results
National Natural Science Foundation of China
Already visited National Natural Science Foundation of China
National Natural Science Foundation of China
Already visited National Natural Science Foundation of China
Innovate UK
Already visited Innovate UK
Natural Sciences and Engineering Research Council
Already visited Natural Sciences and Engineering Research Council
Medical Research Council
Already visited Medical Research Council
National Science Foundation
Already visited National Science Foundation
Australian Research Council
Already visited Australian Research Council
Biotechnology and Biological Sciences Research Council
Already visited Biotechnology and Biological Sciences Research Council
EPSRC
Already visited EPSRC
European Research Council
Already visited European Research Council
European Research Council
Already visited European Research Council
European Commission Horizon 2020
Already visited European Commission Horizon 2020
European Research Council
Al

ERC-StG: 8 results
NWO
Already visited NWO
NIOZ - Royal Netherlands Institute for Sea Research


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



HASH(0x55d381332728): 0 results
HASH(0x55d3813165b8)
HASH(0x55d3813165b8): 0 results
Goethe University Frankfurt
Already visited Goethe University Frankfurt
National Science Foundation
Already visited National Science Foundation
National Science Foundation
Already visited National Science Foundation
Agency for Science, Technology and Research (A*STAR)
Already visited Agency for Science, Technology and Research (A*STAR)
Agency for Science, Technology and Research (A*STAR), Singapore
Already visited Agency for Science, Technology and Research (A*STAR), Singapore
National Medical Research Council, Ministry of Health, Singapore
Already visited National Medical Research Council, Ministry of Health, Singapore
National Medical Research Council, Ministry of Health, Singapore
Already visited National Medical Research Council, Ministry of Health, Singapore
CPU times: total: 1 s
Wall time: 4.1 s


In [39]:
df[df['funder_name']=='United States Agency for International Development via the BASIS research program at the University of California, Davis']

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype,ROR_ID
2960,10.3886/e183866,2023,10.3886/e183866,ICPSR - Interuniversity Consortium for Politic...,"[{'lang': 'en', 'subject': 'HIV Testing'}, {'l...","[{'relationType': 'IsDocumentedBy', 'relatedId...",,,,"[{'name': 'Yang, Dean', 'nameType': 'Personal'...",1,,"AID-OAA-L-12-00001, AID-OAA-LA-16-0004, and AI...",United States Agency for International Develop...,,,Multiple results
2961,10.3886/e183866v1,2023,10.3886/e183866v1,ICPSR - Interuniversity Consortium for Politic...,"[{'lang': 'en', 'subject': 'HIV Testing'}, {'l...","[{'relationType': 'IsDocumentedBy', 'relatedId...",,,,"[{'name': 'Yang, Dean', 'nameType': 'Personal'...",1,,"AID-OAA-L-12-00001, AID-OAA-LA-16-0004, and AI...",United States Agency for International Develop...,,,Multiple results


In [84]:
find_a_ROR_when_nothing_given('\n        Fram Centre’s flagship “Effects of climate change on terrestrial\n        ecosystems, landscapes, society and indigenous peoples"*\n ', visited)

Fram Centre’s flagship “Effects of climate change on terrestrial ecosystems, landscapes, society and indigenous peoples'*
Fram Centre’s flagship “Effects of climate change on terrestrial ecosystems, landscapes, society and indigenous peoples'*: 20 results


'None'

In [27]:
df.to_csv('2022.csv', index=False)

---
### Manually pull out non-matching strings and save them

### Common terms to look for:

NSF, NASA, USDA, US, U.S., NIH, Forest, NSF:, NOAA, Directorate

In [11]:
df.loc[ df['ROR_ID']=='None' ].shape

(5260, 19)

In [12]:
df.loc[ df['ROR_ID']=='None' ,'funder_name'].value_counts()

unknown                                                                                                     681
Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire          280
NWO                                                                                                          53
Institut National des Sciences de l'Univers - Centre National de la Recherche Scientifique (INSU - CNRS)     43
European Commission - Horizon 2020-RI (EC-H2020)                                                             39
                                                                                                           ... 
ANZSA                                                                                                         1
US Army Research Office                                                                                       1
Hainan Province Science and Technology Program Special Fund                                             

In [50]:
# save value_counts as a df
counts = df.loc[ df['ROR_ID']=='None' ,'funder_name'].value_counts().reset_index().rename(columns={"index": "NoneFunder", 'funder_name': "count"})
counts

Unnamed: 0,NoneFunder,count
0,unknown,681
1,"Direction Régionale de l'Environnement, de l'A...",280
2,NWO,53
3,Institut National des Sciences de l'Univers - ...,43
4,European Commission Horizon 2020,39
...,...,...
2617,Basic Energy Sciences,1
2618,ANZSA,1
2619,US Army Research Office,1
2620,Hainan Province Science and Technology Program...,1


In [51]:
counts.to_csv('2023_funders_of_None.csv', index=False)

### Add information to the `visited` dictionary

In [54]:
#read in test_keys and test_values as lists
restored_df = pd.read_csv('2023_noneRORs_manuallyrestored.csv')
restored_df

Unnamed: 0,Name,ROR
0,\n U.S. Department of Health & Human Se...,https://ror.org/01cwqze88
1,\n U.S. Department of the Interior's In...,https://ror.org/03v0pmy70
2,American Academy of Pediatrics and the Health ...,https://ror.org/033jnv181
3,Funding for this research and manuscript devel...,https://ror.org/027ka1x80
4,Intramural Research Program of the NIH,https://ror.org/01cwqze88
...,...,...
70,United States Forest Service Cohesive Strategy*,https://ror.org/03zmjc935
71,US Fish and Wildlife Service,https://ror.org/04k7dar27
72,USDA Forest Service Pacific Southwest Region*,https://ror.org/01na82s61
73,USDA NIFA Postdoctoral Fellowship Award,https://ror.org/01na82s61


In [55]:
test_keys = restored_df['Name']
test_keys

0     \n        U.S. Department of Health & Human Se...
1     \n        U.S. Department of the Interior's In...
2     American Academy of Pediatrics and the Health ...
3     Funding for this research and manuscript devel...
4                Intramural Research Program of the NIH
                            ...                        
70      United States Forest Service Cohesive Strategy*
71                        US Fish and Wildlife Service 
72        USDA Forest Service Pacific Southwest Region*
73              USDA NIFA Postdoctoral Fellowship Award
74    USDA-NIFA Agricultural Genome to Phenome Initi...
Name: Name, Length: 75, dtype: object

In [56]:
len(test_keys)

75

In [57]:
test_values = restored_df['ROR']
test_values

0     https://ror.org/01cwqze88
1     https://ror.org/03v0pmy70
2     https://ror.org/033jnv181
3     https://ror.org/027ka1x80
4     https://ror.org/01cwqze88
                ...            
70    https://ror.org/03zmjc935
71    https://ror.org/04k7dar27
72    https://ror.org/01na82s61
73    https://ror.org/01na82s61
74    https://ror.org/01na82s61
Name: ROR, Length: 75, dtype: object

In [58]:
# using dict() and zip() to convert lists to dictionary
res = dict(zip(test_keys, test_values))
res

{'\n        U.S. Department of Health & Human Services | NIH | National\n        Institute of Neurological Disorders and Stroke\n      ': 'https://ror.org/01cwqze88',
 "\n        U.S. Department of the Interior's International Technical\n        Assistance Program*\n      ": 'https://ror.org/03v0pmy70',
 'American Academy of Pediatrics and the Health Resources and Services Administration (HRSA) of the U.S. Department of Health and Human Services (HHS) ': 'https://ror.org/033jnv181',
 'Funding for this research and manuscript development was provided by NASA, Biological and Physical Sciences (BPS).': 'https://ror.org/027ka1x80',
 'Intramural Research Program of the NIH': 'https://ror.org/01cwqze88',
 'J. R. is funded by NIH  F32GM146366. This work was supported by NSF CAREER 1552126 to D.F.': 'https://ror.org/021nxhr62',
 'NASA ACE Mission Office': 'https://ror.org/027ka1x80',
 'NASA Awards 80NSSC21K0311 (M.E.R.-C./S.B.R.) and 80NSSC19K9518 (S.B.R.) as well as NIH Award R37HD019938 (U.B

In [62]:
len(visited)

21325

In [63]:
visited.update(res)

In [64]:
len(visited)

21329

### Repair the ROR_ID column one funder at a time. Can do it this way....OR

In [44]:
df.loc[ df['funder_name']=='DoI']

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype,ROR_ID
2432,10.12751/g-node.2e31e3,2019,10.12751/g-node.2e31e3,G-Node,"[{'subject': 'Neuroscience'}, {'subject': 'Ele...","[{'relationType': 'IsDescribedBy', 'relatedIde...",,,,"[{'name': 'Cadena, Santiago', 'nameType': 'Per...",11,,D16PC00003,DoI,,,


In [42]:
%%time
fixing_name = 'NSF'
df.loc[ df['funder_name']==fixing_name, 'ROR_ID'] = df.loc[ df['funder_name']==fixing_name, 'funder_name' ].apply(find_a_ROR_when_nothing_given, visited_string_to_ROR=visited)

Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
Already visited NSF
CPU times: total: 0 ns
Wall time: 5.98 ms


In [32]:
df[0:20].to_csv('2019_test.csv', index=False)

## Or, do a whole big rerun stomping on `ROR_ID`s of None

In [38]:
df.head(3)

Unnamed: 0,top_id,publicationyear,doi,publisher,subjects,relatedIdentifiers,sizes,formats,rightsIdentifier,creators,num_funders,awardTitle,awardNumber,funder_name,funderidentifier,funderidentifiertype,ROR_ID
0,10.5878/000405,2022,10.5878/000405,University of Gothenburg,"[{'lang': 'en', 'subject': 'internal politics'...","[{'relationType': 'HasVersion', 'relatedIdenti...",197.43 MiB,,,"[{'name': 'Swedish National Data Service, Univ...",1,,,Bank of Sweden Tercentenary Foundation,https://ror.org/02jkbm893,ROR,02jkbm893
1,10.17863/cam.13001,2022,10.17863/cam.13001,Apollo - University of Cambridge Repository,"[{'subject': 'molecular dynamics'}, {'subject'...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Larsen, A', 'nameType': 'Personal',...",1,,1198,EPSRC,,,https://ror.org/0439y7842
2,10.17863/cam.13242,2022,10.17863/cam.13242,Apollo - University of Cambridge Repository,"[{'subject': 'NMR relaxation and diffusion'}, ...","[{'relationType': 'IsSupplementTo', 'relatedId...",,,cc-by-4.0,"[{'name': 'Sederman, Andy', 'nameType': 'Perso...",2,,EP/F047991/1,EPSRC,,,https://ror.org/0439y7842


In [67]:
df.loc[ df['ROR_ID']=='None' ].shape

(5411, 17)

In [68]:
%%time
df.loc[ df['ROR_ID']=='None', 'ROR_ID'] = df.loc[ df['ROR_ID']=='None', 'funder_name' ].apply(find_a_ROR_when_nothing_given, visited_string_to_ROR=visited)

Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Already visited Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Already visited Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Already visited Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Already visited Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Direction Régionale de l'Environnement, de l'Aménagement et du Logement (DREAL) - Pays de la Loire
Already visited Direction Régionale de l'Envi

EU Horizons 2020: 4 results
Comart Foundation
Already visited Comart Foundation
Comart Foundation
Already visited Comart Foundation
EPSRC Centre for Doctoral Training in Sustainable Materials and Manufacturing
Already visited EPSRC Centre for Doctoral Training in Sustainable Materials and Manufacturing
EPSRC Centre for Doctoral Training in Sustainable Materials and Manufacturing
Already visited EPSRC Centre for Doctoral Training in Sustainable Materials and Manufacturing
Hertzberg Family Foundation
Already visited Hertzberg Family Foundation
Swiss National Foundation
Already visited Swiss National Foundation
Mercator Foundation Switzerland 
Already visited Mercator Foundation Switzerland 
Hertzberg Family Foundation
Already visited Hertzberg Family Foundation
Swiss National Foundation
Already visited Swiss National Foundation
Mercator Foundation Switzerland 
Already visited Mercator Foundation Switzerland 
Open Research Program of the State Key Laboratory of Severe Weather*
Already vis

In [69]:
df.loc[ df['ROR_ID']=='None' ].shape

(5260, 17)

In [70]:
df.to_csv('2023.csv', index=False)

## Add things to the `visited` dictionary

In [57]:
add_df = pd.read_csv('2019_fundingstrings_stillneedROR.csv', encoding = 'latin1')
add_df

Unnamed: 0,String,ROR
0,Air Force Office of Scientific Research (AFOSR),https://ror.org/011e9bt93
1,Army Research Laboratory,https://ror.org/011hc8f90
2,Army Research Office,https://ror.org/05epdh915
3,Army Research Office (ARO) Multidisciplinary U...,https://ror.org/05epdh915
4,Department of Defense Strategic Environment Re...,https://ror.org/0447fe631
...,...,...
105,USDA-AFRI,https://ror.org/01na82s61
106,USDA-NIFA,https://ror.org/01na82s61
107,USDA-NIFA Hatch funds,https://ror.org/01na82s61
108,USDA-NIFA-AFRI,https://ror.org/01na82s61


In [58]:
(key,val) = (add_df['String'], add_df['ROR'])

In [59]:
key

0        Air Force Office of Scientific Research (AFOSR)
1                               Army Research Laboratory
2                                   Army Research Office
3      Army Research Office (ARO) Multidisciplinary U...
4      Department of Defense Strategic Environment Re...
                             ...                        
105                                            USDA-AFRI
106                                            USDA-NIFA
107                                USDA-NIFA Hatch funds
108                                       USDA-NIFA-AFRI
109                                            USDA-SCRI
Name: String, Length: 110, dtype: object

In [60]:
val

0      https://ror.org/011e9bt93
1      https://ror.org/011hc8f90
2      https://ror.org/05epdh915
3      https://ror.org/05epdh915
4      https://ror.org/0447fe631
                 ...            
105    https://ror.org/01na82s61
106    https://ror.org/01na82s61
107    https://ror.org/01na82s61
108    https://ror.org/01na82s61
109    https://ror.org/01na82s61
Name: ROR, Length: 110, dtype: object

In [65]:
new_dict = add_df.set_index('String').to_dict()['ROR']

In [67]:
type(new_dict)

dict

In [68]:
json.dump(new_dict, open('new_visited_dict.txt', 'w'))