In [1]:
## Dependencies
import requests
from requests.exceptions import HTTPError
import json
from config import SandboxClientID
from config import SandboxClientsecret
from config import SandboxTwoAuthToken
from config import PubClientID
from config import PubClientsecret
from config import PubTwoAuthToken
from urllib.parse import urlencode
from pandas.io.json import json_normalize  
import pandas as pd
import sys, os

## Dependencies not used
# import re
# import glob
# import numpy as np
# from pprint import pprint



## Note: ORCID (orcid-python) package is outdated
## https://github.com/scholrly/orcid-python
## pip list -v can be used to list packages' install locations
## C:\Users\keg827\AppData\Local\Continuum\anaconda3\Lib\site-packages

## Take a look at Python 3.0 Wiki Built-in Changes section, where it is stated:
## Removed dict.iteritems(), dict.iterkeys(), and dict.itervalues().
## Instead: use dict.items(), dict.keys(), and dict.values() respectively.

# import orcid
# from inspect import getmembers, isfunction
# print(getmembers(orcid, isfunction))
# help(orcid)

## Note: Python-Orcid package also does not work: 
## https://github.com/ORCID/python-orcid

## Pathway for reading data from ORCID Public API 

In this pathway you will use the /read-public endpoint. There is an authentication process (ORCID calls it a two step authentication) where you must request a token prior to your request to read information, but the token is good for many years. You can save the token to the config file with your ClientID and ClientSecret keys once you have it. 

Do not confuse this with the /read-limited endpoint, as that endpoint is meant for reading data from a specific user record and requires the 3 step authentication (i.e. requesting a code, and then exchanging the code for the user's token) where the user whose profile you want to read from must authenticate which gives you (or your web app) permission to read the data. 

The /read-public end point is sufficient if you want to search and return data from the ORCID database from your university, for example. 

In [None]:
## Step 1: Use the /oath/token endpoint to receive the token needed for using the /read-pubic endpoint

## NOTE!!!! You only need to do this step ONCE because your token is good for many years. 
## You can add your token to your config file for future use. 

## Set up header and parameters for post request

headers = {
    "accept": "application/json",
    "method": "post"
}

data = {
    "client_id": PubClientID,
    "client_secret": PubClientsecret,
    "grant_type" : "client_credentials",
    "scope": "/read-public"
}


## Create try/catch loop for request
    
    ## URLs between sandbox and production differ
    ## Member Production API = https://api.orcid.org
    ## Public API = 'http://pub.orcid.org/'
    ## Sandbox API = https://sandbox.orcid.org
    
                
   
for url in ["http://pub.orcid.org/oauth/token"]:
    try: 
        ## Make the API post request 
        response = requests.post(url, headers = headers, data = data) 
        
        ## Print responses
        print(response.status_code)
        print(response.url)
        print(response.text)
        print(type(response.text))
        response_string = response.text
        
        ## If the response was successful, no Exception will be raised
        response.raise_for_status()
    
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')  # Python 3.6
   
    except Exception as err:
        print(f'Other error occurred: {err}')  # Python 3.6
    
    else:
        print('Success!')
        

## Resources
## https://info.orcid.org/documentation/api-tutorials/api-tutorial-read-data-on-a-record/#easy-faq-2536

In [None]:
## Step 2: Save the access token as a variable

## Save the token as a variable
json_token = json.loads(response_string)
print(json_token["access_token"])
final_token = json_token["access_token"]


In [2]:
## Step 3: Use the access token and call the /read-public endpoint

## Use the saved token from your config file
final_token = PubTwoAuthToken
#print(final_token)

## Save the query items as variables
ringgold_id = "3270" 
grid_id = "grid.16753.36" 
email_domain = "@northwestern.edu" 
organization_name = "Northwestern University"
organization_name2 = "Northwestern"

## Save the items in a list that you'd like returned
# fieldList = ["orcid", "email", "given-name","family-name", "given-and-family-names", "credit-name",
#              "other-name","current-institution-affiliation-name","past-institution-affiliation-name"]


## Set up query for request

## Unfortunately you cannot use the query_dict and urlencode to create the query string
## Urlencode creates an = equal to connect subject and key; and the ORCID API requires a : colon

# query_dict = {
#      "ringgold-org-id":ringgold_id,
#      "OR grid-org-id":grid_id, 
#      "OR email": email_domain,
#      "OR affiliation-org-name": organization_name,
#      "affiliation-org-name": organization_name2
# }

#query_str = urlencode(query_dict)
#print(query_str)

## Instead, you can use the urlencode to safely create the URL and then change = to : where needed. 

query_str = "ringgold-org-id:3270&OR+grid-org-id:grid.16753.36&OR+email:%40northwestern.edu&OR+affiliation-org-name:Northwestern+University&OR+affiliation-org-name:Northwestern"



## Set up headers and parameters for request

headers = {
    "Accept": "application/json",
    "Authorization": f'Bearer{final_token}' 
}


# parameters = {
#     #"access_token": final_token,
#    # "q" : query_str,
#     "field": ",".join(fieldList)
    
# }

#search_url = "https://api.sandbox.orcid.org/search/?"

search_response_list = []

def get_request(query_str, headers, search_response_list):
    
    #search_response_list = []
    start = 0
    rows = 1000

    while True: 
        
        try:
                
                ## Create URL for request
                print("------")
                #search_url = f"https://api.sandbox.orcid.org/v3.0/expanded-search/?q={query_str}&start={start}&rows={rows}" 
                search_url = f"http://pub.orcid.org/expanded-search/?q={query_str}&start={start}&rows={rows}"                 

                
                print("Requesting", search_url)
                
                ## URLs between sandbox and production differ
                ## Member Production API = https://api.orcid.org
                ## Public API = 'http://pub.orcid.org/'
                ## Sandbox API = https://sandbox.orcid.org
                
                ## Make the API post request 
                search_response = requests.get(search_url, headers = headers,  timeout=30) # params = parameters,
                #print(type(search_response))
                json_data = search_response.json()
                #print(json_data["expanded-result"][0]["orcid-id"])
                #print(type(json_data))
                
                ## Print responses
                print("Search Status", search_response.status_code)
                # print(search_response.url)
                print("Response text", search_response.text)
                # print(type(search_response.text))
                
                
                ## If the response was successful, no Exception will be raised
                search_response.raise_for_status()
                
                ## Need something that will give us an error, or else it will keep working
                print(json_data["expanded-result"][0]["orcid-id"]) 
                
               
               
                ## If we did find more ORCID data, add them to the list and move on to next offset
                search_response_list.extend(json_data["expanded-result"])
                
                
                
        except HTTPError as http_err:
                print(f'HTTP error occurred: {http_err}')  # Python 3.6
                break

        except Exception as err:
                print(f'Other error occurred: {err}. ', 'Exiting the loop!')  # Python 3.6
                print(search_response.url)
                break

        else:
                print('Success!')
        
        start = start + 1000

    return search_response_list


get_request(query_str, headers, search_response_list)

print(len(search_response_list))
print(type(search_response_list))

## Notes
## Please note the Public API is limited to 10,000 results. 
## Using the Member API (with a Member API token) does not limit the number of results.

## Resources
## https://info.orcid.org/faq/how-do-i-find-orcid-record-holders-at-my-institution/
## https://info.orcid.org/documentation/api-tutorials/api-tutorial-read-data-on-a-record/#easy-faq-2361
## https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/
## https://realpython.com/python-api/#request-and-response
## http://www.compciv.org/guides/python/how-tos/creating-proper-url-query-strings/
## https://stackoverflow.com/questions/17788445/constructing-requests-with-url-query-string-in-python
## Using paginated APIs (4 ways): https://www.youtube.com/watch?v=4Fdyft-ky0w



------
Requesting http://pub.orcid.org/expanded-search/?q=ringgold-org-id:3270&OR+grid-org-id:grid.16753.36&OR+email:%40northwestern.edu&OR+affiliation-org-name:Northwestern+University&OR+affiliation-org-name:Northwestern&start=0&rows=1000
Search Status 200
Response text {"expanded-result":null,"num-found":0}
Other error occurred: 'NoneType' object is not subscriptable.  Exiting the loop!
https://pub.orcid.org/v3.0/expanded-search/
0
<class 'list'>


In [None]:
## 3a. Check data as needed
print(len(search_response_list))
print(type(search_response_list))
print(search_response_list[1])
print(type(search_response_list[1]))

In [None]:
## Step 4: Convert response, which isa list of dictionaries into pandas dataframe

orcid_results_df = pd.DataFrame(search_response_list)
orcid_results_df.head()


In [None]:
## Step 5: Save dataframe to a CSV

with open(r"output/orcid_results_df.csv", 'w', encoding='utf-8') as file:
    orcid_results_df.to_csv(file, line_terminator='\n', index=True)
    file.close()

In [None]:
## Step 6: Save the query items as variables
orcid_list = orcid_results_df['orcid-id'].tolist()
print(type(orcid_list))
print(orcid_list)

In [None]:
## Step 7: Query Employment specific data and append to dataframe

## Use the saved token from your config file
final_token = TwoAuthToken
print(final_token)


## Save the query items as variables
orcid_list = orcid_results_df['orcid-id'].tolist()
print(type(orcid_list))
print(orcid_list)


## Set up query for request

# employ_str = f"0000-0002-2099-3156/employments"

## Set up headers and parameters for request

headers = {
    "Accept": "application/json",
    "Authorization": f'Bearer{final_token}' 
}

employment_response_list = []

## Create URL for request

for orcid in orcid_list:

    try: 
     
            print("------")
            employ_url = f"https://api.sandbox.orcid.org/v3.0/{orcid}/employments" 
            print("Requesting", employ_url)

            ## URLs between sandbox and production differ
            ## production API = https://api.orcid.org
            ## sandbox API = https://sandbox.orcid.org

            ## Make the API post request 
            employ_response = requests.get(employ_url, headers = headers,  timeout=30) # params = parameters,
            #print(type(employ_response))
           
            json_employ_data = employ_response.json()
            #print(type(json_employ_data))
            #print(json_employ_data)
            
            
    
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')  # Python 3.6
        break

    except Exception as err:
        print(f'Other error occurred: {err}. ', 'Exiting the loop!')  # Python 3.6
        print(search_response.url)
        break

    else:
        print('Success!')

    ## Append json_employ_data which is a dict to a list, to create a list of dicts. 
    employment_response_list.append(json_employ_data.copy())
    
    print("Again", employment_response_list)
    
print(employment_response_list[0])
## Resources
## https://info.orcid.org/documentation/integration-guide/orcid-record/#Employment

In [None]:
## 7a. Check data as needed
print(len(employment_response_list))
print(type(employment_response_list))
print(employment_response_list[1])
print(type(employment_response_list[1]))

In [None]:
## Step 8: Convert response, which is a list of dictionaries into pandas dataframe. Then unnest columns. 

orcid_employment_df = pd.DataFrame(employment_response_list)
remove_nest_df = orcid_employment_df.explode("affiliation-group")
remove_nest_df_2 = pd.concat([remove_nest_df.drop(['affiliation-group'], axis=1), remove_nest_df['affiliation-group'].apply(pd.Series)], axis=1, join="outer")
remove_nest_df_3 = pd.concat([remove_nest_df_2.drop(['summaries'], axis=1), remove_nest_df_2['summaries'].apply(pd.Series)], axis=1, join="outer")
remove_nest_df_4 = pd.concat([remove_nest_df_3.drop([0], axis=1), remove_nest_df_3[0].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_5 = pd.concat([remove_nest_df_4.drop(["employment-summary"], axis=1), remove_nest_df_4["employment-summary"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_6 = pd.concat([remove_nest_df_5.drop(["start-date"], axis=1), remove_nest_df_5["start-date"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_7 = pd.concat([remove_nest_df_6.drop(["end-date"], axis=1), remove_nest_df_6["end-date"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_8 = pd.concat([remove_nest_df_7.drop(["organization"], axis=1), remove_nest_df_7["organization"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)

remove_nest_df_8.head()

In [None]:
## Step 9: remove duplicate names from columns

## Identify the current set of columns
cols=pd.Series(remove_nest_df_8.columns)

## For each duplicate column name, add a dot number to make it unique
for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

## Reset the column names
remove_nest_df_8.columns=cols

remove_nest_df_8.head()


In [None]:
## Step 10: Further unpack nesting from json

remove_nest_df_9 = pd.concat([remove_nest_df_8.drop(["year"], axis=1), remove_nest_df_8["year"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_9.rename(columns = {'value':'start_year'}, inplace = True)
remove_nest_df_9.drop(columns=[0], inplace = True)

remove_nest_df_10 = pd.concat([remove_nest_df_9.drop(["year.1"], axis=1), remove_nest_df_9["year.1"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_10.rename(columns = {'value':'end_year'}, inplace = True)
remove_nest_df_10.drop(columns=[0], inplace = True)

remove_nest_df_11 = pd.concat([remove_nest_df_10.drop(["month"], axis=1), remove_nest_df_10["month"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_11.rename(columns = {'value':'start_month'}, inplace = True)
remove_nest_df_11.drop(columns=[0], inplace = True)

remove_nest_df_12 = pd.concat([remove_nest_df_11.drop(["month.1"], axis=1), remove_nest_df_11["month.1"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_12.rename(columns = {'value':'end_month'}, inplace = True)
remove_nest_df_12.drop(columns=[0], inplace = True)

remove_nest_df_13 = pd.concat([remove_nest_df_12.drop(["day"], axis=1), remove_nest_df_12["day"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_13.rename(columns = {'value':'start_day'}, inplace = True)
remove_nest_df_13.drop(columns=[0], inplace = True)

remove_nest_df_14 = pd.concat([remove_nest_df_13.drop(["day.1"], axis=1), remove_nest_df_13["day.1"].apply(pd.Series)], axis=1, join="outer") #.reset_index(drop=True, inplace=True)
remove_nest_df_14.rename(columns = {'value':'end_day'}, inplace = True)
remove_nest_df_14.drop(columns=[0], inplace = True)


remove_nest_df_14.head()

## References
## https://stackoverflow.com/questions/57629435/how-to-extract-values-from-column-of-dictionaries-in-pandas/57629530


In [None]:
## Step 11: Save dataframe to a CSV

with open(r"output/orcid_employment_results_df.csv", 'w', encoding='utf-8') as file:
    remove_nest_df_14.to_csv(file, line_terminator='\n', index=True)
    file.close()


## Pathway for editing an ORCID record using the ORCID Member API

### Pathway code note yet complete

Note, this pathway is similar to if a user were to give permission to Scopus to update their record. So it doesn't really translate well into a Jupyter Notebook becuase we're not a web application asking a user to give us permission. More so, this pathway is included here so you can see how it works. 

In [None]:
## Step 1: Authenticate with ORCID to receive a token

## https://info.orcid.org/documentation/integration-guide/getting-started-with-your-orcid-integration/#easy-faq-2569

## Set up URL and parameters
url = "https://sandbox.orcid.org/oauth/authorize?"

parameters = {
    "client_id": ClientID,
    "client_secret": Clientsecret,
    "response_type" : "code",
    "scope": "read-limited",
    "redirect_uri" : "https://api.sandbox.orcid.org"
    
}

             
## Make the API request 

response = requests.get(url, params = parameters) #headers = headers,

## Print responses
print(response.status_code)
print(response.url)
# print(response.text)

## Click on the URL and authenticate with your sandbox ORCID username and password. 
## Don't yet have sandbox ORCID username and password? 
## username must be: XXXXX@mailinator.com
## Need to reset your sandbox ORCID password?
## You can go to https://www.mailinator.com/ and enter your username in the textbox 
## This will take you to your public mailbox and where youwill find the password reset email

## Resources
## https://info.orcid.org/faq/how-does-3-legged-oauth-work/

In [None]:
## Step 2: Exchange the code received for a token. 
## The code expires upon use but the token is good for multiple uses and expires after 20 years

URL=https://sandbox.orcid.org/oauth/token
  HEADER: Accept: application/json
  HEADER: Content-Type: application/x-www-form-urlencoded
  METHOD: POST
  DATA: 
    client_id=[Your client ID]
    client_secret=[Your client secret]
    grant_type=authorization_code
    code=Six-digit code
    redirect_uri=[Your landing page]
    
## Resources
## https://info.orcid.org/faq/how-does-3-legged-oauth-work/


In [None]:
## Step 3: Use the token in API requests you make to read or update that record.



In [None]:
# https://pub.sandbox.orcid.org/v3.0/search/?q=family-name:Einstein+AND+keyword:Relativity&start=0&rows=10
#             https://pub.sandbox.orcid.org/v3.0/search/?q=email:*@orcid.org

# Sandbox: https://api.sandbox.orcid.org

# Production: https://api.orcid.org

# Method: GET
#   Content-type: text/csv
#   Authorization type: Bearer
#   Access token: [Stored access token]
#   URL:  https://api.orcid.org/v3.0/csv-search/?q=ringgold-org-id:385488&fl=orcid,given-names,family-name,current-institution-affiliation-name,'