In [213]:
# Dependencies

#!pip install dimcli plotly tqdm -U --quiet
#!pip install openpyxl -U

In [94]:
import numpy as np
import sympy as sp
import pickle
from IPython.display import HTML
import pandas as pd
import itertools
import dimcli
import openpyxl
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)



[2mDimcli - Dimensions API Client (v0.9.1)[0m
[2mConnected to: https://app.dimensions.ai - DSL v1.31[0m
[2mMethod: manual login[0m


In [215]:
# Set up credentials and connection
dimcli.login(key="F86D0EE87BF7475E95D9A47574175BC3",
             endpoint="https://app.dimensions.ai")
dsl = dimcli.Dsl()

[2mDimcli - Dimensions API Client (v0.9.1)[0m
[2mConnected to: https://app.dimensions.ai - DSL v1.31[0m
[2mMethod: manual login[0m


<b>Overview</b>

There is support for magic functions such as:

In [217]:
%%dsldf

search publications
where researchers.id = "ur.013514345521.07"
return publications[doi + researchers]
limit 1

Returned Publications: 1 (total = 21)
[2mTime: 1.16s[0m


Unnamed: 0,researchers,doi
0,"[{'id': 'ur.01124722306.57', 'research_orgs': ...",10.1007/s00216-021-03510-5


It is simpler to just pass the dsl connection a string that the API can deal with. Note it expects JSON when creating the query. The workflow is just to create a valid query as a string, then pass it to the API:

In [226]:
q = """search publications for "graphene"
            where year=2019
       return publications[id+title+year+concepts_scores] limit 100"""

concepts = dsl.query(q).as_dataframe_concepts()
concepts.head(1)

Returned Publications: 100 (total = 115753)
[2mTime: 2.18s[0m


Unnamed: 0,id,title,year,concepts_count,concept,score,frequency,score_avg
0,pub.1129328198,Study of mechanochemistry of carbon nanotube u...,2019,6,study,0.084,30,0.2094


A more standard use case is create the query as different parts, so it is possible to drop in some kind of variable:

In [314]:
GRIDID = "grid.427695.b"
tot = dsl.query(f"""search publications where research_orgs.id="{GRIDID}" return publications limit 1""", verbose=False).count_total
print(f"{GRIDID} has a total of {tot} publications in Dimensions")

grid.427695.b has a total of 292 publications in Dimensions


It supports returning results as data frames (standard return object is JSON)

In [228]:
df = dsl.query(f"""search publications where research_orgs.id="{GRIDID}" return year limit 100""").as_dataframe()
df.rename(columns={"id": "year"}, inplace=True)
df.head(1)

Returned Year: 18
[2mTime: 1.13s[0m


Unnamed: 0,year,count
0,2014,28


A good workflow seems to be using the query along with the data frame to get what is needed

In [315]:
data = dsl.query(f"""search publications
                where research_orgs.id="{GRIDID}"
                return publications[doi+title+times_cited+category_for+journal]
                sort by times_cited limit 1000""")
df = data.as_dataframe()
df.head(1)[['title', 'doi', 'times_cited', 'journal.title']]


Returned Publications: 292 (total = 292)
[2mTime: 1.41s[0m


Unnamed: 0,title,doi,times_cited,journal.title
0,Global surveillance of cancer survival 1995–20...,10.1016/s0140-6736(14)62038-9,1408,The Lancet


Now for some work. Look through CI data to check applications for fellowships:

In [317]:
authors = pd.read_excel("./Data/ci-researcher-data/Premiers_Fellowship_Dimensions_Full.xlsx")
authors.head(1)

Unnamed: 0,researcher_ids,Program,ProjectIdentifier,Title,ContactFullName,PrimaryOrganisationName,2020_TotalPubs,LTD_TotalPubs,2020_TotalCitations,LTD_TotalCitations,h-index
0,['ur.0731267322.21'],Career Development Fellowship - CDF,2018/CDF001,The Clincial Translation of NSW Invented Tumou...,Ricky O'Brien,The University of Sydney,,,,,


Dimensions expects JSON, so some cleaning is needed:

In [231]:
authors.researcher_ids = authors.researcher_ids.str.replace(r"\['", "")
authors.researcher_ids = authors.researcher_ids.str.replace(r"\']", "")
authors.researcher_ids = authors.researcher_ids.str.replace(r"\'", " ")
authorIDs = authors.researcher_ids.to_list()
authorIDs = [str(x) for x in authorIDs]
authorIDs = [x for x in authorIDs if x != 'nan']

Create a query string and then run a function to evaulate the query with each authorID

In [305]:
queryPrefix = "search publications where researchers.id ="
querySuffix = " return publications[id+type+volume+year+issue+title+journal+authors + times_cited] limit 300"

In [306]:
def queryDimensions(researcherIDs = None):
    noResults = []
    dataFromAPI = []
    for i in range(len(researcherIDs)):
        r = dsl.query(queryPrefix + json.dumps(researcherIDs[i]) + querySuffix).as_dataframe()
        if (r.empty):
            noResults.append(researcherIDs[i])
        else:
            r['ressearcherID'] = researcherIDs[i]
            dataFromAPI.append(r)
   
        dataFromAPIasSingleDataFrame = pd.concat(dataFromAPI)
        
    return({"dataFromAPI": dataFromAPIasSingleDataFrame, "noResults": noResults})

In [311]:
#results = queryDimensions(authorIDs)

Results and errors  are returned: 

In [316]:
results['dataFromAPI']
results['noResults']

[' ur.01121702557.52']