In [2]:
# Dependencies

#!pip install dimcli plotly tqdm -U --quiet
#!pip install openpyxl -U

In [3]:
import numpy as np
import sympy as sp
import pickle
from IPython.display import HTML
import pandas as pd
import itertools
import dimcli
import openpyxl
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import json


In [15]:
# Set up credentials and connection
dimcli.login(key="F86D0EE87BF7475E95D9A47574175BC3",
             endpoint="https://app.dimensions.ai")
dsl = dimcli.Dsl()

[2mDimcli - Dimensions API Client (v0.9.2)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v1.31[0m
[2mMethod: manual login[0m


<b>Overview</b>

There is support for magic functions such as:

In [16]:
%%dsldf

search publications
where researchers.id = "ur.013514345521.07"
return publications[doi + researchers]
limit 1

Returned Publications: 1 (total = 21)
[2mTime: 1.09s[0m


Unnamed: 0,doi,researchers
0,10.1007/s00216-021-03510-5,"[{'id': 'ur.01124722306.57', 'research_orgs': ..."


But it is simpler to just pass the dsl connection a string that the API can deal with. Note it expects JSON when creating the query. The workflow is just to create a valid query as a string, then pass it to the API:

In [17]:
q = """search publications for "graphene"
            where year=2019
       return publications[id+title+year+concepts_scores] limit 100"""

concepts = dsl.query(q).as_dataframe_concepts()
concepts.head(1)

Returned Publications: 100 (total = 115799)
[2mTime: 1.99s[0m


Unnamed: 0,title,year,id,concepts_count,concept,score,frequency,score_avg
0,Study of mechanochemistry of carbon nanotube u...,2019,pub.1129328198,6,study,0.088,30,0.20953


A more standard use case is create the query as different parts, so it is possible to drop in some kind of variable:

In [18]:
GRIDID = "grid.427695.b"
tot = dsl.query(f"""search publications where research_orgs.id="{GRIDID}" return publications limit 1""", verbose=False).count_total
print(f"{GRIDID} has a total of {tot} publications in Dimensions")

grid.427695.b has a total of 290 publications in Dimensions


It supports returning results as data frames (standard return object is JSON)

In [19]:
df = dsl.query(f"""search publications where research_orgs.id="{GRIDID}" return year limit 100""").as_dataframe()
df.rename(columns={"id": "year"}, inplace=True)
df.head(1)

Returned Year: 18
[2mTime: 1.10s[0m


Unnamed: 0,year,count
0,2014,27


A good workflow seems to be using the query along with the data frame to get what is needed

In [20]:
data = dsl.query(f"""search publications
                where research_orgs.id="{GRIDID}"
                return publications[doi+title+times_cited+category_for+journal]
                sort by times_cited limit 1000""")
df = data.as_dataframe()
df.head(1)[['title', 'doi', 'times_cited', 'journal.title']]


Returned Publications: 290 (total = 290)
[2mTime: 1.43s[0m


Unnamed: 0,title,doi,times_cited,journal.title
0,Global surveillance of cancer survival 1995–20...,10.1016/s0140-6736(14)62038-9,1427,The Lancet


Now to get to what is needed in SRI. First import the CI data relating to fellowships:

In [21]:
authors = pd.read_excel("./Data/ci-researcher-data/Premiers_Fellowship_Dimensions_Full.xlsx")
authors.head(1)

Unnamed: 0,researcher_ids,Program,ProjectIdentifier,Title,ContactFullName,PrimaryOrganisationName,2020_TotalPubs,LTD_TotalPubs,2020_TotalCitations,LTD_TotalCitations,h-index
0,['ur.0731267322.21'],Career Development Fellowship - CDF,2018/CDF001,The Clincial Translation of NSW Invented Tumou...,Ricky O'Brien,The University of Sydney,,,,,


Dimensions expects JSON, so some cleaning is needed:

In [11]:
authors.researcher_ids = authors.researcher_ids.str.replace(r"\['", "")
authors.researcher_ids = authors.researcher_ids.str.replace(r"\']", "")
authors.researcher_ids = authors.researcher_ids.str.replace(r"\'", " ")
authorIDs = authors.researcher_ids.to_list()
authorIDs = [str(x) for x in authorIDs]
authorIDs = [x for x in authorIDs if x != 'nan']

Create a query prefix and suffix string and then run a function to evaulate the query with each authorID

In [38]:
queryPrefix = "search publications where researchers.id ="
querySuffix = " return publications[id+type+volume+year+issue+title+journal+ times_cited] limit 1000"

Single query: 

In [39]:
r = dsl.query(queryPrefix + json.dumps("ur.015600604603.79") + querySuffix).as_dataframe()

Returned Publications: 0
[2mTime: 1.09s[0m


And a function that takes the query. 

In [40]:
def queryDimensions(researcherIDs = None):
    noResults = []
    dataFromAPI = []
    for i in range(len(researcherIDs)):
        r = dsl.query(queryPrefix + json.dumps(researcherIDs[i]) + querySuffix).as_dataframe()
        if (r.empty):
            noResults.append(researcherIDs[i])
        else:
            r['researcherID'] = researcherIDs[i]
            dataFromAPI.append(r)
   
        dataFromAPIasSingleDataFrame = pd.concat(dataFromAPI)
        
    return({"dataFromAPI": dataFromAPIasSingleDataFrame, "noResults": noResults})

In [41]:
results = queryDimensions(authorIDs)

Returned Publications: 114 (total = 114)
[2mTime: 1.20s[0m
Returned Publications: 52 (total = 52)
[2mTime: 1.11s[0m
Returned Publications: 0
[2mTime: 1.10s[0m
Returned Publications: 82 (total = 82)
[2mTime: 1.20s[0m
Returned Publications: 98 (total = 98)
[2mTime: 1.23s[0m
Returned Publications: 94 (total = 94)
[2mTime: 1.19s[0m
Returned Publications: 93 (total = 93)
[2mTime: 1.15s[0m
Returned Publications: 38 (total = 38)
[2mTime: 1.11s[0m
Returned Publications: 94 (total = 94)
[2mTime: 1.14s[0m
Returned Publications: 57 (total = 57)
[2mTime: 1.12s[0m
Returned Publications: 79 (total = 79)
[2mTime: 1.11s[0m
Returned Publications: 32 (total = 32)
[2mTime: 1.10s[0m
Returned Publications: 57 (total = 57)
[2mTime: 1.17s[0m
Returned Publications: 38 (total = 38)
[2mTime: 1.14s[0m
Returned Publications: 28 (total = 28)
[2mTime: 1.10s[0m
Returned Publications: 77 (total = 77)
[2mTime: 1.13s[0m
Returned Publications: 45 (total = 45)
[2mTime: 1.13s[0m
Returned 

Results and errors  are returned: 

In [41]:
results['dataFromAPI']
results['noResults']

['ur.015600604603.79', ' ur.01121702557.52']

In [24]:
results['dataFromAPI']

Unnamed: 0,id,title,type,year,times_cited,journal.id,journal.title,volume,issue,researcherID
0,pub.1138524385,A real-time IGRT method using a Kalman filter ...,article,2021,0,jour.1018310,Physics in Medicine and Biology,,,ur.0731267322.21
1,pub.1138392446,The first-in-human implementation of adaptive ...,article,2021,0,jour.1094728,Radiotherapy and Oncology,161,,ur.0731267322.21
2,pub.1137351609,The adaptation and investigation of cone-beam ...,article,2021,0,jour.1018310,Physics in Medicine and Biology,66,10,ur.0731267322.21
3,pub.1137857401,First experimental evaluation of multi-target ...,article,2021,0,jour.1094728,Radiotherapy and Oncology,160,,ur.0731267322.21
4,pub.1137716857,Study protocol of the LARK (TROG 17.03) clinic...,article,2021,1,jour.1024632,BMC Cancer,21,1,ur.0731267322.21
...,...,...,...,...,...,...,...,...,...,...
53,pub.1006528517,Electrode-Skin contact impedance: In vivo meas...,article,2013,6,jour.1043366,Journal of Physics Conference Series,434,1,ur.01143543411.00
54,pub.1014893136,Pulmonary Embolism Detection with Electrical I...,article,2013,2,jour.1023705,Heart Lung and Circulation,22,,ur.01143543411.00
55,pub.1027607605,A Computer Simulation Study on the Applicabili...,chapter,2013,0,,,39,,ur.01143543411.00
56,pub.1056753649,Active electrode design suitable for simultane...,article,2012,7,jour.1126662,Electronics Letters,48,25,ur.01143543411.00


In [44]:
x.to_csv("data.csv")