In [1]:
# Dependencies

#!pip install dimcli plotly tqdm -U --quiet
#!pip install openpyxl -U

In [36]:
import numpy as np
import sympy as sp
import pickle
from IPython.display import HTML
import pandas as pd
import itertools
import dimcli
import openpyxl
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import json

In [3]:
# Set up credentials and connection.
# The API key is read from the environment so it is never stored in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be rotated.
import os

DIMENSIONS_API_KEY = os.environ.get("DIMENSIONS_API_KEY")
if not DIMENSIONS_API_KEY:
    raise RuntimeError(
        "Set the DIMENSIONS_API_KEY environment variable before running this notebook."
    )

dimcli.login(key=DIMENSIONS_API_KEY,
             endpoint="https://app.dimensions.ai")
dsl = dimcli.Dsl()

[2mDimcli - Dimensions API Client (v0.9.1)[0m
[2mConnected to: https://app.dimensions.ai - DSL v1.31[0m
[2mMethod: manual login[0m


#### Data Cleaning

In [43]:
# Load the ethics register and keep only the investigator and dataset columns.
researchers = pd.read_excel("./Data/ethics-data/MASTER Ethics Register.xlsx", sheet_name="Register")
researchers = researchers[['Principal Investigator', 'CINSW Datasets']]

# Drop rows where either column is missing.
researchers = researchers.dropna().reset_index(drop = True)

# Titles and annotations to strip from the investigator names.
# NOTE(review): the entries are joined without re.escape, so the parentheses in the
# last entry act as a regex group rather than literal characters.
remove_words = ['Dr', 'Professor', 'Mr', 'Associate', 'Prof', 'A/', 'Ms', '(contact Gemma Jacklyn)']
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, in which case nothing would be removed.
researchers['cleanedPI'] = researchers['Principal Investigator'].str.replace(pat, '', regex=True).str.strip()
# Keep one row per cleaned name (the last occurrence in the register).
researchers = researchers.drop_duplicates(subset='cleanedPI', keep="last").reset_index(drop=True)
#researchers = researchers.drop(['Principal Investigator'], axis=1)

researchers.head(2)

Unnamed: 0,Principal Investigator,CINSW Datasets,cleanedPI
0,Dr Carolyn Mazariego,NSW PCCR,Carolyn Mazariego
1,Professor Kees Van Gool,NSW CR,Kees Van Gool


#### API Functions

In [13]:
def the_H_function(sorted_citations_list, n=1):
    """Return the h-index for citation counts sorted in descending order.

    Walks the list while each citation count is >= its 1-based position and
    returns the last position that satisfied the condition.

    Parameters
    ----------
    sorted_citations_list : iterable of numbers, or None
        Citation counts, highest first. ``None`` or an empty list yields ``n - 1``.
    n : int, default 1
        Position assigned to the first element (kept for backward compatibility
        with the earlier recursive implementation).

    eg
    >>> the_H_function([10, 8, 5, 4, 3]) => 4
    >>> the_H_function([25, 8, 5, 3, 3]) => 3
    >>> the_H_function([1000, 20]) => 2
    """
    if sorted_citations_list is None:
        return n - 1

    # Iterative scan: avoids the recursion limit and the repeated list slicing
    # of the recursive version.
    rank = n
    for citations in sorted_citations_list:
        if not citations >= rank:  # also stops on NaN, as the comparison is False
            break
        rank += 1
    return rank - 1

In [16]:
def queryDimensions(lookupList = None, queryPrefix = None, querySuffix = None, returnConceptsFrame = False):
    """Run one Dimensions DSL query per lookup entry and combine the results.

    Each query is built as ``queryPrefix + json.dumps(entry) + querySuffix`` and
    sent through the module-level ``dsl`` client.

    Parameters
    ----------
    lookupList : iterable, optional
        Values to look up (one query each). ``None`` is treated as empty.
    queryPrefix, querySuffix : str
        DSL text placed before and after the JSON-quoted entry.
    returnConceptsFrame : bool, default False
        If True, use ``as_dataframe_concepts()`` on each result; a query or
        conversion that raises is treated as "no results" for that entry.
        If False, use ``as_dataframe()`` and let errors propagate.

    Returns
    -------
    dict
        ``"dataFromAPI"``: the per-entry frames concatenated, each row tagged
        with an ``entryFromLookupList`` column (an empty DataFrame when nothing
        was found); ``"noResults"``: entries that produced no rows.
    """
    if lookupList is None:
        lookupList = []

    noResults = []
    frames = []
    for entry in lookupList:
        query = queryPrefix + json.dumps(entry) + querySuffix
        if returnConceptsFrame:
            try:
                frame = dsl.query(query).as_dataframe_concepts()
            except Exception:
                # Best-effort: record the entry as having no results instead of
                # silently reusing the previous iteration's frame.
                frame = None
        else:
            frame = dsl.query(query).as_dataframe()

        if frame is None or frame.empty:
            noResults.append(entry)
            continue

        frame['entryFromLookupList'] = entry
        frames.append(frame)

    # Concatenate once, after the loop; fall back to an empty frame so the
    # return value is always well-defined.
    combined = pd.concat(frames) if frames else pd.DataFrame()
    return {"dataFromAPI": combined, "noResults": noResults}

#### Queries

In [37]:
# DSL fragments. queryDimensions builds each query as
# prefix + JSON-quoted lookup value + suffix, so every suffix starts with a space.
queryPrefixForPublications = "search publications where authors ="
querySuffixForPublications = " return publications[id+type+volume+year+issue+title+journal+authors + times_cited] limit 1000"

# Same search, additionally returning concept scores.
queryPrefixForPublicationsWithConcepts = "search publications where authors ="
querySuffixForPublicationsWithConcepts = " return publications[id+type+volume+year+issue+title+journal+authors + times_cited + concepts_scores] limit 1000"

# Citation counts only, sorted by times_cited.
# Leading space added to the suffix so the quoted name and "return" stay separate tokens.
queryPrefixForHIndex = "search publications where authors =  "
querySuffixForHIndex = " return publications[times_cited] sort by times_cited limit 1000"

In [39]:
# Fetch publication records for every cleaned investigator name.
publicationData = queryDimensions(
    lookupList=list(researchers["cleanedPI"]),
    queryPrefix=queryPrefixForPublications,
    querySuffix=querySuffixForPublications,
    returnConceptsFrame=False,
)

Returned Publications: 4 (total = 4)
[2mTime: 1.12s[0m
Returned Publications: 39 (total = 39)
[2mTime: 1.08s[0m
Returned Publications: 10 (total = 10)
[2mTime: 1.04s[0m
Returned Publications: 140 (total = 140)
[2mTime: 1.81s[0m
Returned Publications: 186 (total = 186)
[2mTime: 1.72s[0m
Returned Publications: 88 (total = 88)
[2mTime: 1.43s[0m
Returned Publications: 148 (total = 148)
[2mTime: 1.64s[0m
Returned Publications: 115 (total = 115)
[2mTime: 1.69s[0m
Returned Publications: 3 (total = 3)
[2mTime: 1.16s[0m
Returned Publications: 68 (total = 68)
[2mTime: 1.37s[0m
Returned Publications: 65 (total = 65)
[2mTime: 1.35s[0m
Returned Publications: 275 (total = 275)
[2mTime: 1.99s[0m
Returned Publications: 0
[2mTime: 1.07s[0m
Returned Publications: 86 (total = 86)
[2mTime: 1.36s[0m
Returned Publications: 325 (total = 325)
[2mTime: 3.45s[0m
Returned Publications: 2 (total = 2)
[2mTime: 1.05s[0m
Returned Publications: 32 (total = 32)
[2mTime: 1.31s[0m
Retur

In [41]:
#publicationData['dataFromAPI']

In [44]:
#publicationData['dataFromAPI'].to_csv("./Data/publicationsAndCitations.csv")
# Persist the cleaned researcher table as CSV.
researchers.to_csv(path_or_buf="./Data/ResearcherNamesCleaned.csv")

#### Sample Code

Misc functions for H-Index

In [15]:
# Researcher ID passed to get_pubs_citations in the sample H-index lookup below.
RESEARCHER = "ur.01357111535.49"

# NOTE(review): this repeats the definition in the "API Functions" section;
# keep the two in sync or drop one of them.
def the_H_function(sorted_citations_list, n=1):
    """Return the h-index for citation counts sorted in descending order.

    Walks the list while each citation count is >= its 1-based position and
    returns the last position that satisfied the condition.

    Parameters
    ----------
    sorted_citations_list : iterable of numbers, or None
        Citation counts, highest first. ``None`` or an empty list yields ``n - 1``.
    n : int, default 1
        Position assigned to the first element (kept for backward compatibility
        with the earlier recursive implementation).

    eg
    >>> the_H_function([10, 8, 5, 4, 3]) => 4
    >>> the_H_function([25, 8, 5, 3, 3]) => 3
    >>> the_H_function([1000, 20]) => 2
    """
    if sorted_citations_list is None:
        return n - 1

    # Iterative scan: avoids the recursion limit and the repeated list slicing
    # of the recursive version.
    rank = n
    for citations in sorted_citations_list:
        if not citations >= rank:  # also stops on NaN, as the comparison is False
            break
        rank += 1
    return rank - 1

def get_pubs_citations(researcher_id):
    """Return the citation counts of a researcher's publications, highest first.

    Queries the module-level ``dsl`` client for up to 1000 publications whose
    ``researchers.id`` equals ``researcher_id`` and returns their
    ``times_cited`` values as a list, with missing values replaced by 0.
    Returns an empty list when the query yields no rows.
    """
    q = """search publications where researchers.id = "{}" return publications[times_cited] sort by times_cited limit 1000"""
    pubs = dsl.query(q.format(researcher_id))
    frame = pubs.as_dataframe()

    # An empty result has no 'times_cited' column to index.
    if frame is None or frame.empty or 'times_cited' not in frame.columns:
        return []

    citations = list(frame['times_cited'].fillna(0))
    # the_H_function needs descending order; sort locally so that does not
    # depend on how the API orders rows (e.g. where missing counts are placed).
    citations.sort(reverse=True)
    return citations


# Look up the sample researcher's citation counts, then report the h-index.
sample_citations = get_pubs_citations(RESEARCHER)
sample_h_index = the_H_function(sample_citations)
print("H_index is:", sample_h_index)

Returned Publications: 280 (total = 280)
[2mTime: 1.23s[0m
H_index is: 61
