# Enrichment of the CCT customers spreadsheet with GRID and Dimensions data

2019-08-29

In [81]:
# standard library
import json
import os
import time
from urllib.parse import quote_plus
from urllib.request import Request, urlopen # Python 3 
# data analysis libraries
import pandas as pd
from pandas.io.json import json_normalize
from tqdm import tqdm_notebook as tqdm
# Dimensions API query helper
import dimcli
from dimcli.shortcuts import chunks_of, dslquery
# Log in to the Dimensions instance registered under the name "test"
# (the cell output reports the connection method as a dsl.ini file).
dimcli.login(instance="test")
# Query client used further down by the cells that call dsl.query_iterative(...)
dsl = dimcli.Dsl()

DimCli v0.5.4 - Succesfully connected to <https://integration.ds-metrics.com> (method: dsl.ini file)


## 1. From Org Names to GRID IDs

In [37]:
# SECURITY: an API key should not be stored in the notebook. Set the
# GRID_API_KEY environment variable instead; the literal below is kept only as
# a backward-compatible fallback and should be rotated and removed.
APIKEY = os.environ.get("GRID_API_KEY", "1c68ca9301dd9e60748780759bf10e48eb7155cb45527fe7")

def get_grid(stringa):
    """Match a free-text organisation name against the GRID `unstructured_match` API.

    Parameters
    ----------
    stringa : str
        Organisation name / affiliation string. Runs of whitespace are
        collapsed before the string is sent.

    Returns
    -------
    object
        The decoded JSON payload of the response. If the body is not valid
        UTF-8 JSON, a message is printed and the raw response bytes are
        returned instead.

    Raises
    ------
    urllib.error.URLError
        Propagated from `urlopen` when the request itself fails.
    """
    # Percent-encode the value: characters such as "&", "'" or non-ASCII
    # letters would otherwise corrupt the query string (an unescaped "&"
    # silently truncates the affiliation at that point).
    s = quote_plus(" ".join(stringa.split()))
    req = Request('https://grid.digital-science.com/api/unstructured_match?affiliation=' + s)
    req.add_header('api_key', APIKEY)
    req.add_header('accept', 'application/json; version=2')
    # Context manager closes the HTTP response even if read() fails.
    with urlopen(req) as response:
        res = response.read()
    try:
        # Decode UTF-8 bytes to Unicode
        return json.loads(res.decode("utf-8"))
    except ValueError:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError
        # subclasses; anything else (e.g. KeyboardInterrupt) must propagate.
        print("Error decoding JSON - returning raw bytes array")
        return res

In [51]:
# Load the organisation names: one entry per line, line endings removed.
with open("orgs-cct.txt") as names_file:
    mylist = names_file.read().splitlines()

In [55]:
# Query the GRID API for every organisation name and keep the ID of the first
# match (None when there is no usable match), then store the pairs in a dataframe.
d = []
for x in mylist:
    print("====", x)
    grid_data = get_grid(x)
    try:
        # An empty/falsy payload means "no match"; otherwise take the first hit.
        gr = grid_data[0]['institute']['id'] if grid_data else None
    except (TypeError, KeyError, IndexError):
        # get_grid returns raw bytes when the body is not valid JSON, and the
        # API may answer with a payload of a different shape; treat both as
        # "no match" instead of aborting the whole loop.
        gr = None
    print(gr)
    d.append({'name' : x, 'grid' : gr})

df = pd.DataFrame(d)
    

==== African Academy of Sciences (AAS)
grid.463020.3
==== Alzheimer's Research UK (ALZRUK)
grid.453466.6
==== Alzheimer's Society (AlzSoc)
grid.432249.a
==== Asthma UK (AUK)
grid.453156.0
==== Bloodwise (BW)
grid.453095.e
==== Breast Cancer Now (BCN)
grid.458394.7
==== British Heart Foundation (BHF)
grid.452924.c
==== Burroughs Wellcome Fund (BWF)
grid.427464.7
==== Caribbean Development Bank (CDB)
None
==== NIHR Central Commissioning Facility (CCF)
grid.473755.7
==== Debra International (DEBRA)
None
==== Diabetes UK (DUK)
grid.453048.e
==== DSA QAG
None
==== Education Endowment Foundation (EEF)
grid.484108.1
==== European & Developing Countries Clinical Trials Partnership (EDCTP)
None
==== Fight 4 Sight (F4S)
None
==== Fondation Botnar (FB)
None
==== Fondation contre le Cancer (FAC)
grid.453397.f
==== Fondazione Telethon (FT)
grid.11492.3f
==== Great Ormond Street Hospital (GOSH)
grid.420468.c
==== Harpur Trust
None
==== Health Research Board (HRB)
grid.413895.2
==== Holywood Trust
No

In [57]:
# Save the first-pass name/GRID table (the next section loads a manually edited CSV).
df.to_csv("orgs-grid-first-pass.csv")

## 2. From GRID IDs to grants

In [65]:
# let's reuse the manually edited CSV
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the documented replacement (from_csv also
# defaulted to parse_dates=True, which is not needed for this row-number index).
df2 = pd.read_csv("orgs-grid-manual-edit.csv", index_col=0)
df2.head(10)

  


Unnamed: 0,grid,name
0,grid.463020.3,African Academy of Sciences (AAS)
1,grid.453466.6,Alzheimer's Research UK (ALZRUK)
2,grid.432249.a,Alzheimer's Society (AlzSoc)
3,grid.453156.0,Asthma UK (AUK)
4,grid.453095.e,Bloodwise (BW)
5,grid.458394.7,Breast Cancer Now (BCN)
6,grid.452924.c,British Heart Foundation (BHF)
7,grid.427464.7,Burroughs Wellcome Fund (BWF)
8,,Caribbean Development Bank (CDB)
9,grid.473755.7,NIHR Central Commissioning Facility (CCF)


Sample query

In [75]:
# Sanity check on one funder (grid.453466.6, listed as Alzheimer's Research UK in
# the table above): show the total number of grants reported by the query stats.
dslquery("""search grants where funders.id="grid.453466.6" return grants""").stats['total_count']

Returned Grants: 20 (total = 239)


239

In [83]:
def get_grants(grids=None):
    """Return the total number of grants funded by each organisation.

    NOTE: a later cell of this notebook defines a different
    `get_grants(gridid)`; once that cell has run, re-run this one before
    calling this version again.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to look up; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        q = """search grants where funders.id="{}" return grants""".format(g)
        counts.append(dslquery(q).stats['total_count'])
        # Pause after each real API call (presumably to respect rate limits).
        time.sleep(1)
    return counts

In [84]:
# Add a column with the total number of grants per funder
df2['grants'] = get_grants()

Returned Grants: 0
Returned Grants: 20 (total = 239)
Returned Grants: 20 (total = 474)
Returned Grants: 20 (total = 564)
Returned Grants: 20 (total = 2731)
Returned Grants: 20 (total = 677)
Returned Grants: 20 (total = 3724)
Returned Grants: 0
Returned Grants: 20 (total = 1402)
Returned Grants: 0
Returned Grants: 20 (total = 398)
Returned Grants: 0
Returned Grants: 0
Returned Grants: 20 (total = 155)
Returned Grants: 0
Returned Grants: 20 (total = 2261)
Returned Grants: 0
Returned Grants: 0
Returned Grants: 0
Returned Grants: 0
Returned Grants: 20 (total = 51)
Returned Grants: 0
Returned Grants: 20 (total = 236)
Returned Grants: 0
Returned Grants: 0
Returned Grants: 11 (total = 11)
Returned Grants: 20 (total = 57)
Returned Grants: 20 (total = 236)
Returned Grants: 0
Returned Grants: 20 (total = 837)
Returned Grants: 0
Returned Grants: 20 (total = 167)
Returned Grants: 0
Returned Grants: 0
Returned Grants: 0
Returned Grants: 20 (total = 52)
Returned Grants: 20 (total = 1863)
Returned Gr

In [86]:
# Checkpoint: save the table including the new `grants` column.
df2.to_csv("orgs-grid-2-grants.csv")

## 3. From GRID IDs to publications (by funder)

Sample query

In [88]:
# Sanity check on one funder: total number of publications whose funders include grid.453466.6.
dslquery("""search publications where funders.id="grid.453466.6" return publications""").stats['total_count']

Returned Publications: 20 (total = 1881)


1881

In [89]:
def get_pubs(grids=None):
    """Return the total number of publications per funder.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to look up; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        q = """search publications where funders.id="{}" return publications""".format(g)
        counts.append(dslquery(q).stats['total_count'])
        # Pause after each real API call (presumably to respect rate limits).
        time.sleep(1)
    return counts

In [90]:
# Add a column with the total number of publications per funder
df2['pubs'] = get_pubs()

Returned Publications: 0
Returned Publications: 20 (total = 1881)
Returned Publications: 20 (total = 1385)
Returned Publications: 20 (total = 543)
Returned Publications: 20 (total = 2699)
Returned Publications: 20 (total = 1326)
Returned Publications: 20 (total = 24951)
Returned Publications: 20 (total = 8338)
Returned Publications: 20 (total = 1925)
Returned Publications: 0
Returned Publications: 20 (total = 2374)
Returned Publications: 0
Returned Publications: 0
Returned Publications: 20 (total = 638)
Returned Publications: 20 (total = 285)
Returned Publications: 20 (total = 14680)
Returned Publications: 20 (total = 1561)
Returned Publications: 20 (total = 5086)
Returned Publications: 20 (total = 42828)
Returned Publications: 0
Returned Publications: 20 (total = 438)
Returned Publications: 20 (total = 820)
Returned Publications: 20 (total = 572)
Returned Publications: 0
Returned Publications: 20 (total = 19654)
Returned Publications: 0
Returned Publications: 20 (total = 378)
Returned

In [91]:
# Checkpoint: save the table including the new `pubs` column.
df2.to_csv("orgs-grid-3-pubs.csv")

## 4. From GRID IDs to patents 

Sample query

In [92]:
# Sanity check on one funder: total number of patents whose funders include grid.453466.6.
dslquery("""search patents where funders.id="grid.453466.6" return patents""").stats['total_count']

Returned Patents: 0


0

In [93]:
def get_patents(grids=None):
    """Return the total number of patents per funder.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to look up; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        q = """search patents where funders.id="{}" return patents""".format(g)
        counts.append(dslquery(q).stats['total_count'])
        # Pause after each real API call (presumably to respect rate limits).
        time.sleep(1)
    return counts

In [94]:
# Add a column with the total number of patents per funder
df2['patents'] = get_patents()

Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0
Returned Patents: 0


In [95]:
# Checkpoint: save the table including the new `patents` column.
df2.to_csv("orgs-grid-4-patents.csv")

## 5. From GRID IDs to Clinical Trials

Sample query

In [97]:
# Sanity check on one funder: total number of clinical trials whose funders include grid.453466.6.
dslquery("""search clinical_trials where funders.id="grid.453466.6" return clinical_trials""").stats['total_count']

Returned Clinical_trials: 4 (total = 4)


4

In [98]:
def get_cltrials(grids=None):
    """Return the total number of clinical trials per funder.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to look up; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        q = """search clinical_trials where funders.id="{}" return clinical_trials""".format(g)
        counts.append(dslquery(q).stats['total_count'])
        # Pause after each real API call (presumably to respect rate limits).
        time.sleep(1)
    return counts

In [99]:
# Add a column with the total number of clinical trials per funder
df2['cltrials'] = get_cltrials()

Returned Clinical_trials: 0
Returned Clinical_trials: 4 (total = 4)
Returned Clinical_trials: 20 (total = 26)
Returned Clinical_trials: 20 (total = 24)
Returned Clinical_trials: 20 (total = 48)
Returned Clinical_trials: 11 (total = 11)
Returned Clinical_trials: 20 (total = 253)
Returned Clinical_trials: 0
Returned Clinical_trials: 20 (total = 83)
Returned Clinical_trials: 0
Returned Clinical_trials: 20 (total = 68)
Returned Clinical_trials: 0
Returned Clinical_trials: 0
Returned Clinical_trials: 6 (total = 6)
Returned Clinical_trials: 5 (total = 5)
Returned Clinical_trials: 2 (total = 2)
Returned Clinical_trials: 20 (total = 247)
Returned Clinical_trials: 20 (total = 82)
Returned Clinical_trials: 4 (total = 4)
Returned Clinical_trials: 0
Returned Clinical_trials: 4 (total = 4)
Returned Clinical_trials: 2 (total = 2)
Returned Clinical_trials: 20 (total = 24)
Returned Clinical_trials: 0
Returned Clinical_trials: 1 (total = 1)
Returned Clinical_trials: 0
Returned Clinical_trials: 0
Return

In [100]:
# Checkpoint: save the table including the new `cltrials` column.
df2.to_csv("orgs-grid-5-cltrials.csv")

## 6. From Grants to Publications linked to Grants 

Sample query

In [96]:
# Sample: total number of publications whose supporting_grant_ids include one specific grant.
dslquery("""search publications where supporting_grant_ids in ["grant.8528575"] return publications""").stats['total_count']

Returned Publications: 0


0

In [120]:
def get_grants(gridid):
    """Return the IDs of every grant whose funder is `gridid`.

    Runs an iterative DSL query and collects the `id` field of each record
    in the result's `grants` list.

    NOTE: this reuses the name of the zero-argument `get_grants()` defined
    in section 2 of this notebook and replaces it in the kernel namespace.
    """
    query = """search grants where funders.id="{}" return grants[id]""".format(gridid)
    grant_ids = []
    for record in dsl.query_iterative(query).grants:
        grant_ids.append(record['id'])
    return grant_ids


def get_pubs_linked_to_grants(gridid, chunk_size=400):
    """Count the distinct publications supported by the grants of one funder.

    The funder's grant IDs are fetched with `get_grants`, split into chunks,
    and each chunk is used in a `supporting_grant_ids in [...]` query; the
    publication IDs are pooled in a set so that a publication linked to
    several grants is counted once.

    Parameters
    ----------
    gridid : str
        GRID ID of the funder.
    chunk_size : int, optional
        Number of grant IDs sent per query (default 400, the value previously
        hard-coded here).

    Returns
    -------
    int
        Number of distinct publication IDs found; 0 when the funder has no grants.
    """
    grants = get_grants(gridid)
    q = """search publications where supporting_grant_ids in {} return publications[id]"""
    pubs = set()
    for chunk in chunks_of(grants, chunk_size):
        # json.dumps renders the chunk as a double-quoted list literal for the query
        res = dsl.query_iterative(q.format(json.dumps(chunk)))
        pubs.update(publication['id'] for publication in res.publications)
    return len(pubs)

def looper_pubs_linked_to_grants(grids=None):
    """Run `get_pubs_linked_to_grants` for each funder and collect the counts.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to process; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        counts.append(get_pubs_linked_to_grants(g))
        # Pause between funders (presumably to respect API rate limits).
        time.sleep(1)
    return counts


In [121]:
# Add a column with the number of distinct publications linked to each funder's grants
df2['pubs_linked_grants'] = looper_pubs_linked_to_grants()

0 / 0
239 / 239
0 / 0
0 / 0
0 / 0
474 / 474
0 / 0
0 / 0
5 / 5
0 / 0
0 / 0
564 / 564
54 / 54
9 / 9
0 / 0
0 / 0
0 / 0
0 / 0
1000 / 2731
2000 / 2731
2731 / 2731
17 / 17
81 / 81
40 / 40
71 / 71
139 / 139
75 / 75
32 / 32
52 / 52
26 / 26
20 / 20
48 / 48
16 / 16
27 / 27
15 / 15
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
677 / 677
16 / 16
43 / 43
122 / 122
51 / 51
20 / 20
14 / 14
2 / 2
1000 / 3724
2000 / 3724
3000 / 3724
3724 / 3724
114 / 114
347 / 347
907 / 907
1000 / 1640
1640 / 1640
1000 / 1128
1128 / 1128
853 / 853
384 / 384
417 / 417
573 / 573
353 / 353
384 / 384
465 / 465
406 / 406
495 / 495
530 / 530
467 / 467
328 / 328
589 / 589
715 / 715
535 / 535
463 / 463
580 / 580
876 / 876
1000 / 1102
1102 / 1102
273 / 273
526 / 526
236 / 236
613 / 613
683 / 683
584 / 584
618 / 618
569 / 569
718 / 718
333 / 333
620 / 620
485 / 485
670 / 670
92 / 92
0 / 0
1000 / 1402
1402 / 1402
7 / 7
119 / 119
86 / 86
124 / 124
109 / 109
95 / 95
91 / 91
160 / 160
133 / 133


In [122]:
# Checkpoint: save the table including the new `pubs_linked_grants` column.
df2.to_csv("orgs-grid-6-pubs-linked-to-grants.csv")

## 7. From Grants to Patents linked to Grants 

Sample query

In [126]:
# Sample: total number of patents whose associated_grant_ids include one specific grant.
dslquery("""search patents where associated_grant_ids in ["grant.8528575"] return patents""").stats['total_count']

Returned Patents: 0


0

In [127]:
def get_grants(gridid):
    """List the grant IDs returned for funder `gridid`.

    Same query as the `get_grants(gridid)` helper defined in section 6 of
    this notebook: an iterative DSL search on `funders.id`, keeping only
    each record's `id`.
    """
    query = """search grants where funders.id="{}" return grants[id]""".format(gridid)
    response = dsl.query_iterative(query)
    return [grant['id'] for grant in response.grants]


def get_patents_linked_to_grants(gridid, chunk_size=400):
    """Count the distinct patents associated with the grants of one funder.

    The funder's grant IDs are fetched with `get_grants`, split into chunks,
    and each chunk is used in an `associated_grant_ids in [...]` query; the
    patent IDs are pooled in a set so that a patent linked to several grants
    is counted once.

    Parameters
    ----------
    gridid : str
        GRID ID of the funder.
    chunk_size : int, optional
        Number of grant IDs sent per query (default 400, the value previously
        hard-coded here).

    Returns
    -------
    int
        Number of distinct patent IDs found; 0 when the funder has no grants.
    """
    grants = get_grants(gridid)
    q = """search patents where associated_grant_ids in {} return patents[id]"""
    patents = set()
    for chunk in chunks_of(grants, chunk_size):
        # json.dumps renders the chunk as a double-quoted list literal for the query
        res = dsl.query_iterative(q.format(json.dumps(chunk)))
        patents.update(patent['id'] for patent in res.patents)
    return len(patents)

def looper_patents_linked_to_grants(grids=None):
    """Run `get_patents_linked_to_grants` for each funder and collect the counts.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to process; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        counts.append(get_patents_linked_to_grants(g))
        # Pause between funders (presumably to respect API rate limits).
        time.sleep(1)
    return counts


In [128]:
# Add a column with the number of distinct patents linked to each funder's grants
df2['patents_linked_grants'] = looper_patents_linked_to_grants()

0 / 0
239 / 239
0 / 0
474 / 474
0 / 0
0 / 0
564 / 564
0 / 0
0 / 0
1000 / 2731
2000 / 2731
2731 / 2731
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
677 / 677
0 / 0
0 / 0
1000 / 3724
2000 / 3724
3000 / 3724
3724 / 3724
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
1000 / 1402
1402 / 1402
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
398 / 398
0 / 0
0 / 0
0 / 0
155 / 155
0 / 0
0 / 0
1000 / 2261
2000 / 2261
2261 / 2261
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
51 / 51
0 / 0
0 / 0
236 / 236
0 / 0
0 / 0
0 / 0
11 / 11
0 / 0
57 / 57
0 / 0
236 / 236
0 / 0
0 / 0
837 / 837
0 / 0
0 / 0
0 / 0
0 / 0
167 / 167
0 / 0
0 / 0
0 / 0
0 / 0
52 / 52
0 / 0
1000 / 1863
1863 / 1863
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
1000 / 25492
2000 / 25492
3000 / 25492
4000 / 25492
5000 / 25492
6000 / 25492
7000 / 25492
8000 / 25492
9000 / 25492
10000 / 25492
11000 / 25492
12000 / 25492
13000 / 25492
14000 / 25492
15000 / 25492
16000 / 25492
17000 / 25492
18000 / 25492
19000 / 25492
20000 / 25492
21000 / 25492
2

In [129]:
# Checkpoint: save the table including the new `patents_linked_grants` column.
df2.to_csv("orgs-grid-7-patents-linked-to-grants.csv")

## 8. From Grants to Clinical Trials linked to Grants

Sample query

In [130]:
# Sample: total number of clinical trials whose associated_grant_ids include one specific grant.
dslquery("""search clinical_trials where associated_grant_ids in ["grant.8528575"] return clinical_trials""").stats['total_count']

Returned Clinical_trials: 0


0

In [135]:
def get_grants(gridid):
    """Fetch the grant IDs for the funder identified by `gridid`.

    Repeats the `get_grants(gridid)` helper from sections 6 and 7 of this
    notebook: one iterative DSL search on `funders.id`, reduced to the list
    of `id` values.
    """
    found = dsl.query_iterative(
        """search grants where funders.id="{}" return grants[id]""".format(gridid)
    )
    return list(entry['id'] for entry in found.grants)


def get_cltrials_linked_to_grants(gridid, chunk_size=200):
    """Count the distinct clinical trials associated with the grants of one funder.

    The funder's grant IDs are fetched with `get_grants`, split into chunks,
    and each chunk is used in an `associated_grant_ids in [...]` query; the
    trial IDs are pooled in a set so that a trial linked to several grants
    is counted once.

    Parameters
    ----------
    gridid : str
        GRID ID of the funder.
    chunk_size : int, optional
        Number of grant IDs sent per query (default 200, the value previously
        hard-coded here).

    Returns
    -------
    int
        Number of distinct clinical trial IDs found; 0 when the funder has no grants.
    """
    grants = get_grants(gridid)
    q = """search clinical_trials where associated_grant_ids in {} return clinical_trials[id]"""
    cltrials = set()
    for chunk in chunks_of(grants, chunk_size):
        # json.dumps renders the chunk as a double-quoted list literal for the query
        res = dsl.query_iterative(q.format(json.dumps(chunk)))
        cltrials.update(cl['id'] for cl in res.clinical_trials)
    return len(cltrials)

def looper_trials_linked_to_grants(grids=None):
    """Run `get_cltrials_linked_to_grants` for each funder and collect the counts.

    Parameters
    ----------
    grids : iterable, optional
        GRID IDs to process; missing values (NaN/None) are allowed.
        Defaults to the `grid` column of the global `df2`.

    Returns
    -------
    list of int
        One count per input value, in input order; 0 for missing IDs.
    """
    if grids is None:
        grids = df2['grid']
    counts = []
    for g in grids:
        if pd.isna(g):
            # No GRID ID: nothing is queried, so there is nothing to pause for.
            counts.append(0)
            continue
        counts.append(get_cltrials_linked_to_grants(g))
        # Pause between funders (presumably to respect API rate limits).
        time.sleep(1)
    return counts


In [136]:
# Add a column with the number of distinct clinical trials linked to each funder's grants
df2['cltrials_linked_grants'] = looper_trials_linked_to_grants()

0 / 0
239 / 239
0 / 0
0 / 0
474 / 474
0 / 0
0 / 0
0 / 0
564 / 564
0 / 0
0 / 0
0 / 0
1000 / 2731
2000 / 2731
2731 / 2731
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0
0 / 0


KeyboardInterrupt: 

In [134]:
# Checkpoint: save the table for section 8.
# NOTE(review): this cell's execution count (134) is lower than that of the
# cells above it (135/136), and the assignment above ended in KeyboardInterrupt,
# so the file written here may not contain `cltrials_linked_grants`.
# Re-run this cell after the loop has completed.
df2.to_csv("orgs-grid-8-cltrials-linked-to-grants.csv")