# Dimensions Patents Statistics Report, for one or more Organizations

- Get all pubs for a GRID in a period that were cited by patents 
    - https://docs.google.com/spreadsheets/d/11flWYNrnwwEJ5WzLXC_N2eItSHeatRQiLJZZ4euqIPo/edit#gid=0


## 1. Install Libraries and Log into Dimensions API


In [1]:
# @markdown # Get the API library and login 
# @markdown **Privacy tip**: leave the password blank and you'll be asked for it later. This can be handy on shared computers.
username = "dsl.demo.1@dimensions.ai"  #@param {type: "string"}
password = "1.Demo.Dsl"  #@param {type: "string"}
endpoint = "https://app.dimensions.ai"  #@param {type: "string"}


# INSTALL/LOAD LIBRARIES 
# ps optimized for Google Colab /modify installation as needed based on your environment
# 
print("==\nInstalling libraries..")
!pip install dimcli -U --quiet 

import os
import sys
import time
import json
import pandas as pd
from pandas.io.json import json_normalize
# from tqdm import tqdm_notebook as pbar
from tqdm.notebook import tqdm as pbar
import dimcli 
from dimcli.shortcuts import *

# AUTHENTICATION 
# https://github.com/digital-science/dimcli#authentication
#
# == Google Colab users ==
# If username/password not provided, the interactive setup assistant `dimcli --init` is invoked
#
# == Jupyter Notebook users == 
# If username/password not provided, try to use the global API credentials file.
# To create one, open a terminal (File/New/Terminal) and run `dimcli --init` from there
#  
#
print("==\nLogging in..")
if username and password:
  dimcli.login(username, password, endpoint)
else:
  if 'google.colab' in sys.modules:
    print("Environment: Google Colab")
    if username and not password:
      import getpass
      password = getpass.getpass(prompt='Password: ')     
      dimcli.login(username, password, endpoint)
    else:
      print("... launching interactive setup assistant")
      !dimcli --init    
      dimcli.login()
  else:
    print("Environment: Jupyter Notebook\n... looking for API credentials file")
    dimcli.login()

dsl = dimcli.Dsl()

   

==
Installing libraries..
==
Logging in..
DimCli v0.6.2.2 - Succesfully connected to <https://app.dimensions.ai> (method: manual login)


## 2. GRID organization and time frame selection

Tip: pick one from https://grid.ac/institutes. 

In [41]:
#@markdown Please enter a valid org grid

GRIDID = "grid.42505.36" #@param {type:"string"}

#@markdown The start/end year of publications used to extract patents
YEAR_START = 2000 #@param {type: "slider", min: 1950, max: 2020}
YEAR_END = 2005 #@param {type: "slider", min: 1950, max: 2020}

if YEAR_END < YEAR_START:
  YEAR_END = YEAR_START

# gen link to Dimensions
from IPython.core.display import display, HTML
display(HTML('GRID: <a href="{}" title="View selected organization in Dimensions">{} &#x29c9;</a>'.format(dimensions_url(GRIDID), GRIDID)))
display(HTML('Time period: {} to {}'.format(YEAR_START, YEAR_END)))

#
# data-saving utils 
#
DATAFOLDER = "stats_data_" + str(GRIDID)
if not os.path.exists(DATAFOLDER):
  !mkdir $DATAFOLDER
  print(f"==\nCreated data folder:", DATAFOLDER + "/")
#
#
def save_as_csv(df, save_name_without_extension):
    "usage: `save_as_csv(dataframe, 'filename')`"
    df.to_csv(f"{DATAFOLDER}/{save_name_without_extension}.csv", index=False)
    print("===\nSaving: ", f"{DATAFOLDER}/{save_name_without_extension}.csv")


## 3. Get all publications that were cited by patents

In [None]:
# ;;;;;
#
# get publications from selected grid and time period
#
# ;;;;;


print("===\nExtracting publications for: ", GRIDID, "from", YEAR_START, "to", YEAR_END)

publications = dsl.query_iterative(f"""
search publications
    where research_orgs.id = "{GRIDID}"
    and year in [{YEAR_START}:{YEAR_END}]
    return publications[id+doi+title+times_cited+recent_citations+field_citation_ratio+category_for+authors]
""").as_dataframe()

print("Total publications found: ", len(publications))


# ;;;;;
#
# get patents citations 
#
# ;;;;;



pubsids = list(publications['id'])

print("\n===\nExtracting patents citing these publications")

q = """search patents where publication_ids in {}
  return patents[id+title+category_for+inventor_names+assignee_names+publication_ids+times_cited]"""

# iterate pubids using chunks 
VERBOSE = False
CHUNKS_SIZE = 300 
results = []

for chunk in pbar(list(chunks_of(pubsids, CHUNKS_SIZE))):
    query = q.format(json.dumps(chunk))
    data = dsl.query_iterative(query, verbose=VERBOSE)
    results += data.patents
    time.sleep(0.5)

# remove duplicates
patents = pd.DataFrame().from_dict(results)
patents.drop_duplicates(subset='id', inplace=True)

print("Total related patents found: ", len(patents))

===
Extracting publications for:  grid.42505.36 
Period:  2000 2005
1000 / 17584
2000 / 17584
3000 / 17584
4000 / 17584
5000 / 17584
6000 / 17584
7000 / 17584
8000 / 17584
9000 / 17584
10000 / 17584
11000 / 17584
12000 / 17584
13000 / 17584
14000 / 17584
15000 / 17584
16000 / 17584
17000 / 17584
17584 / 17584
Total publications found:  17584

===
Extracting patents citing these publications


HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))

In [33]:
    
# ;;;;;
#
# count PATENTS per publication
#
# ;;;;;


print("\n===\nCounting patents per publication...")

# build str column version for checking inclusion
patents['publication_ids_str'] = patents['publication_ids'].apply(lambda x: ','.join(map(str, x)))                                 
def get_patents_per_pub(pubid):
    global patents
    # turn list into str and check content in one line
    return patents[patents['publication_ids_str'].str.contains(pubid)]['id']



publications['patents_count'] = 0
publications['patents_ids'] = ""
for index, row in pbar(publications.iterrows(), total=publications.shape[0]):
    match_patents = get_patents_per_pub(row['id'])
    publications.at[index,'patents_count'] = len(match_patents)
    publications.at[index,'patents_ids'] = list(match_patents)

    
if False:
    totcount = []
    ids = []
    for x in pbar(pubsids):
        match_patents = get_patents_per_pub(x)
        totcount.append(len(match_patents))
        ids.append(list(match_patents))

    publications['patents_count'] = totcount
    publications['patents_ids'] = ids

# keep only pubs with at least one patent citation and sort by citations     
publications_subset = publications[publications['patents_count'] > 0].copy()
publications_subset.sort_values("patents_count", ascending=False, inplace=True)

print("Total publications with at least one patent citation: ", len(publications_subset))



# ;;;;;
#
# count PUBLICATIONS per patent 
#
# ;;;;;


print("\n===\nCounting publications per patent ...")

# count tot publications cited
patents['publications_cited_tot'] = patents['publication_ids'].apply(lambda x: len(x))

# count tot publications cited from GRIDID
def is_in_grid_pubs(test_ids):
    "intersection of two lists: all cited pubs VS pubs from selected grid org"
    global pubsids
    return len(list(set(test_ids) & set(pubsids)))

patents['publications_cited_grid'] = patents['publication_ids'].apply(lambda x: is_in_grid_pubs(x))



# ;;;;;
#
# simplify JSON publication fields into simple strings 
#
# ;;;;;


print("===\nSimplifying publication/patents fields..")

# turn ids into URLs
publications_subset['id'] = publications_subset['id'].apply(lambda x: dimensions_url(x))
# simplify FOR codes (after filling in blanks)
publications_subset['category_for'] = publications_subset['category_for'].fillna("").apply(lambda x: "; ".join([y['name'] for y in x]))

# represent authors and affiliations as semicolon-delimited lists 

def nice_authors(authorslist):
    authors = []
    for x in authorslist:
        name = x.get('first_name', "") + " " + x.get('last_name', "")
        authors.append(name)
    return "; ".join(authors)

def nice_affiliations(authorslist):
    affiliations = []
    for x in authorslist:
        for a in x['affiliations']:
            affiliations.append(a.get('name', ""))
    return "; ".join(list(set(affiliations)))

# extract OWN authors (at any point in time!) 
def ownauthors(authorslist):
    ownauthors = []
    global GRIDID
    for x in authorslist:
        name = x.get('first_name', "") + " " + x.get('last_name', "")
        for a in x['affiliations']:
            if "id" in a and a['id'] == GRIDID:
                ownauthors.append(name)
    return "; ".join(ownauthors)

publications_subset['all_authors'] = publications_subset['authors'].fillna("").apply(lambda x: nice_authors(x))
publications_subset['own_authors'] = publications_subset['authors'].fillna("").apply(lambda x: ownauthors(x))
publications_subset['affiliations'] = publications_subset['authors'].fillna("").apply(lambda x: nice_affiliations(x))

# sort columns
publications_subset = publications_subset[['id', 'title', 'times_cited', 'recent_citations', 'field_citation_ratio', 'category_for', 'all_authors',  'own_authors', 'affiliations', 'patents_count', 'patents_ids']]



# ;;;;;
#
# simplify JSON patent fields  
#
# ;;;;;

# turn ids into URLs
patents['id'] = patents['id'].apply(lambda x: dimensions_url(x, "patents"))
# simplify FOR codes (after filling in blanks)
patents['category_for'] = patents['category_for'].fillna("").apply(lambda x: "; ".join([y['name'] for y in x]))

# transform list into semicolon delimited string
patents['inventor_names'] = patents['inventor_names'].fillna("").apply(lambda x: "; ".join([y for y in x]))
patents['assignee_names'] = patents['assignee_names'].fillna("").apply(lambda x: "; ".join([y for y in x]))
# set no value to 0
patents['times_cited'].fillna(0, inplace=True)

# sort columns
patents = patents[['id', 'title', 'times_cited', 'category_for', 'inventor_names', 'assignee_names', 'publications_cited_tot', 'publications_cited_grid', 'publication_ids']]


# ;;;;;
#
# save the data as CSV
#
# ;;;;;


save_as_csv(publications_subset, "publications")
save_as_csv(patents, "patents")


print("===\nCompleted.")


===
Counting patents per publication...


HBox(children=(FloatProgress(value=0.0, max=2358.0), HTML(value='')))


Total publications with at least one patent citation:  202

===
Counting publications per patent ...
===
Simplifying publication/patents fields..


TypeError: string indices must be integers

## 4. Downloading the results 

If you are viewing this notebook in **Google Colab**, run the following cell to download all data as a zip file. 

In [None]:

# zip up all files to make download easier
import zipfile
import os 

def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))

zip_name = DATAFOLDER + '.zip'
zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
zipdir(DATAFOLDER + '/', zipf)
zipf.close()

try:
  # try to download from colab: sometimes it fails hence print a message
  from google.colab import files
  time.sleep(5)
  files.download(zip_name) 
except:
  print("Google Colab failed to download - please try again.")
