# Dimcli open in colab badge

In [None]:
<a href="https://colab.research.google.com/github/digital-science/dimensions-api-lab" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open Dimensions API Lab In Google Colab"/></a>

# Dimcli Install - Short Version

A bunch of extra libraries are also installed

In [None]:
# @markdown # Get the API library and login 
# @markdown Click the 'play' button on the left (or shift+enter) after entering your API credentials

username = "" #@param {type: "string"}
password = "" #@param {type: "string"}
endpoint = "https://app.dimensions.ai" #@param {type: "string"}

!pip install dimcli plotly tqdm -U --quiet

# load common libraries
import pandas as pd
from pandas.io.json import json_normalize

import time
import json
from tqdm.notebook import tqdm as progress

import plotly.express as px
from plotly.offline import plot
if not 'google.colab' in sys.modules:
  # make js dependecies local / needed by html exports 
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=True)

import dimcli
from dimcli.shortcuts import *

dimcli.login(username, password, endpoint)
dsl = dimcli.Dsl()


# Dimcli Install - Long Version with PSW hiding

In [None]:
# @markdown # Get the API library and login 
# @markdown **Privacy tip**: leave the password blank and you'll be asked for it later. This can be handy on shared computers.
# Colab form fields: the values typed into the form are written into these assignments.
username = ""  #@param {type: "string"}
password = ""  #@param {type: "string"}
endpoint = "https://app.dimensions.ai"  #@param {type: "string"}


print("==\nInstalling libraries..")
# install (or upgrade, because of -U) the API client and the charting library
!pip install dimcli plotly_express -U --quiet 
import dimcli
from dimcli.shortcuts import *

#
# load common libraries
import os
import sys
import time
import json
import pandas as pd
from pandas.io.json import json_normalize
from tqdm import tqdm_notebook as progressbar
#
# charts lib
import plotly_express as px
# the notebook-mode initialisation is only done when NOT running inside Google Colab
if not 'google.colab' in sys.modules:
  # make js dependencies local / needed by html exports
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=True)



# AUTHENTICATION 
# https://github.com/digital-science/dimcli#authentication
#
# == Google Colab users ==
# If username/password not provided, the interactive setup assistant `dimcli --init` is invoked
#
# == Jupyter Notebook users == 
# If username/password not provided, try to use the global API credentials file.
# To create one, open a terminal (File/New/Terminal) and run `dimcli --init` from there
#  
#
print("==\nLogging in..")
if username and password:
  dimcli.login(username, password, endpoint)
else:
  if 'google.colab' in sys.modules:
    print("Environment: Google Colab")
    if username and not password:
      import getpass
      password = getpass.getpass(prompt='Password: ')     
      dimcli.login(username, password, endpoint)
    else:
      print("... launching interactive setup assistant")
      !dimcli --init    
      dimcli.login()
  else:
    print("Environment: Jupyter Notebook\n... looking for API credentials file")
    dimcli.login()

dsl = dimcli.Dsl()



#
# data-saving utils
#
DATAFOLDER = "extraction1"
if not os.path.exists(DATAFOLDER):
  # pure-Python replacement for `!mkdir`: works on any OS/shell and does not
  # fail if the folder appears between the check and the creation
  os.makedirs(DATAFOLDER, exist_ok=True)
  print("==\nCreated data folder:", DATAFOLDER + "/")


def save_as_csv(df, save_name_without_extension):
    """Write `df` to `<DATAFOLDER>/<save_name_without_extension>.csv` without the index.

    usage: `save_as_csv(dataframe, 'filename')`

    The destination path is printed after the file has been written.
    """
    csv_path = f"{DATAFOLDER}/{save_name_without_extension}.csv"
    df.to_csv(csv_path, index=False)
    print("===\nSaved: ", csv_path)
   

# Dimcli Install - Newest version with PSW hiding

In [None]:
# @markdown # Get the API library and login 
# @markdown Note: on Google Colab you'll be asked to input credentials each time, so to prevent sharing them accidentally.
!pip install dimcli --quiet 

import dimcli
from dimcli.shortcuts import *
import json
import sys
import pandas as pd
#

print("==\nLogging in..")
# https://github.com/digital-science/dimcli#authentication
ENDPOINT = "https://app.dimensions.ai"
if 'google.colab' in sys.modules:
  import getpass
  USERNAME = getpass.getpass(prompt='Username: ')
  PASSWORD = getpass.getpass(prompt='Password: ')    
  dimcli.login(USERNAME, PASSWORD, ENDPOINT)
else:
  USERNAME, PASSWORD  = "", ""
  dimcli.login(USERNAME, PASSWORD, ENDPOINT)
dsl = dimcli.Dsl()

# Dimcli Selector With Dropdowns 

In [None]:
# tip
# the subjects dropdown can be generated via
# str(["%s" % s for s in  sorted(dimcli.G.categories('category_for')) if len(s.split()[0]) > 2])
#
# NOTE: each `#@param [...]` option list must stay on ONE physical line. A line break
# inside the list ends the comment, and the remainder is parsed as (invalid) Python.

start_year = 2015  #@param {type: "slider", min: 1980, max: 2020}
end_year = 2019  #@param {type: "slider", min: 1980, max: 2020}
subject1 = "0911 Maritime Engineering"  #@param ['0101 Pure Mathematics', '0102 Applied Mathematics', '0103 Numerical and Computational Mathematics', '0104 Statistics', '0105 Mathematical Physics', '0201 Astronomical and Space Sciences', '0202 Atomic, Molecular, Nuclear, Particle and Plasma Physics', '0203 Classical Physics', '0204 Condensed Matter Physics', '0205 Optical Physics', '0206 Quantum Physics', '0299 Other Physical Sciences', '0301 Analytical Chemistry', '0302 Inorganic Chemistry', '0303 Macromolecular and Materials Chemistry', '0304 Medicinal and Biomolecular Chemistry', '0305 Organic Chemistry', '0306 Physical Chemistry (incl. Structural)', '0307 Theoretical and Computational Chemistry', '0399 Other Chemical Sciences', '0401 Atmospheric Sciences', '0402 Geochemistry', '0403 Geology', '0404 Geophysics', '0405 Oceanography', '0406 Physical Geography and Environmental Geoscience', '0499 Other Earth Sciences', '0501 Ecological Applications', '0502 Environmental Science and Management', '0503 Soil Sciences', '0599 Other Environmental Sciences', '0601 Biochemistry and Cell Biology', '0602 Ecology', '0603 Evolutionary Biology', '0604 Genetics', '0605 Microbiology', '0606 Physiology', '0607 Plant Biology', '0608 Zoology', '0699 Other Biological Sciences', '0701 Agriculture, Land and Farm Management', '0702 Animal Production', '0703 Crop and Pasture Production', '0704 Fisheries Sciences', '0705 Forestry Sciences', '0706 Horticultural Production', '0707 Veterinary Sciences', '0799 Other Agricultural and Veterinary Sciences', '0801 Artificial Intelligence and Image Processing', '0802 Computation Theory and Mathematics', '0803 Computer Software', '0804 Data Format', '0805 Distributed Computing', '0806 Information Systems', '0807 Library and Information Studies', '0899 Other Information and Computing Sciences', '0901 Aerospace Engineering', '0902 Automotive Engineering', '0903 Biomedical Engineering', '0904 Chemical Engineering', '0905 Civil Engineering', '0906 Electrical and Electronic Engineering', '0907 Environmental Engineering', '0908 Food Sciences', '0909 Geomatic Engineering', '0910 Manufacturing Engineering', '0911 Maritime Engineering', '0912 Materials Engineering', '0913 Mechanical Engineering', '0914 Resources Engineering and Extractive Metallurgy', '0915 Interdisciplinary Engineering', '0999 Other Engineering', '1001 Agricultural Biotechnology', '1002 Environmental Biotechnology', '1003 Industrial Biotechnology', '1004 Medical Biotechnology', '1005 Communications Technologies', '1006 Computer Hardware', '1007 Nanotechnology', '1099 Other Technology', '1101 Medical Biochemistry and Metabolomics', '1102 Cardiorespiratory Medicine and Haematology', '1103 Clinical Sciences', '1104 Complementary and Alternative Medicine', '1105 Dentistry', '1106 Human Movement and Sports Science', '1107 Immunology', '1108 Medical Microbiology', '1109 Neurosciences', '1110 Nursing', '1111 Nutrition and Dietetics', '1112 Oncology and Carcinogenesis', '1113 Ophthalmology and Optometry', '1114 Paediatrics and Reproductive Medicine', '1115 Pharmacology and Pharmaceutical Sciences', '1116 Medical Physiology', '1117 Public Health and Health Services', '1199 Other Medical and Health Sciences', '1201 Architecture', '1202 Building', '1203 Design Practice and Management', '1205 Urban and Regional Planning', '1299 Other Built Environment and Design', '1301 Education Systems', '1302 Curriculum and Pedagogy', '1303 Specialist Studies In Education', '1399 Other Education', '1401 Economic Theory', '1402 Applied Economics', '1403 Econometrics', '1499 Other Economics', '1501 Accounting, Auditing and Accountability', '1502 Banking, Finance and Investment', '1503 Business and Management', '1504 Commercial Services', '1505 Marketing', '1506 Tourism', '1507 Transportation and Freight Services', '1601 Anthropology', '1602 Criminology', '1603 Demography', '1604 Human Geography', '1605 Policy and Administration', '1606 Political Science', '1607 Social Work', '1608 Sociology', '1699 Other Studies In Human Society', '1701 Psychology', '1702 Cognitive Sciences', '1799 Other Psychology and Cognitive Sciences', '1801 Law', '1899 Other Law and Legal Studies', '1901 Art Theory and Criticism', '1902 Film, Television and Digital Media', '1903 Journalism and Professional Writing', '1904 Performing Arts and Creative Writing', '1905 Visual Arts and Crafts', '1999 Other Studies In Creative Arts and Writing', '2001 Communication and Media Studies', '2002 Cultural Studies', '2003 Language Studies', '2004 Linguistics', '2005 Literary Studies', '2099 Other Language, Communication and Culture', '2101 Archaeology', '2102 Curatorial and Related Studies', '2103 Historical Studies', '2199 Other History and Archaeology', '2201 Applied Ethics', '2202 History and Philosophy of Specific Fields', '2203 Philosophy', '2204 Religion and Religious Studies', '2299 Other Philosophy and Religious Studies']
subject2 = "None"  #@param ['None', '0101 Pure Mathematics', '0102 Applied Mathematics', '0103 Numerical and Computational Mathematics', '0104 Statistics', '0105 Mathematical Physics', '0201 Astronomical and Space Sciences', '0202 Atomic, Molecular, Nuclear, Particle and Plasma Physics', '0203 Classical Physics', '0204 Condensed Matter Physics', '0205 Optical Physics', '0206 Quantum Physics', '0299 Other Physical Sciences', '0301 Analytical Chemistry', '0302 Inorganic Chemistry', '0303 Macromolecular and Materials Chemistry', '0304 Medicinal and Biomolecular Chemistry', '0305 Organic Chemistry', '0306 Physical Chemistry (incl. Structural)', '0307 Theoretical and Computational Chemistry', '0399 Other Chemical Sciences', '0401 Atmospheric Sciences', '0402 Geochemistry', '0403 Geology', '0404 Geophysics', '0405 Oceanography', '0406 Physical Geography and Environmental Geoscience', '0499 Other Earth Sciences', '0501 Ecological Applications', '0502 Environmental Science and Management', '0503 Soil Sciences', '0599 Other Environmental Sciences', '0601 Biochemistry and Cell Biology', '0602 Ecology', '0603 Evolutionary Biology', '0604 Genetics', '0605 Microbiology', '0606 Physiology', '0607 Plant Biology', '0608 Zoology', '0699 Other Biological Sciences', '0701 Agriculture, Land and Farm Management', '0702 Animal Production', '0703 Crop and Pasture Production', '0704 Fisheries Sciences', '0705 Forestry Sciences', '0706 Horticultural Production', '0707 Veterinary Sciences', '0799 Other Agricultural and Veterinary Sciences', '0801 Artificial Intelligence and Image Processing', '0802 Computation Theory and Mathematics', '0803 Computer Software', '0804 Data Format', '0805 Distributed Computing', '0806 Information Systems', '0807 Library and Information Studies', '0899 Other Information and Computing Sciences', '0901 Aerospace Engineering', '0902 Automotive Engineering', '0903 Biomedical Engineering', '0904 Chemical Engineering', '0905 Civil Engineering', '0906 Electrical and Electronic Engineering', '0907 Environmental Engineering', '0908 Food Sciences', '0909 Geomatic Engineering', '0910 Manufacturing Engineering', '0911 Maritime Engineering', '0912 Materials Engineering', '0913 Mechanical Engineering', '0914 Resources Engineering and Extractive Metallurgy', '0915 Interdisciplinary Engineering', '0999 Other Engineering', '1001 Agricultural Biotechnology', '1002 Environmental Biotechnology', '1003 Industrial Biotechnology', '1004 Medical Biotechnology', '1005 Communications Technologies', '1006 Computer Hardware', '1007 Nanotechnology', '1099 Other Technology', '1101 Medical Biochemistry and Metabolomics', '1102 Cardiorespiratory Medicine and Haematology', '1103 Clinical Sciences', '1104 Complementary and Alternative Medicine', '1105 Dentistry', '1106 Human Movement and Sports Science', '1107 Immunology', '1108 Medical Microbiology', '1109 Neurosciences', '1110 Nursing', '1111 Nutrition and Dietetics', '1112 Oncology and Carcinogenesis', '1113 Ophthalmology and Optometry', '1114 Paediatrics and Reproductive Medicine', '1115 Pharmacology and Pharmaceutical Sciences', '1116 Medical Physiology', '1117 Public Health and Health Services', '1199 Other Medical and Health Sciences', '1201 Architecture', '1202 Building', '1203 Design Practice and Management', '1205 Urban and Regional Planning', '1299 Other Built Environment and Design', '1301 Education Systems', '1302 Curriculum and Pedagogy', '1303 Specialist Studies In Education', '1399 Other Education', '1401 Economic Theory', '1402 Applied Economics', '1403 Econometrics', '1499 Other Economics', '1501 Accounting, Auditing and Accountability', '1502 Banking, Finance and Investment', '1503 Business and Management', '1504 Commercial Services', '1505 Marketing', '1506 Tourism', '1507 Transportation and Freight Services', '1601 Anthropology', '1602 Criminology', '1603 Demography', '1604 Human Geography', '1605 Policy and Administration', '1606 Political Science', '1607 Social Work', '1608 Sociology', '1699 Other Studies In Human Society', '1701 Psychology', '1702 Cognitive Sciences', '1799 Other Psychology and Cognitive Sciences', '1801 Law', '1899 Other Law and Legal Studies', '1901 Art Theory and Criticism', '1902 Film, Television and Digital Media', '1903 Journalism and Professional Writing', '1904 Performing Arts and Creative Writing', '1905 Visual Arts and Crafts', '1999 Other Studies In Creative Arts and Writing', '2001 Communication and Media Studies', '2002 Cultural Studies', '2003 Language Studies', '2004 Linguistics', '2005 Literary Studies', '2099 Other Language, Communication and Culture', '2101 Archaeology', '2102 Curatorial and Related Studies', '2103 Historical Studies', '2199 Other History and Archaeology', '2201 Applied Ethics', '2202 History and Philosophy of Specific Fields', '2203 Philosophy', '2204 Religion and Religious Studies', '2299 Other Philosophy and Religious Studies']
connector = "or"  #@param ['or', 'and']
#
#

# Dimcli looped query using `chunks`

For example, when extracting information about many researchers from a long list of IDs, split the list into chunks and run one query per chunk.

In [None]:
import json

import pandas as pd
from dimcli.shortcuts import chunks_of
# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook` alias
from tqdm.notebook import tqdm as pbar

# `{}` is filled with a JSON list of IDs for each chunk
query = """search researchers where id in {} return researchers[id+last_grant_year+total_grants] limit 1000"""

ids = ["ur.011177563061.78", "ur.011177563061.55", "ur.011177563061.10"] # etc..

# run one query per chunk of at most 400 IDs and collect all returned records;
# `dsl` is the dimcli.Dsl() object created by one of the login cells
results = []
for chunk in pbar(list(chunks_of(list(ids), 400))):
    q = dsl.query(query.format(json.dumps(chunk)))
    results += q.researchers

# save to a dataframe for further analysis..
# (a list of dict records goes straight into the constructor)
df = pd.DataFrame(results)

# Dimcli nested looped query with `chunks`

In [None]:
import json
import time

import pandas as pd
from dimcli.shortcuts import chunks_of
# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook` alias
from tqdm.notebook import tqdm as pbar

# one query template per document type; `{}` is filled with a JSON list of GRID IDs.
# `limit 1` is enough because only the total count of each result is read below.
queries = {
    "Publications" : """search publications where research_orgs in {} return publications limit 1""" ,
    "Grants" : """search grants where research_orgs in {} return grants limit 1""" ,
    "Clinical Trials" : """search clinical_trials where organizations in {} return clinical_trials limit 1""" ,
    "Patents" : """search patents where assignees in {} return patents limit 1""" ,
    "Policy Documents" : """search policy_documents where publisher_org in {} return policy_documents limit 1""" ,
    "Altmetric" : """search publications where research_orgs in {} and altmetric > 0 return publications limit 1""" ,
}

gridids = ['grid.461628.f', 'grid.418010.c', 'grid.461634.2'] # and more...

results = []

# outer loop: document types / inner loop: chunks of at most 20 GRID IDs
loop1 = pbar(list(queries))
for doctype in loop1:
  loop1.set_description("Processing %s" % doctype)

  loop2 = pbar(list(chunks_of(list(gridids), 20)))
  loop2.set_description("Processing Grid IDs..")  # label is constant: set it once per bar

  tot = 0
  for chunk in loop2:
    q = queries[doctype].format(json.dumps(chunk))
    data = dsl.query(q, verbose=False)
    tot += data.count_total
    time.sleep(1)  # pause between API calls
  results.append({'doctype': doctype, 'count' : tot})


# save to a dataframe (one row per document type)
df = pd.DataFrame(results)
df

# Print libraries versions

In [None]:
# Report the installed version of each library this notebook relies on
print("Libraries versions:\n---")
import pkg_resources

for package_name in ("pandas", "plotly", "dimcli", "tqdm"):
    installed = pkg_resources.get_distribution(package_name)
    print("=>", package_name, ": ", installed.version)

Libraries versions:
---
=> pandas :  1.0.3
=> plotly :  4.6.0
=> dimcli :  0.6.7.2
=> tqdm :  4.43.0


# Plotly chart with basic stats about publication authors or affiliations


Extract all authors from a list of publications and obtain basic stats about them.

In [None]:
from dimcli import dslquery
import pandas as pd
import plotly.express as px

# The search term lives in one place so the query and the chart titles always agree.
# (The titles used to reference `journal_title` / `start_year`, which this cell never defines.)
search_term = "bmw"

# run the query once and derive both the authors and the affiliations views from it
publications = dslquery(f"""search publications for "{search_term}" return publications limit 1000""")

#
# for authors
#
authors = publications.as_dataframe_authors()
researchers = authors.query("researcher_id!=''")
# build df
df = pd.DataFrame({
    'measure' : ['Authors in total (non unique)', 'Authors with a researcher ID', 'Authors with a researcher ID (unique)'],
    'count' : [len(authors), len(researchers), researchers['researcher_id'].nunique()],
})
# .show() is required: only the LAST expression of a cell is displayed automatically
px.bar(df, x="measure", y="count", title=f"Author stats for publications matching '{search_term}'").show()


#
# for affiliations
#
affiliations = publications.as_dataframe_authors_affiliations()
gridaffiliations = affiliations.query("aff_id != ''")
df = pd.DataFrame({
    'measure' : ['Affiliations in total (non unique)', 'Affiliations with a GRID ID', 'Affiliations with a GRID ID (unique)'],
    'count' : [len(affiliations), len(gridaffiliations), gridaffiliations['aff_id'].nunique()],
})
px.bar(df, x="measure", y="count", title=f"Affiliations stats for publications matching '{search_term}'").show()


# Dimcli chart with basic stats about publication affiliations

In [None]:
# The search term lives in one place so the query and the chart title always agree.
# (The title used to reference `journal_title` / `start_year`, which this cell never defines.)
search_term = "bmw"

# `dsl` is the dimcli.Dsl() object created by one of the login cells
affiliations = dsl.query(f"""search publications for "{search_term}" return publications limit 1000""").as_dataframe_authors_affiliations()

# keep only the affiliation rows that carry an ID
gridaffiliations = affiliations.query("aff_id != ''")

df = pd.DataFrame({
    'measure' : ['Affiliations in total (non unique)', 'Affiliations with a GRID ID', 'Affiliations with a GRID ID (unique)'],
    'count' : [len(affiliations), len(gridaffiliations), gridaffiliations['aff_id'].nunique()],
})
# requires plotly express imported as `px`
px.bar(df, x="measure", y="count", title=f"Affiliations stats for publications matching '{search_term}'")

# Plotly quick chart

In [None]:
!pip install plotly --quiet
import plotly.express as px

# NOTE: the calls below are independent examples. Each one expects `df` to contain the
# columns it names, and only the last expression of a cell is displayed automatically.

# bar chart: one bar per `name`, height = `funding`, coloured by `country_name`
px.bar(df, x="name", y="funding", hover_name="name", color="country_name")

# scatter plot: citations by year, coloured by publication `type`
px.scatter(df, x="year", y="times_cited", color="type", 
           hover_name="title", 
           hover_data=['type', 'doi', 'year', 'times_cited', 'journal.title'], 
           height=600, title="Publications most cited by year of publication")

# with marginal data: histograms of the X and Y distributions are drawn along the axes
px.scatter(df, x="times_cited", y="name", 
           hover_name="name", hover_data=['times_cited'],
           marginal_x="histogram", marginal_y="histogram", 
           height=900, title="Research Areas VS Citations (marginal subplots = X/Y distribution)")

# sort data by tot count: categories on the x axis are ordered by descending total
px.histogram(affiliations, x="aff_name").update_xaxes(categoryorder="total descending")

# Plotly Export to a file 

In [None]:
# https://community.plot.ly/t/plotly-express-plots-automatically-open-new-tabs-when-using-plotly-offline/24862

# note: for image export you need extra libraries https://plot.ly/python/static-image-export/


import plotly.express as px
from plotly.offline import plot 

# sample dataset bundled with plotly express
iris = px.data.iris()
scatter_plot = px.scatter(iris, x="sepal_width", y="sepal_length")

# write the figure to 'filename.html'; auto_open=False is passed so that no browser tab is opened
plot(scatter_plot, filename = 'filename.html', auto_open=False)

'temp-plot.html'

# Plotly Create Multiple Charts at the same time


In [None]:
# https://plot.ly/python/creating-and-updating-figures/
# https://plot.ly/python/renderers/

# `px` and `iris` are not defined in this cell: they must already exist in the kernel
fig1 = px.bar(iris, x="sepal_width", y="sepal_length", color="petal_width")
fig2 = px.bar(iris, x="sepal_width", y="petal_length", facet_col="species")

# calling .show() explicitly renders BOTH figures; a bare expression would only
# display the last one in the cell
fig1.show()
fig2.show()

# Plotly fill in missing years with pandas

In [None]:
# first and last publication year present in the data
first_year, last_year = dfbyjournal['year'].min(), dfbyjournal['year'].max()
yrange = [first_year, last_year]
# range() excludes its end point, hence the +1 to keep the last year
all_years = list(range(first_year, last_year + 1))

def add_missing_years_per_journal(ajournal, df=None, years=None):
    """Add a zero-valued row for every year in which `ajournal` has no data.

    Parameters
    ----------
    ajournal : value of the 'journal.title' column identifying the journal.
    df : DataFrame with at least 'journal.title' and 'year' columns.
        When omitted, the module-level `dfbyjournal` is read AND replaced with
        the result (the original behaviour of this routine).
    years : iterable of years that every journal should cover.
        When omitted, the module-level `all_years` is used.

    Returns
    -------
    The frame with the filler rows appended and a fresh 0..n-1 index.
    """
    global dfbyjournal
    update_global = df is None
    if update_global:
        df = dfbyjournal
    if years is None:
        years = all_years

    # years that already have values for this journal (set => O(1) membership tests)
    known_years = set(df.loc[df["journal.title"] == ajournal, "year"])
    missing_rows = [
        {'journal.title': ajournal, 'year': year, 'times_cited': 0, 'totdois': 0, 'totcitations': 0}
        for year in years
        if year not in known_years
    ]

    # DataFrame.append() was removed in pandas 2.0; concatenating all filler rows
    # in a single call also keeps this fast (one copy per journal, not one per row)
    if missing_rows:
        df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)
    else:
        df = df.reset_index(drop=True)

    if update_global:
        dfbyjournal = df
    return df
    

# Run the gap-filling routine once for every distinct journal title.
# The list of titles is taken before the loop starts changing `dfbyjournal`.
journals = dfbyjournal['journal.title'].value_counts().index.tolist()
for each_journal in journals:
    add_missing_years_per_journal(each_journal)

# Zip file archive and download from Colab

In [None]:

# zip up all files to make download easier
import zipfile
import os 

def zipdir(path, ziph):
    """Add every file found under `path` (recursively) to the open archive `ziph`.

    `ziph` is a zipfile handle opened for writing. Each file is stored under the
    path produced by joining its folder and its file name.
    """
    for folder, _subfolders, filenames in os.walk(path):
        for filename in filenames:
            ziph.write(os.path.join(folder, filename))

import time  # needed for the pause below; it was never imported in this cell

# the context manager closes (and flushes) the archive even if zipdir() raises
with zipfile.ZipFile('output.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('out/', zipf)


# try to download from colab: sometimes it fails hence print a message
from google.colab import files

try:
  time.sleep(2)  # short pause before asking Colab to start the download
  files.download('output.zip')
except Exception as err:
  # `except Exception` (not a bare `except:`) lets Ctrl-C through, and printing the
  # cause stops unrelated errors from being mistaken for a download failure
  print("Google Colab couldn't download - please try again.")
  print("Reason:", err)


# Pandas Create Dataframe

Reminder about [tidy data](http://www.jeannicholashould.com/tidy-data-in-python.html)

* Each variable forms a column and contains values
* Each observation forms a row
* Each type of observational unit forms a table

A few definitions:

* Variable: A measurement or an attribute. Height, weight, sex, etc.
* Value: The actual measurement or attribute. 152 cm, 80 kg, female, etc.
* Observation: All values measured on the same unit (e.g. each person).


In [None]:
# With empty columns
df = pd.DataFrame(columns=['A','B','C','D','E','F','G'])

# by columns: one dict key per column, each holding the list of that column's values
df = pd.DataFrame({
    'name' : ['val1', 'val2', 'etc..'],
    'category' : ['val1', 'val2', 'etc..'],
})

# by rows (records): one dict per row
df = pd.DataFrame.from_dict([
    {'name': 'val1', 'category' : 'val1' },
    {'name': 'val1', 'category' : 'val1' },
    # ... more records
])

# from a simple dict (here `results` maps a location to its count)
d = {'location': list(results.keys()), 'count': list(results.values())}
df = pd.DataFrame.from_dict(d)



# from / to JSON
pd.read_json('file.json', orient='columns') # for rows, use 'records'
df.to_json('out.json', orient='columns')  # to_json is a DataFrame method; pandas has no top-level `pd.to_json`

# from CSV
df = pd.read_csv("/tmp/tmp07wuam09/data/cereal.csv")

# create DF from Series
counts = pubs_citing['journal.title'].value_counts()
df = counts.to_frame().reset_index() # reset index can mess up column names
df = df.rename(columns= {0: 'list'})
df.index.name = 'index'


# Pandas Describe and Count Values

In [None]:
# Return a tuple representing the dimensionality of the DataFrame.
df.shape

# Return an int representing the number of axes / array dimensions.
df.ndim

# This returns a Series with the data type of each column.
# The result's index is the original DataFrame's columns. Columns with mixed types are stored with the object dtype.
df.dtypes

# the 'describe' method returns basic statistics for all columns of a dataframe
df.describe(include='all')

# Count distinct values, use nunique:
df['hID'].nunique()

# Count only non-null values, use count:
df['hID'].count()

# Count total values including null values, use size attribute:
df['hID'].size

# this will show you the distinct elements and their number of occurrences.
df['race'].value_counts()
# only top ten
df['race'].value_counts()[:10]
# most frequent value (the label carrying the highest count)
df['race'].value_counts().idxmax()
# get the values, not the counts
df['race'].value_counts().index.tolist()


# count missing values (per column)
df.isnull().sum()

# Frequency count based on two columns (variables)
df.groupby(["Group", "Size"]).size()
# same, but as a flat DataFrame with the counts in a column named "Freq"
df.groupby(["Group", "Size"]).size().reset_index(name="Freq")

# Pandas get Dataframe Cell Values 

In [None]:
# Runnable version of the original pasted IPython session (prompts and outputs removed).
import numpy as np
import pandas as pd

# 5 rows x 3 columns of random numbers, columns named A, B, C
df = pd.DataFrame(np.random.randn(5, 3), columns=list('ABC'))
df

# single cell by integer position (row 0, column 0)
df.iat[0, 0]

# single cell by labels (index label 0, column 'A') - same cell as above
df.at[0, 'A']

# OR

# take the column, then the first element of its underlying array
val = df['A'].values[0]

# Pandas Select and Rename Dataframe

In [None]:
# select by position (third row)
df.iloc[2] 

# label based indexing
df.loc['index_label_1'] 

# Select rows based on cell values and update:
# rows where 'journal' is null get the value "unknown" in that column
df.loc[df['journal'].isnull(), "journal"] = "unknown"

#
#
#

# Rename columns using a dictionary to map values
# Rename the Area column to 'place_name'
data = data.rename(columns={"Area": "place_name"})

# Alternatively, the inplace parameter will change the dataframe without assignment
data.rename(columns={"Area": "place_name"}, inplace=True)

# Rename multiple columns in one go with a larger dictionary
data.rename(
    columns={
        "Area": "place_name",
        "Y2001": "year_2001"
    },
    inplace=True
)

# Rename all columns using a function, e.g. convert all column names to lower case
# (no inplace / assignment here, so `data` itself is left unchanged):
data.rename(columns=str.lower)

# Pandas Subset Dataframe via Slicing or Query

In [None]:
# Create a new DF by selecting columns
df2 = df[['id', 'issue', 'pages', 'title', 'type', 'volume', 'year']]


# from list: keep rows whose 'A' value is one of the listed values
df[df['A'].isin([3, 6])]
# negative version
df[~df['A'].isin([3, 6])]

# with query (a list on the right-hand side means "is one of"; extend the list as needed)
df.query(' column_a == ["val1", "val2"]', inplace=True)
df.query("type=='article'")
# query with variable
my_symbol = 'BUD US'
df.query("Symbol=='{0}'".format(my_symbol))

# if we have a list, it needs to be turned into a str first, then it's possible to use `str.contains`
patents[patents['publication_ids'].apply(lambda x: ','.join(map(str, x))).str.contains("pub.1032163135")]


# Pandas Modify Dataframe Rows / Columns

In [None]:
import numpy as np  # np.nan / np.random are used below and were never imported

# drop columns
df.drop(['author_affiliations'], axis=1, inplace=True)

# drop rows / columns with missing values
# Drop the rows where at least one element is missing.
df.dropna()

# Drop the columns where at least one element is missing.
df.dropna(axis='columns')

# Drop the rows where all elements are missing.
df.dropna(how='all')

# Keep only the rows with at least 2 non-NA values.
df.dropna(thresh=2)

# Define in which columns to look for missing values.
df.dropna(subset=['name', 'born'])

# Keep the DataFrame with valid entries in the same variable.
df.dropna(inplace=True)


#
#
# Drop empty values rows, after replacing empty strings
# (assign the result back: an in-place replace on `df['FOR']` acts on an intermediate
#  object and is not guaranteed to update `df`)
df['FOR'] = df['FOR'].replace('', np.nan)
#
df.dropna(subset=['FOR'], inplace=False).head()

# replace with empty list (can't be done with 'replace')
for row in df.loc[df.ids.isnull(), 'ids'].index:
    df.at[row, 'ids'] = []

#
#
# Drop rows based on values
df = df[df.col != "val"]



#
#
# Add new column to existing dataframe
# Use the original df1 indexes to create the series:
df1['e'] = pd.Series(np.random.randn(sLength), index=df1.index)

# Declare a list that is to be converted into a column
address = ['Delhi', 'Bangalore', 'Chennai', 'Patna']
df['Address'] = address

# change column order
df = df[['mean', '0', '1', '2', '3']]
# You can get the list of columns with:
cols = list(df.columns.values)

#
#
# fill in empty values
# fill everywhere, returns a frame
ddf.fillna(0)
# this returns a series, not a frame!
ddf['FOR'].fillna("aaa") # NOTE doesn't save anything!

# Pandas Add Columns by Transforming

In [None]:
# add new column: split the comma-separated string in 'M1' into a list
df['M1_list'] = df['M1'].apply(lambda x: x.split(","))
# use apply with two cols // axis=1 passes each ROW to the function
test['search_url'] = test.apply(lambda x: google_url(x['first_name'] + " " +x['last_name'] ), axis=1)


# transpose axes (rows become columns and vice versa)
df3 = df.transpose()


#
# group by
#
# The groupby output will have an index or multi-index on rows corresponding to your chosen grouping variables.
# To avoid setting this index, pass "as_index=False" to the groupby operation.
df2 = df.groupby('year', as_index=False)
df2.groups.keys()
group2003 = df2.get_group(2003)
group2003.head()

# counting on groupby https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
df.groupby('year', as_index=False)['id'].count()

# Groupby and sum only one column https://stackoverflow.com/questions/38985053/pandas-groupby-and-sum-only-one-column
df_by_concept = df.groupby('concept', as_index=False)['score'].sum()

# Using groupby to keep only the items whose group has more than 3 rows
df_top_journals = df.groupby('journal.title').filter(lambda x: len(x) > 3)

# Add a column that counts a variable in groupby
# (transform returns one value per original row, so it can be assigned as a column)
df['count'] = df.groupby('group')['group'].transform('count')

# add new column by counting unique instances in another column than the grouping one
gridaffiliations["tot_pubs"] = gridaffiliations.groupby(['aff_id'])['pub_id'].transform('nunique')

# group by two variables
gridaffiliations.groupby(['aff_id', 'pub_id']).count()

#
# Splitting dictionary/list inside a Pandas Column into Separate Columns

df['dict_column'].apply(pd.Series) # return new df only for those cols

pd.concat([df.drop(['dict_column'], axis=1), df['dict_column'].apply(pd.Series)], axis=1) # add to existing df


# Pandas Add Rows

In [None]:
# by appending a DICT
# https://www.w3resource.com/python-exercises/pandas/python-pandas-data-frame-exercise-26.php

d = {'col1': [1, 4, 3, 4, 5], 'col2': [4, 5, 6, 7, 8], 'col3': [7, 8, 9, 0, 1]}
df = pd.DataFrame(data=d)
print("Original DataFrame")
print(df)
print('After add one row:')
data = {'col1': 10, 'col2': 11, 'col3': 12}
# DataFrame.append() was removed in pandas 2.0: wrap the dict in a one-row frame and
# concatenate. ignore_index=True renumbers the result 0..n-1 (it does not keep the old labels).
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
print(df)



# appending more than one dict
# build every new row first, then add them all in a single operation
l = [{'col1': x[0], 'col2': x[1], 'col3': x[2]} for x in source_data]  # include all cols!
if l:
  # append the rows collected in `l` (the original appended the unrelated `data` dict);
  # pd.concat is used because DataFrame.append() was removed in pandas 2.0
  df = pd.concat([df, pd.DataFrame(l)], ignore_index=True)



#
# Generate a df by collecting rows first
# https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe
#
# one dict per input row; the frame is then created once from the full list
rows_list = [{"col1": 'val', "col2": "val"} for _ in input_rows]

df = pd.DataFrame(rows_list)

# Pandas Sort and Remove Duplicates Dataframe

https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/

In [None]:
# Signature, for reference (not meant to be executed as-is):
#   df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
# Arguments :

# by : A string or list of strings basically either column names or index labels based on which sorting will be done.
# axis : If axis is 0, then name or list of names in by argument will be considered as column names. Default is 0
# If axis is 1, then name or list of names in by argument will be considered as row index labels
# ascending : If True sort in ascending else sort in descending order. Default is True
# inplace : If True, perform operation in-place in Dataframe
# na_position : Decides the position of NaNs after sorting i.e. 'first' puts NaNs at the beginning, 'last' puts NaNs at the end
# Default value is 'last'

# sort in place and update the index as well
df.sort_values(by=["year"], inplace=True)
# reset_index returns a NEW frame unless inplace=True is passed
df.reset_index(drop=True, inplace=True)


# drop duplicated values of "First Name", keeping only the first occurrence of each
df.drop_duplicates(subset ="First Name",
                     keep = 'first', inplace = True)

# Pandas Iterate Dataframe

https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

In [None]:
# iterrows() yields one (index label, row) pair per row; values are read from
# the row by column name
for index, row in df.iterrows():
    print(row['c1'], row['c2'])

# Pandas Merge and Aggregate Dataframe

In [None]:
# outer merge: keeps the rows of both frames
df2 = pd.merge(dfy, dfyears_nl, how='outer')

# concatenating dataframes simply adds new rows at the bottom
# (pd.concat replaces DataFrame.append(), which was removed in pandas 2.0)
res = pd.concat([df1, df2, df3])
# then usually sort and reset index for visualizations etc...
res.rename(columns={'id':'years'}, inplace=True)
res.sort_values(by="years", inplace=True)
# reset_index returns a NEW frame unless inplace=True is passed
res.reset_index(drop=True, inplace=True)


# melt
formatted_df = pd.melt(df,
                       ["religion"],  # the columns to keep as is
                       var_name="income",  # the column grouping all melted columns
                       value_name="freq")  # the column counting the objects melted
formatted_df = formatted_df.sort_values(by=["religion"])
formatted_df.head(10)

# Pandas flatten JSON data

In [None]:
# public import location since pandas 1.0 (`pandas.io.json.json_normalize` is deprecated)
from pandas import json_normalize

dfjournals = json_normalize(data.publications) # returns a new column journal.title
dfjournals.reset_index()  # NOTE: returns a new frame; `dfjournals` itself is unchanged

#
# normalize a nested object
#
# ensure that all pubs have a valid (empty, even) 'FOR' key
for pub in data.publications:
    if 'FOR' not in pub:
        # must be a list: `record_path` rejects other non-null values such as ""
        pub['FOR'] = []
    else:
        # also remove the digit prefix to improve legibility
        pub['FOR'] = [{'name' : category['name'][5:]} for category in pub['FOR']]
# then
json_normalize(data.publications, record_path=['FOR'], meta=["doi", "title"], errors='ignore', record_prefix='for_').head()


# recursive applications
import json
json_normalize(json.loads(df_aff1.to_json(orient='records')), record_path=['affiliations'],
               meta=['id', 'researcher_id', 'first_name', 'last_name'], record_prefix='aff_')


# unpack a dict value into separate columns
# https://stackoverflow.com/questions/50512188/unpack-dictionary-from-pandas-column
concepts['concepts_scores'].dropna().apply(pd.Series)

# enrich original df with unpacked columns - need to dropna on original df first
# (`axis=1` is spelled out: the positional form of drop() was removed in pandas 2.0)
concepts.dropna(subset=['concepts_scores']).drop('concepts_scores', axis=1).assign(**concepts['concepts_scores'].dropna().apply(pd.Series))



# Pandas Update Data in Frame While Iterating

In [None]:
print("\n===\nCounting clinical_trials for each publication...")

# build str column version for checking inclusion
# (each list of publication ids becomes one comma-separated string)
clinical_trials['publication_ids_str'] = clinical_trials['publication_ids'].apply(lambda x: ','.join(map(str, x)))

def get_clinical_trials_per_pub(pubid):
    """Return the 'id' values of the clinical trials whose publication ids contain `pubid`.

    Reads the module-level `clinical_trials` frame and its 'publication_ids_str' column.
    """
    global clinical_trials
    # regex=False: ids such as "pub.1032163135" contain '.', which would otherwise be
    # interpreted as the regular-expression wildcard "any character"
    matches = clinical_trials['publication_ids_str'].str.contains(pubid, regex=False)
    return clinical_trials[matches]['id']

# using 'AT' method: create the two result columns, then fill them row by row
publications['clinical_trials_count'] = 0
publications['clinical_trials_ids'] = ""
for index, row in pbar(publications.iterrows(), total=publications.shape[0]):
    match_clinical_trials = get_clinical_trials_per_pub(row['id'])
    publications.at[index,'clinical_trials_count'] = len(match_clinical_trials)
    publications.at[index,'clinical_trials_ids'] = list(match_clinical_trials)

# Python strip punctuation

In [None]:
import string
s = "&&**^^hello"
# translation table mapping every punctuation character to None, i.e. "delete it"
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
s.translate(punctuation_table)

# Pandas avoid setting with copy warning

The warning was generated because we have chained two indexing operations together. This is made easier to spot because we’ve used square brackets twice, but the same would be true if we used other access methods such as .bidderrate, .loc[], .iloc[], .ix[] and so on.

* https://www.dataquest.io/blog/settingwithcopywarning/

In [None]:
#
# explicit chaining
#
# ANTI-PATTERN (kept on purpose as the "what not to do" example): two indexing
# operations chained together, followed by an assignment
data[data.bidder == 'parakeet2004']['bidderrate'] = 100
# instead we do: one single .loc call selecting rows AND column
data.loc[data.bidder == 'parakeet2004', 'bidderrate'] = 100

#
# hidden chaining
#
# the selection and the later assignment happen on different lines, so the chain is less visible
winners = data.loc[data.bid == data.price]
winners.loc[304, 'bidder'] = 'therealname' # => warning
# instead we do: take an explicit copy, then modify the copy
winners = data.loc[data.bid == data.price].copy()
winners.loc[304, 'bidder'] = 'therealname' # => works!