# NanoSafety data summary

- using aggregated search

In [None]:
import yaml
from solrscope import aa
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import requests
from importlib import reload 
from solrscope import client_solr
from solrscope import client_ambit
from solrscope import annotation
import pandas as pd
import numpy as np
import json
import pandas as pd
#import qgrid
import warnings
warnings.simplefilter("ignore")

In [None]:
# Build the service-selection UI: a dropdown of the server URLs reported by
# aa.parseOpenAPI3(), plus an API-key text box when a security scheme is set.
print('Select enanoMapper aggregated search service:')
style = dict(description_width='initial')
(config, config_servers, config_security,
 auth_object, msg) = aa.parseOpenAPI3()

service_widget = widgets.Dropdown(
    options=config_servers['url'], description='Service:', disabled=False, style=style)

if config_security is not None:
    # Protected service: show the message returned by parseOpenAPI3 and ask
    # for the key; apikey_widget is only defined on this branch.
    print(msg)
    apikey_widget = widgets.Text(
        placeholder='', description=config_security, disabled=False, style=style)
    service = interactive(
        aa.search_service_protected, url=service_widget, apikey=apikey_widget)
else:
    service = interactive(aa.search_service_open, url=service_widget)

display(service)

In [None]:
# Freeze the current widget selections into plain variables used by the
# query cells below.
service_uri = service_widget.value
if auth_object is not None:
    # apikey_widget only exists when the service declared a security scheme
    # (see the service-selection cell).
    auth_object.setKey(apikey_widget.value)
print(f"Sending queries to {service_uri}")
facets = client_solr.Facets()


In [None]:
    # Free-text query box, pre-filled with the match-all query "*:*".
    query_widget = widgets.Text(
        value="*:*", placeholder='', description="Query", disabled=False, style=style)
    display(query_widget)

In [None]:
# Facet field names used for the "method" dimension of the summaries below.
# method_field is part of summary()'s default facet list; method_synonym is
# passed explicitly by the cells that need it.
method_field="E.method_s"
method_synonym="E.method_synonym_ss"
# Alternative method field (disabled):
# method_field="guidance_s"

def summary(query=None, fq="type_s:study", statistics="Number of data points", fields=None, log_query=None, log_result=None):
    """Run a faceted query and return the facet counts as a DataFrame.

    Parameters
    ----------
    query : str, optional
        Query string. Defaults to the current value of ``query_widget``,
        read when the function is called (not when it is defined).
    fq : str
        Filter query passed to ``facets.getQuery``.
    statistics : str
        Name of the column that receives the facet counts.
    fields : sequence of str, optional
        Facet fields, in nesting order. Defaults to top category, endpoint
        category, ``method_field``, substance type, public name and
        reference owner.
    log_query, log_result : callable, optional
        If given, called with the built query / the decoded JSON response.

    Returns
    -------
    pandas.DataFrame or None
        One row per complete facet path with one column per entry of
        ``fields`` plus ``statistics``; annotation columns
        (``substanceType_name``, ``endpointcategory_term``,
        ``endpointcategory_name``, ``method_term``) are added when their
        source column is present. ``None`` (after printing the status code)
        when the service does not answer with HTTP 200.
    """
    if query is None:
        query = query_widget.value
    if fields is None:
        fields = ["topcategory_s", "endpointcategory_s", method_field,
                  "substanceType_s", "publicname_s", "reference_owner_s"]

    # "Z" is a placeholder for the leading element of each collected tuple;
    # it is dropped again once the frame is built.
    colnames = ["Z", *fields, statistics]
    _stats = []
    # NOTE(review): re-imports client_solr on every call; kept from the
    # original, presumably a development convenience.
    reload(client_solr)

    def process(prefix, val, count, key, _tuple):
        # Keep only tuples that already have len(fields) elements, then
        # append the current value and its count.
        if len(_tuple) == len(fields):
            _stats.append((*_tuple, val, count))

    q = facets.getQuery(query=query, facets=fields, fq=fq)
    if log_query is not None:
        log_query(q)

    r = client_solr.post(service_uri, query=q, auth=auth_object)
    if r.status_code != 200:
        # Check the status before decoding: an error body is not guaranteed
        # to be JSON.
        print(r.status_code)
        return None

    response_json = r.json()
    if log_result is not None:
        log_result(response_json)
    facets.parse(response_json['facets'], prefix=">", process=process)
    df = pd.DataFrame(_stats, columns=colnames).drop("Z", axis=1)

    # Either substance-type column may be present depending on the facet
    # fields requested; if both are, the "_hs" one wins (as before).
    for substance_col in ("substanceType_s", "substanceType_hs"):
        if substance_col in df.columns:
            a = annotation.DictionarySubstancetypes()
            df['substanceType_name'] = df[substance_col].apply(a.annotate)

    if "endpointcategory_s" in df.columns:
        a = annotation.DictionaryEndpointCategory()
        df['endpointcategory_term'] = df['endpointcategory_s'].apply(a.annotate)
        a = annotation.DictionaryEndpointCategoryNames()
        df['endpointcategory_name'] = df['endpointcategory_s'].apply(a.annotate)

    # The guard must test the source column; testing for "method_term"
    # (the column being created) meant the annotation never ran.
    if method_field in df.columns:
        a = annotation.DictionaryAssays()
        df['method_term'] = df[method_field].apply(a.annotate)
    return df
    
# Facet counts broken down by category, endpoint, method (synonym and raw),
# substance type, material name and data owner.
df = summary(
    query=query_widget.value,
    fields=[
        "topcategory_s",
        "endpointcategory_s",
        method_synonym,
        method_field,
        "substanceType_s",
        "publicname_s",
        "reference_owner_s",
    ],
)
df.head()

In [None]:
# Data-point counts: rows are (category, endpoint, method synonym, method),
# columns are (substance type, material name); empty cells shown as "".
# aggfunc is the string "sum" rather than np.sum: pandas deprecates passing
# the NumPy callable and recommends the string to keep the same behaviour.
table = pd.pivot_table(
    df,
    values='Number of data points',
    index=['topcategory_s', 'endpointcategory_name', method_synonym, method_field],
    columns=['substanceType_s', "publicname_s"],
    aggfunc="sum",
).fillna("")
display(table)

table.reset_index().to_excel("method_material.xlsx")

In [None]:
def _join_as_text(values):
    """Concatenate all values of a pivot cell into one space-separated string."""
    return ' '.join(map(str, values))


# Same layout as the counts table, but each cell lists the data owners.
table = pd.pivot_table(
    df,
    values='reference_owner_s',
    index=['topcategory_s', 'endpointcategory_name', method_field],
    columns=['substanceType_name', 'publicname_s'],
    aggfunc=_join_as_text,
).fillna("")

display(table)



In [None]:
# Flatten the row index into ordinary columns before writing the sheet.
flat_table = table.reset_index()
flat_table.to_excel("method_material_provider.xlsx")

# Misc statistics 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Make seaborn's "Set1" palette the default colour cycle.
# Alternative: sns.set_palette(sns.color_palette("hls", 8))
set1_palette = sns.color_palette("Set1")
sns.set_palette(set1_palette)


In [None]:
# Per-project statistics: same study-level query, faceted by project
# (owner_name_s) instead of material name.
df = summary(
    query=query_widget.value,
    fq="type_s:study",
    statistics="Number of data points",
    fields=["topcategory_s", "endpointcategory_s", method_field,
            "owner_name_s", "reference_owner_s", "substanceType_s"],
)
# Friendlier column names for the tables and plots below.
df = df.rename(columns={
    "owner_name_s": "Project",
    "topcategory_s": "Study type",
    "substanceType_name": "NM type",
})
df.head()

In [None]:
#use this to find where method field is missing
#table = pd.pivot_table(df, values=method_field, index=['topcategory_s','endpointcategory_name'], columns=[ 'owner_name_s'], aggfunc=lambda x: ' '.join(str(v) for v in x)).fillna("")

# Data points per study type (rows) and project (columns).
# "sum" replaces np.sum: pandas deprecates the NumPy callable and recommends
# the string to keep the same behaviour.
table = pd.pivot_table(
    df,
    values="Number of data points",
    index=['Study type'],
    columns=['Project'],
    aggfunc="sum",
).fillna(0)
display(table)
p = table.plot(kind="bar", figsize=(10, 10), sharey=True, title="Number of data points", grid=True)
fig = p.get_figure()
fig.savefig("studytype_datapoints.png")

## Methods summary

In [None]:
#Methods summary
#table = pd.pivot_table(df, values=method_field, index=['Study type','endpointcategory_name'], columns=[ 'Project'], aggfunc=lambda x: ' '.join(str(v) for v in x)).fillna("")

# Distinct methods per (study type, endpoint) and project, joined with ';'.
# The distinct values are sorted so the cell text is the same on every run;
# joining a bare set gives an arbitrary order that changes between sessions.
table = pd.pivot_table(
    df,
    values=method_field,
    index=['Study type', 'endpointcategory_name'],
    columns=['Project'],
    aggfunc=lambda x: ';'.join(sorted(set(str(v) for v in x))),
).fillna("")
table.head(20)

## Number of data points per material and project

In [None]:
# Data points per nanomaterial type (rows) and project (columns), drawn as
# one bar-chart subplot per project with a shared y axis.
# "sum" replaces np.sum: pandas deprecates the NumPy callable and recommends
# the string to keep the same behaviour.
table = pd.pivot_table(
    df,
    values="Number of data points",
    index=['NM type'],
    columns=['Project'],
    aggfunc="sum",
).fillna(0)
display(table)
p = table.plot(kind="bar", figsize=(20, 10), title="Number of data points", grid=True, stacked=False, sharey=True, subplots=True)
# NOTE(review): with subplots=True plot() returns an array of axes, so the
# lines below would need adapting before being re-enabled.
#fig = p.get_figure()
#fig.savefig("nmtype_datapoints.png")

## Number of data points per material and study type

In [None]:
# Data points per study type (rows) and nanomaterial type (columns), drawn
# as a single grouped bar chart.
# "sum" replaces np.sum: pandas deprecates the NumPy callable and recommends
# the string to keep the same behaviour.
table = pd.pivot_table(
    df,
    values="Number of data points",
    columns=['NM type'],
    index=['Study type'],
    aggfunc="sum",
).fillna(0)
display(table)
p = table.plot(kind="bar", figsize=(20, 10), title="Number of data points", grid=True, stacked=False, sharey=True, subplots=False)
#fig = p.get_figure()
#fig.savefig("nmtype_datapoints.png")

# Materials

In [None]:
import matplotlib.pyplot as plt
# Two-colour palette used as the colormap of the presence/absence heatmaps.
binary = [
    "#ff0000",  # red
    "#00ff00",  # green
]
sns.set_palette(binary)
# To preview the active palette: sns.palplot(sns.color_palette())
  
def plot_heatmap(table,q):
    """Draw `table` as a two-colour heatmap titled `q` and save it as `q`.png.

    A new 10x10 inch figure is created, the heatmap is drawn with the
    module-level `binary` colormap, and the figure is written to the current
    working directory. Returns the axes object produced by `sns.heatmap`.
    """
    sns.set_style('ticks')
    figure, axes = plt.subplots()
    figure.set_size_inches(10, 10)
    # Alternative, smaller size: figure.set_size_inches(11.7/4, 8.27/4)
    heatmap_axes = sns.heatmap(table, cmap=binary)
    heatmap_axes.set_title(q)
    output_path = q + ".png"
    heatmap_axes.get_figure().savefig(output_path)
    return heatmap_axes


In [None]:
    # Separate query box for the materials section, pre-filled with "*:*".
    query_widget_header = widgets.Text(
        value="*:*", placeholder='', description="Query", disabled=False, style=style)
    display(query_widget_header)

In [None]:
# Substance-level counts per project, substance type and material name.
df = summary(
    query=query_widget_header.value,
    fq="type_s:substance",
    statistics="Number of NM",
    fields=["owner_name_hs", "substanceType_hs", "publicname_hs"],
)
df = df.rename(columns={
    "owner_name_hs": "Project",
    "topcategory_s": "Study type",
    "substanceType_name": "NM type",
})

In [None]:
# Presence/absence matrix: which project holds which material.
table = pd.pivot_table(
    df,
    values="Number of NM",
    index=['publicname_hs'],
    columns=['Project'],
    aggfunc='any',
).fillna(False)
plot_heatmap(table, "NM")

# Endpoints

In [None]:
def queryEndpoints(_query="topcategory_s:TOX"):
    """Summarise endpoints per nanomaterial type for the given query.

    Prints the query, runs `summary` on study records faceted by category,
    endpoint category, effect endpoint, substance type and material name,
    and renames the columns for display.

    Returns a `(df, table)` pair: the renamed summary frame and a
    presence/absence pivot with 'NM type' rows and 'endpoint' columns.
    """
    print(_query)
    facet_fields = ["topcategory_s", "endpointcategory_s", "effectendpoint_s",
                    "substanceType_s", "publicname_s"]
    df = summary(query=_query, fq="type_s:study",
                 statistics="Number of data points", fields=facet_fields)
    # Keys that are not columns of df (e.g. "owner_name_s" here) are ignored
    # by rename.
    column_labels = {
        "owner_name_s": "Project",
        "topcategory_s": "Study type",
        "substanceType_name": "NM type",
        "endpointcategory_name": "endpoint",
    }
    df = df.rename(columns=column_labels)
    table = pd.pivot_table(
        df,
        values="Number of data points",
        index=['NM type'],
        columns=['endpoint'],
        aggfunc='any',
    ).fillna(False)
    return df, table



In [None]:
# Endpoint coverage for toxicity studies, restricted by the user query.
# Solr's standard query parser only recognises upper-case boolean operators,
# so the conjunction is "AND"; the user query is parenthesised so that any
# operators inside it cannot change how the category restriction is applied.
q = "TOX"
df, table = queryEndpoints(_query=f"topcategory_s:{q} AND ({query_widget.value})")
display(df.head())
display(table)
plot_heatmap(table, q)

In [None]:
# Endpoint coverage for physico-chemical studies, restricted by the user
# query. Solr's standard query parser only recognises upper-case boolean
# operators, so the conjunction is "AND"; the user query is parenthesised so
# that operators inside it cannot change how the category restriction applies.
q = "P-CHEM"
df, table = queryEndpoints(_query=f"topcategory_s:{q} AND ({query_widget.value})")
plot_heatmap(table, q)

In [None]:
# Endpoint coverage for ecotoxicity studies, restricted by the user query.
# Solr's standard query parser only recognises upper-case boolean operators,
# so the conjunction is "AND"; the user query is parenthesised so that
# operators inside it cannot change how the category restriction applies.
q = "ECOTOX"
df, table = queryEndpoints(_query=f"topcategory_s:{q} AND ({query_widget.value})")
try:
    plot_heatmap(table, q)
except Exception as err:
    # Plotting stays best-effort, but the failure is reported instead of
    # being silently discarded.
    print(f"Could not plot {q} heatmap: {err}")