# QCBrowser Demo

In [1]:
import pandas as pd
import qcportal as ptl

from main import create_dataset_browser



In [2]:
# Address of the QCArchive server this demo talks to; no credentials are passed.
QCARCHIVE_ADDRESS = "https://api.qcarchive.molssi.org"
client = ptl.PortalClient(QCARCHIVE_ADDRESS)

In [3]:
# Ask the server for its dataset listing, load it into a table, and preview it.
dataset_listing = client.list_datasets()
datasets = pd.DataFrame(dataset_listing)
datasets.head()

Unnamed: 0,id,dataset_type,dataset_name,record_count
0,35,torsiondrive,OpenFF Fragmenter Phenyl Benchmark,454
1,36,torsiondrive,OpenFF Group1 Torsions,820
2,41,optimization,OpenFF Optimization Set 1,937
3,42,torsiondrive,Fragment Stability Benchmark,86
4,43,optimization,SMIRNOFF Coverage Set 1,1132


In [4]:
# Keep only the singlepoint datasets and rank them from most to fewest records.
is_singlepoint = datasets["dataset_type"] == "singlepoint"
singlepoints = datasets.loc[is_singlepoint].sort_values("record_count", ascending=False)
singlepoints.head(20)

Unnamed: 0,id,dataset_type,dataset_name,record_count
149,391,singlepoint,MLPepper RECAP Optimized Fragments v1.0,137932
131,373,singlepoint,RNA Trinucleotide Single Point Dataset v1.0,81670
107,323,singlepoint,OpenFF Theory Benchmarking Single Point Energi...,66552
116,347,singlepoint,OpenFF ESP Fragment Conformers v1.0,65116
17,152,singlepoint,OpenFF VEHICLe Set 1,48280
120,357,singlepoint,OpenFF ESP Industry Benchmark Set v1.1,39983
111,329,singlepoint,TorsionNet500 Single Points Dataset v1.0,24000
133,375,singlepoint,RNA Nucleoside Single Point Dataset v1.0,19110
20,159,singlepoint,OpenFF Discrepancy Benchmark 1,18864
129,371,singlepoint,RNA Single Point Dataset v1.0,13467


In [8]:
# id 357 is "OpenFF ESP Industry Benchmark Set v1.1" in the singlepoint table above.
dataset_id = 357
ds = client.get_dataset_by_id(dataset_id)

## Dataset Browser Widget

In [9]:
# Build the browser object for the dataset fetched above.
dsb = create_dataset_browser(ds)

In [10]:
# A bare expression as the last line renders the browser widget as the cell output.
dsb

VBox(children=(HTML(value='\n        <div style="\n            background: #f5f5f5;\n            border: 1px s…

## Dataset Browser Pandas Functions

In [11]:
# Summarise the dataset's specifications as a DataFrame.
dsb.get_specifications()

Unnamed: 0,Specification Name,Program,Method,Basis,Num Complete,Num Error,Num Invalid,Protocols,Properties
0,spec_1,psi4,hf,6-31g*,39983,0,0,{'wavefunction': 'WavefunctionProtocolEnum.orb...,"[pe energy, scf dipole, calcinfo_nmo, mbis cha..."


In [12]:
# Get records as a DataFrame: one row per entry and one column per
# specification. `stop=10` keeps this preview to the first ten entries.
dsb.get_records(stop=10)

Unnamed: 0,Entry Name,spec_1
0,c1cnccc1CO,<SinglepointRecord id=102602373 status=RecordS...
1,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-0,<SinglepointRecord id=102591094 status=RecordS...
2,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-1,<SinglepointRecord id=102590998 status=RecordS...
3,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-2,<SinglepointRecord id=102591106 status=RecordS...
4,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-3,<SinglepointRecord id=102591068 status=RecordS...
5,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-4,<SinglepointRecord id=102591052 status=RecordS...
6,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-5,<SinglepointRecord id=102591103 status=RecordS...
7,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-6,<SinglepointRecord id=102591148 status=RecordS...
8,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-7,<SinglepointRecord id=102591048 status=RecordS...
9,C[C@@H](CC(=O)O)C(=O)c1ccc(c(c1)O)NCc2ccccc2-0,<SinglepointRecord id=102591134 status=RecordS...


In [None]:
# Fetch entries as a DataFrame. The two flags request an OpenFF molecule
# column and an RDKit molecule column next to each entry name.
# NOTE: retrieving this many entries may take a few minutes.
mol_df = dsb.get_entries(
    stop=10000,
    get_openff=True,
    get_rdkit=True,
)

In [19]:
# Preview the first rows (at most 20) of the entries table.
mol_df.head(20)

Unnamed: 0,Entry Name,OpenFFMol,RDKit Molecule
0,c1cnccc1CO,Molecule with name '' and SMILES '[H][O][C]([H...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc4d8770>
1,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-0,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc436250>
2,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-1,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80ec420310>
3,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-2,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc521d00>
4,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-3,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80ec3e9620>
5,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-4,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc3bf6f0>
6,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-5,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc3a80e0>
7,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-6,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80ec448b80>
8,c1cc(ccc1C(=O)N)NC(=O)CN2CCC(CC2)NC(=O)O-7,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc3b5800>
9,C[C@@H](CC(=O)O)C(=O)c1ccc(c(c1)O)NCc2ccccc2-0,Molecule with name '' and SMILES '[H][O][C](=[...,<rdkit.Chem.rdchem.Mol object at 0x7f80cc3b6520>


In [None]:
# RDKit molecules make substructure searches straightforward.

from rdkit import Chem

# SMARTS for an aliphatic nitrogen with three total connections. This matches
# amines, but it also matches amide and other three-connected nitrogens.
smarts_pattern = "[NX3]"
query = Chem.MolFromSmarts(smarts_pattern)


def _matches_query(mol):
    """Return whether ``mol`` is not None and contains the ``query`` substructure."""
    return mol is not None and mol.HasSubstructMatch(query)


# Keep only the rows whose RDKit molecule contains the query substructure;
# rows with a missing (None) molecule are dropped.
has_match = mol_df["RDKit Molecule"].apply(_matches_query)
mol_df_filtered = mol_df[has_match]

mol_df_filtered.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9259 entries, 1 to 9989
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Entry Name      9259 non-null   object
 1   OpenFFMol       9259 non-null   object
 2   RDKit Molecule  9259 non-null   object
dtypes: object(3)
memory usage: 289.3+ KB
