# Run workflow up to storing in database

In [1]:
from pathlib import Path
from pprint import pprint

from protein_detective.uniprot import Query
from protein_detective.workflow import retrieve_structures

In [2]:
import logging

logging.basicConfig(level=logging.WARNING)
# Set to WARNING to see only warnings
# Set to INFO to see sparql queries
# Set to DEBUG to see raw results

In [3]:
session_dir = Path("session1")
session_dir

PosixPath('session1')

## Fetch structures from Uniprot, PDBe and Alphafold


In [4]:
query = Query(
    taxon_id="9606",
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [5]:
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(query, session_dir, limit=70)
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe files: 100%|██████████| 13/13 [00:00<00:00, 40.83it/s]
Fetching Alphafold summaries: 100%|██████████| 61/61 [00:02<00:00, 25.46it/s]
Downloading AlphaFold files: 100%|██████████| 122/122 [00:01<00:00, 117.49it/s]


(PosixPath('session1/downloads'), 13, 61)

## Filter out AlphafoldDB structures with low confidence

And write PDB files without those low confidence residues to new directory.

In [19]:
from protein_detective.alphafold.density import DensityFilterQuery
from protein_detective.workflow import density_filter

dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [20]:
result = density_filter(session_dir, dquery)
pprint(result)

DensityFilterSessionResult(density_filtered_dir=PosixPath('session1/density_filtered'),
                           nr_kept=28,
                           nr_discarded=33)


## Prune PDBe files
Prune the PDB files to only keep the first chain of the found Uniprot entries and rename that chain to A.

In [7]:
from protein_detective.workflow import prune_pdbs

(single_chain_dir, nr_passed) = prune_pdbs(session_dir)
(single_chain_dir, nr_passed)

Saving single chain PDB files from PDBe: 100%|██████████| 13/13 [00:02<00:00,  5.98it/s]


(PosixPath('session1/single_chain'), 13)

### Query session database

In [8]:
from protein_detective.db import db_path

database = db_path(session_dir)

In [10]:
import duckdb

%load_ext sql
conn = duckdb.connect(database)
%sql conn --alias duckdb

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [11]:
%sql SELECT * FROM proteins

uniprot_acc
A0A1W2PPF3
A8MQ14
A8MUZ8
A0A1W2PPK0
A0A5F9ZHS7
A6NDZ8
A6NJL1
A8MTJ6
A0A0U1RQI7
A0A1B0GTS1


In [12]:
%sql SELECT * FROM pdbs

pdb_id,method,resolution,pdb_file
4E45,X-Ray_Crystallography,2.0,session1/downloads/pdb4e45.ent.gz
7XHN,Electron_Microscopy,3.7100000381469727,session1/downloads/pdb7xhn.ent.gz
4NDY,X-Ray_Crystallography,7.0,session1/downloads/pdb4ndy.ent.gz
4NE6,X-Ray_Crystallography,2.0999999046325684,session1/downloads/pdb4ne6.ent.gz
7XHO,Electron_Microscopy,3.289999961853028,session1/downloads/pdb7xho.ent.gz
7R5S,Electron_Microscopy,2.8299999237060547,session1/downloads/pdb7r5s.ent.gz
4DRA,X-Ray_Crystallography,2.4100000858306885,session1/downloads/pdb4dra.ent.gz
4NE5,X-Ray_Crystallography,2.5,session1/downloads/pdb4ne5.ent.gz
4NE1,X-Ray_Crystallography,6.5,session1/downloads/pdb4ne1.ent.gz
4E44,X-Ray_Crystallography,2.0999999046325684,session1/downloads/pdb4e44.ent.gz


In [13]:
%sql SELECT * FROM proteins_pdbs

uniprot_acc,pdb_id,chain,single_chain_pdb_file
A8MT69,4E45,B/D/G/I/L/N=1-81,session1/single_chain/A8MT69_pdb4e45.ent_B2A.pdb
A8MT69,7XHN,X=1-81,session1/single_chain/A8MT69_pdb7xhn.ent_X2A.pdb
A8MT69,4NDY,B/D/H/L/M/N/U/V/W/X=8-81,session1/single_chain/A8MT69_pdb4ndy.ent_B2A.pdb
A8MT69,4NE6,B/D=8-81,session1/single_chain/A8MT69_pdb4ne6.ent_B2A.pdb
A8MT69,7XHO,X=1-81,session1/single_chain/A8MT69_pdb7xho.ent_X2A.pdb
A8MT69,7R5S,X=1-81,session1/single_chain/A8MT69_pdb7r5s.ent_X2A.pdb
A8MT69,4DRA,E/F/G/H=1-81,session1/single_chain/A8MT69_pdb4dra.ent_E2A.pdb
A8MT69,4NE5,B/D/F/H=8-81,session1/single_chain/A8MT69_pdb4ne5.ent_B2A.pdb
A8MT69,4NE1,B/D/H/L/M/N/U/V/W/X/Z/b/d/h/i/j/o/p/q/r=8-81,session1/single_chain/A8MT69_pdb4ne1.ent_B2A.pdb
A8MT69,4E44,B/D=1-81,session1/single_chain/A8MT69_pdb4e44.ent_B2A.pdb


In [14]:
%sql SELECT * FROM alphafolds LIMIT 1

uniprot_acc,summary,pdb_file,pae_file
A0A1W2PPF3,"{""entryId"":""AF-A0A1W2PPF3-F1"",""gene"":""DUXB"",""sequenceChecksum"":""D34EEAFD50A88A34"",""sequenceVersionDate"":""2017-06-07"",""uniprotAccession"":""A0A1W2PPF3"",""uniprotId"":""DUXB_HUMAN"",""uniprotDescription"":""Double homeobox protein B"",""taxId"":9606,""organismScientificName"":""Homo sapiens"",""uniprotStart"":1,""uniprotEnd"":345,""uniprotSequence"":""MNLEGTSGGILQKEFWRNRIQYNQSQKDILQSWFQHDPFPDKAAREQLAKEIGVPESNIQVWFKNYRVKQRKLDYKCFSEKDQTQGHDQSQHLTQEYLPKEARQKQTFITWTQKNRLVQAFERNPFPDIATRKKLAEQTGLQESRIQMWFQKQRSLYLKKSRMEPMNLLVDDPNERPDATVGWHPINLFLPTDSSHYFSCSHSSSGHETLPPVLPSTQAPWDPFRFHVSQGPNVMIMQPTQAVQEGEKSDQPLIIPNHLLTLPILTKDLDTPTPFWLQYQEEHQNHKEHSGSGVPQVKSHSQPEPEHREQQPLNLGQFDISNILQRWDEICQALLAEWDPLKGTH"",""modelCreatedDate"":""2022-06-01"",""latestVersion"":4,""allVersions"":[1,2,3,4],""bcifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-model_v4.bcif"",""cifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-model_v4.cif"",""pdbUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-model_v4.pdb"",""paeImageUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-predicted_aligned_error_v4.png"",""paeDocUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-predicted_aligned_error_v4.json"",""amAnnotationsUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-aa-substitutions.csv"",""amAnnotationsHg19Url"":null,""amAnnotationsHg38Url"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPF3-F1-hg38.csv"",""isReviewed"":true,""isReferenceProteome"":true}",session1/downloads/AF-A0A1W2PPF3-F1-model_v4.pdb,session1/downloads/AF-A0A1W2PPF3-F1-predicted_aligned_error_v4.json


In [15]:
%sql SELECT count(*) FROM alphafolds

count_star()
61


In [16]:
# Fetch fields from inside summary
%sql SELECT uniprot_acc, summary.taxId, summary.uniprotStart, summary.uniprotEnd, summary.gene FROM alphafolds

uniprot_acc,taxId,uniprotStart,uniprotEnd,gene
A0A1W2PPF3,9606,1,345,"""DUXB"""
A8MQ14,9606,1,1090,"""ZNF850"""
A8MUZ8,9606,1,300,"""ZNF705G"""
A0A1W2PPK0,9606,1,400,"""Unknown"""
A0A5F9ZHS7,9606,1,289,"""NFILZ"""
A6NDZ8,9606,1,208,"""MBD3L4"""
A6NJL1,9606,1,495,"""ZSCAN5B"""
A8MTJ6,9606,1,420,"""FOXI3"""
A0A0U1RQI7,9606,1,1052,"""KLF18"""
A0A1B0GTS1,9606,1,333,"""HSFX4"""


In [21]:
%config SqlMagic.displaylimit = 100

In [22]:

%%sql
SELECT 
f.confidence, f.min_threshold, f.max_threshold,
density_filtered_alphafolds.*, 
alphafolds.summary.uniprotStart, 
alphafolds.summary.uniprotEnd, 
length(alphafolds.summary.uniprotSequence) AS uniprot_length
FROM density_filtered_alphafolds
JOIN density_filters  AS f USING (density_filter_id) 
JOIN alphafolds USING (uniprot_acc)
LIMIT 100;

confidence,min_threshold,max_threshold,density_filter_id,uniprot_acc,nr_residues_above_confidence,keep,pdb_file,uniprotStart,uniprotEnd,uniprot_length
70.0,100,500,1,A0A1W2PPF3,124,True,session1/density_filtered/AF-A0A1W2PPF3-F1-model_v4.pdb,1,345,347
70.0,100,500,1,A8MQ14,811,False,,1,1090,1092
70.0,100,500,1,A8MUZ8,137,True,session1/density_filtered/AF-A8MUZ8-F1-model_v4.pdb,1,300,302
70.0,100,500,1,A0A1W2PPK0,71,False,,1,400,402
70.0,100,500,1,A0A5F9ZHS7,71,False,,1,289,291
70.0,100,500,1,A6NDZ8,64,False,,1,208,210
70.0,100,500,1,A6NJL1,220,True,session1/density_filtered/AF-A6NJL1-F1-model_v4.pdb,1,495,497
70.0,100,500,1,A8MTJ6,94,False,,1,420,422
70.0,100,500,1,A0A0U1RQI7,192,True,session1/density_filtered/AF-A0A0U1RQI7-F1-model_v4.pdb,1,1052,1054
70.0,100,500,1,A0A1B0GTS1,116,True,session1/density_filtered/AF-A0A1B0GTS1-F1-model_v4.pdb,1,333,335
