# Run workflow up to storing in database

In [1]:
from pathlib import Path
from pprint import pprint

from protein_detective.uniprot import Query
from protein_detective.workflow import retrieve_structures

In [None]:
import logging

logging.basicConfig(level=logging.WARNING)
# Set to WARNING to see only warnings
# Set to INFO to see sparql queries
# Set to DEBUG to see raw results

In [3]:
session_dir = Path("session1")
session_dir

PosixPath('session1')

## Fetch structures from Uniprot, PDBe and Alphafold


In [4]:
query = Query(
    taxon_id="9606",
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [5]:
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(query, session_dir, limit=70)
download_path, nr_pdbs, nr_alphafolds

INFO:protein_detective.uniprot:Executing SPARQL query for UniProt: 
        PREFIX up: <http://purl.uniprot.org/core/>
        PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX GO:<http://purl.obolibrary.org/obo/GO_>

        SELECT ?protein
        WHERE {

        # --- Protein Selection ---
        ?protein a up:Protein .
        ?protein up:organism taxon:9606 .
?protein up:reviewed true .

            {

?protein up:annotation ?subcellAnnotation .
?subcellAnnotation up:locatedIn/up:cellularComponent ?cellcmpt .
?cellcmpt skos:prefLabel "nucleus" .

            } UNION {

?protein up:classifiedWith|(up:classifiedWith/rdfs:subClassOf) GO:0005634 .

            }


?protein up:classifiedWith|(up:classifiedWith/rdfs:subClassOf) GO:0003677 .


        }
        LIMIT 70

INFO:protein_det

({'7XHN', '4NE5', '4NDY', '4NE3', '4NE1', '7R5S', '4E45', '7YWX', '4NE6', '7XHO', '4E44', '4DRB', '4DRA'}, [PosixPath('session1/downloads/pdb4ne1.ent.gz'), PosixPath('session1/downloads/pdb4ndy.ent.gz'), PosixPath('session1/downloads/pdb4e45.ent.gz'), PosixPath('session1/downloads/pdb4drb.ent.gz'), PosixPath('session1/downloads/pdb7ywx.ent.gz'), PosixPath('session1/downloads/pdb7r5s.ent.gz'), PosixPath('session1/downloads/pdb4ne3.ent.gz'), PosixPath('session1/downloads/pdb4ne6.ent.gz'), PosixPath('session1/downloads/pdb4dra.ent.gz'), PosixPath('session1/downloads/pdb4ne5.ent.gz'), PosixPath('session1/downloads/pdb7xhn.ent.gz'), PosixPath('session1/downloads/pdb7xho.ent.gz'), PosixPath('session1/downloads/pdb4e44.ent.gz')])


Fetching Alphafold summaries: 100%|██████████| 61/61 [00:02<00:00, 22.76it/s]
Downloading AlphaFold files: 100%|██████████| 122/122 [00:00<00:00, 151.50it/s]


(PosixPath('session1/downloads'), 13, 61)

## Filter out AlphafoldDB structures with low confidence

And write PDB files without those low confidence residues to new directory.

In [None]:
from protein_detective.alphafold.density import DensityFilterQuery
from protein_detective.workflow import density_filter

dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [7]:
result = density_filter(session_dir, dquery)
pprint(result)

DensityFilterSessionResult(density_filtered_dir=PosixPath('session1/density_filtered'),
                           nr_kept=5,
                           nr_discarded=5)


## Prune PDBe files
Prune the PDB files to only keep the first chain of the found Uniprot entries and rename that chain to A.

In [None]:
from pathlib import Path

from protein_detective.pdbe.io import write_single_chain_pdb_file


In [7]:
write_single_chain_pdb_file('session1/downloads/pdb4ne5.ent.gz', "X=1-81", Path('/tmp/pdb4ne5.A.pdb'))

INFO:protein_detective.pdbe.io:{'pdb_file': 'session1/downloads/pdb4ne5.ent.gz', 'uniprot_chain': 'X=1-81', 'chain2keep': 'X', 'out_chain': 'A'}


AttributeError: 'NoneType' object has no attribute 'copy'

In [8]:
import atomium

In [9]:
pdb = atomium.open('session1/downloads/pdb4ne5.ent.gz')

In [14]:
pdb.model.chains()

{<Chain A (93 residues)>,
 <Chain B (74 residues)>,
 <Chain C (93 residues)>,
 <Chain D (74 residues)>,
 <Chain E (93 residues)>,
 <Chain F (74 residues)>,
 <Chain G (93 residues)>,
 <Chain H (74 residues)>}

In [4]:
from protein_detective.workflow import prune_pdbs

(single_chain_dir, nr_passed) = prune_pdbs(session_dir)
(single_chain_dir, nr_passed)

Saving single chain PDB files from PDBe:   0%|          | 0/13 [00:00<?, ?it/s]INFO:protein_detective.pdbe.io:Output file session1/single_chain/A8MT69_pdb7xhn.ent.pdb already exists. Skipping saving single chain PDB file for session1/downloads/pdb7xhn.ent.gz.
INFO:protein_detective.pdbe.io:Output file session1/single_chain/A8MT69_pdb4ndy.ent.pdb already exists. Skipping saving single chain PDB file for session1/downloads/pdb4ndy.ent.gz.
INFO:protein_detective.pdbe.io:{'pdb_file': PosixPath('session1/downloads/pdb4ne5.ent.gz'), 'uniprot_chain': 'X=1-81', 'chain2keep': 'X', 'out_chain': 'A'}
Saving single chain PDB files from PDBe:  15%|█▌        | 2/13 [00:00<00:00, 24.35it/s]


AttributeError: 'NoneType' object has no attribute 'copy'

### Query session database

In [9]:
from protein_detective.db import db_path

database = db_path(session_dir)

In [10]:
import duckdb

%load_ext sql
conn = duckdb.connect(database)
%sql conn --alias duckdb

In [10]:
%sql SELECT * FROM proteins

uniprot_acc
A0A1W2PPM1
A0A0U1RQI7
A0A0C5B5G6
A0A1W2PPK0
A0A1B0GWH4
A0A1B0GVZ6
A0A1B0GTS1
A0A1W2PPF3
A0A1W2PQ73
A0A087WUV0


In [11]:
%sql SELECT * FROM pdbs

pdb_id,method,resolution,pdb_file


In [12]:
%sql SELECT * FROM proteins_pdbs

uniprot_acc,pdb_id,chain,single_chain_pdb_file


In [13]:
%sql SELECT * FROM alphafolds LIMIT 1

uniprot_acc,summary,pdb_file,pae_file
A0A1W2PPM1,"{""entryId"":""AF-A0A1W2PPM1-F1"",""gene"":""CPHXL"",""sequenceChecksum"":""D0786215762FBC17"",""sequenceVersionDate"":""2017-06-07"",""uniprotAccession"":""A0A1W2PPM1"",""uniprotId"":""CPHXL_HUMAN"",""uniprotDescription"":""Cytoplasmic polyadenylated homeobox-like"",""taxId"":9606,""organismScientificName"":""Homo sapiens"",""uniprotStart"":1,""uniprotEnd"":405,""uniprotSequence"":""MNLDGTSGGFPAEEDHHNEERQTKNKRKTKHRHKFSEELLQELKEIFGENCYPDYTTRKTLAIKFDCPVNVIDNWFQNKRARLPPAERRRIFVLQKKHDFPVQAHSFLSCQETQAAAHNYATKQSLSGAQRALMRRAGCSHLEKQWIPSQEMGYNCFSLENQETPSQQVGPQCSYLEKPGIPSQQVGSQCSYLEKLGIPSQQVASQSSYLVTGTEKHPGCAMGYGGDTGSGHSGSGHSTAYHFLSYNSAECLHPPPSSVPYFHGERTETKESQHASPFLLDYAQGAYGVKKDHCLCSFCLSLLGQQQQNDWQYHLQQHQQPQNYLEGMMLQEQLPMDSGPWDLGKQWSSAQSQLQSQLPQNNGKPLCSQLQHMSLQIAADSPLLPLGQDMQERASEQPRTQMQQL"",""modelCreatedDate"":""2022-06-01"",""latestVersion"":4,""allVersions"":[1,2,3,4],""bcifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.bcif"",""cifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.cif"",""pdbUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.pdb"",""paeImageUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.png"",""paeDocUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.json"",""amAnnotationsUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-aa-substitutions.csv"",""amAnnotationsHg19Url"":null,""amAnnotationsHg38Url"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-hg38.csv"",""isReviewed"":true,""isReferenceProteome"":true}",session1/downloads/AF-A0A1W2PPM1-F1-model_v4.pdb,session1/downloads/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.json


In [14]:
%sql SELECT count(*) FROM alphafolds

count_star()
10


In [15]:
# Fetch fields from inside summary
%sql SELECT uniprot_acc, summary.taxId, summary.uniprotStart, summary.uniprotEnd, summary.gene FROM alphafolds

uniprot_acc,taxId,uniprotStart,uniprotEnd,gene
A0A1W2PPM1,9606,1,405,"""CPHXL"""
A0A0U1RQI7,9606,1,1052,"""KLF18"""
A0A0C5B5G6,9606,1,16,"""MT-RNR1"""
A0A1W2PPK0,9606,1,400,"""Unknown"""
A0A1B0GWH4,9606,1,333,"""HSFX3"""
A0A1B0GVZ6,9606,1,204,"""MBD3L2B"""
A0A1B0GTS1,9606,1,333,"""HSFX4"""
A0A1W2PPF3,9606,1,345,"""DUXB"""
A0A1W2PQ73,9606,1,354,"""ERFL"""
A0A087WUV0,9606,1,522,"""Unknown"""


In [12]:
%config SqlMagic.displaylimit = 100

In [13]:

%%sql
SELECT 
f.confidence, f.min_threshold, f.max_threshold,
density_filtered_alphafolds.*, 
alphafolds.summary.uniprotStart, 
alphafolds.summary.uniprotEnd, 
length(alphafolds.summary.uniprotSequence) AS uniprot_length
FROM density_filtered_alphafolds
JOIN density_filters  AS f USING (density_filter_id) 
JOIN alphafolds USING (uniprot_acc)
LIMIT 100;

confidence,min_threshold,max_threshold,density_filter_id,uniprot_acc,nr_residues_above_confidence,keep,pdb_file,uniprotStart,uniprotEnd,uniprot_length
70.0,100,500,1,A0A1W2PPM1,68,False,,1,405,407
70.0,100,500,1,A0A0U1RQI7,192,True,session1/density_filtered/AF-A0A0U1RQI7-F1-model_v4.pdb,1,1052,1054
70.0,100,500,1,A0A0C5B5G6,10,False,,1,16,18
70.0,100,500,1,A0A1W2PPK0,71,False,,1,400,402
70.0,100,500,1,A0A1B0GWH4,117,True,session1/density_filtered/AF-A0A1B0GWH4-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1B0GVZ6,54,False,,1,204,206
70.0,100,500,1,A0A1B0GTS1,116,True,session1/density_filtered/AF-A0A1B0GTS1-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1W2PPF3,124,True,session1/density_filtered/AF-A0A1W2PPF3-F1-model_v4.pdb,1,345,347
70.0,100,500,1,A0A1W2PQ73,86,False,,1,354,356
70.0,100,500,1,A0A087WUV0,283,True,session1/density_filtered/AF-A0A087WUV0-F1-model_v4.pdb,1,522,524
