# Run workflow up to storing in database

In [None]:
from pathlib import Path
from pprint import pprint

from protein_detective.uniprot import Query
from protein_detective.workflow import density_filter, retrieve_structures

In [2]:
import logging

# The log level controls how much of the workflow is shown:
#   WARNING - only warnings
#   INFO    - the SPARQL queries
#   DEBUG   - the raw results
logging.basicConfig(level=logging.WARNING)

In [4]:
# Directory of this session; it is handed to the workflow steps and to db_path below.
session_dir = Path("session1")
session_dir

PosixPath('session1')

In [3]:
# Search criteria used to select proteins from UniProt.
query = Query(
    taxon_id="9606",  # NCBI taxonomy ID of Homo sapiens
    reviewed=True,  # presumably restricts the search to reviewed entries - TODO confirm
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [5]:
# Run the search and download the structure files into the session directory.
# The call returns the download directory plus the number of PDB and AlphaFold entries.
# NOTE(review): limit=10 presumably caps the number of UniProt hits - confirm.
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(query, session_dir, limit=10)
download_path, nr_pdbs, nr_alphafolds

Downloading files: 0it [00:00, ?it/s]
Fetching summaries: 100%|██████████| 10/10 [00:00<00:00, 14.06it/s]
Downloading files: 100%|██████████| 20/20 [00:00<00:00, 74.69it/s]


(PosixPath('session1/downloads'), 0, 10)

In [5]:
from protein_detective.alphafold.density import DensityFilterQuery

# Settings of the density filter. Judging by the results table at the end of this
# notebook, a structure is kept when its number of residues above `confidence`
# lies between min_threshold and max_threshold.
# NOTE(review): confidence is presumably the per-residue pLDDT score, and it is
# not visible here whether the thresholds are inclusive - confirm both.
dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [7]:
# Apply the density filter to the session. The result reports the directory with
# the filtered structures and how many structures were kept and discarded.
result = density_filter(session_dir, dquery)
pprint(result)

DensityFilterSessionResult(density_filtered_dir=PosixPath('session1/density_filtered'),
                           nr_kept=5,
                           nr_discarded=5)


## Query session database

In [9]:
from protein_detective.db import db_path

# Location of the session's database; it is opened with DuckDB in the next cell.
database = db_path(session_dir)

In [10]:
import duckdb

# Load the SQL magic and hand it the DuckDB connection under the alias "duckdb",
# so that the %sql / %%sql cells below run against the session database.
%load_ext sql
conn = duckdb.connect(database)
%sql conn --alias duckdb

In [10]:
# UniProt accessions of the proteins stored for this session.
%sql SELECT * FROM proteins

uniprot_acc
A0A1W2PPM1
A0A0U1RQI7
A0A0C5B5G6
A0A1W2PPK0
A0A1B0GWH4
A0A1B0GVZ6
A0A1B0GTS1
A0A1W2PPF3
A0A1W2PQ73
A0A087WUV0


In [11]:
# PDB entries of the session; the stored run retrieved none (nr_pdbs was 0), so the table is empty.
%sql SELECT * FROM pdbs

pdb_id,method,resolution,pdb_file


In [12]:
# Rows linking a protein to a PDB entry, with the chain and its single-chain PDB file.
%sql SELECT * FROM proteins_pdbs

uniprot_acc,pdb_id,chain,single_chain_pdb_file


In [13]:
# Show a single row only: the summary column holds the complete AlphaFold summary JSON, which is long.
%sql SELECT * FROM alphafolds LIMIT 1

uniprot_acc,summary,pdb_file,pae_file
A0A1W2PPM1,"{""entryId"":""AF-A0A1W2PPM1-F1"",""gene"":""CPHXL"",""sequenceChecksum"":""D0786215762FBC17"",""sequenceVersionDate"":""2017-06-07"",""uniprotAccession"":""A0A1W2PPM1"",""uniprotId"":""CPHXL_HUMAN"",""uniprotDescription"":""Cytoplasmic polyadenylated homeobox-like"",""taxId"":9606,""organismScientificName"":""Homo sapiens"",""uniprotStart"":1,""uniprotEnd"":405,""uniprotSequence"":""MNLDGTSGGFPAEEDHHNEERQTKNKRKTKHRHKFSEELLQELKEIFGENCYPDYTTRKTLAIKFDCPVNVIDNWFQNKRARLPPAERRRIFVLQKKHDFPVQAHSFLSCQETQAAAHNYATKQSLSGAQRALMRRAGCSHLEKQWIPSQEMGYNCFSLENQETPSQQVGPQCSYLEKPGIPSQQVGSQCSYLEKLGIPSQQVASQSSYLVTGTEKHPGCAMGYGGDTGSGHSGSGHSTAYHFLSYNSAECLHPPPSSVPYFHGERTETKESQHASPFLLDYAQGAYGVKKDHCLCSFCLSLLGQQQQNDWQYHLQQHQQPQNYLEGMMLQEQLPMDSGPWDLGKQWSSAQSQLQSQLPQNNGKPLCSQLQHMSLQIAADSPLLPLGQDMQERASEQPRTQMQQL"",""modelCreatedDate"":""2022-06-01"",""latestVersion"":4,""allVersions"":[1,2,3,4],""bcifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.bcif"",""cifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.cif"",""pdbUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-model_v4.pdb"",""paeImageUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.png"",""paeDocUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.json"",""amAnnotationsUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-aa-substitutions.csv"",""amAnnotationsHg19Url"":null,""amAnnotationsHg38Url"":""https://alphafold.ebi.ac.uk/files/AF-A0A1W2PPM1-F1-hg38.csv"",""isReviewed"":true,""isReferenceProteome"":true}",session1/downloads/AF-A0A1W2PPM1-F1-model_v4.pdb,session1/downloads/AF-A0A1W2PPM1-F1-predicted_aligned_error_v4.json


In [14]:
# Number of AlphaFold entries stored for this session.
%sql SELECT count(*) FROM alphafolds

count_star()
10


In [15]:
# Fetch fields from inside summary
%sql SELECT uniprot_acc, summary.taxId, summary.uniprotStart, summary.uniprotEnd, summary.gene FROM alphafolds

uniprot_acc,taxId,uniprotStart,uniprotEnd,gene
A0A1W2PPM1,9606,1,405,"""CPHXL"""
A0A0U1RQI7,9606,1,1052,"""KLF18"""
A0A0C5B5G6,9606,1,16,"""MT-RNR1"""
A0A1W2PPK0,9606,1,400,"""Unknown"""
A0A1B0GWH4,9606,1,333,"""HSFX3"""
A0A1B0GVZ6,9606,1,204,"""MBD3L2B"""
A0A1B0GTS1,9606,1,333,"""HSFX4"""
A0A1W2PPF3,9606,1,345,"""DUXB"""
A0A1W2PQ73,9606,1,354,"""ERFL"""
A0A087WUV0,9606,1,522,"""Unknown"""


In [12]:
# Let the SQL magic display up to 100 rows of a result instead of its default limit.
%config SqlMagic.displaylimit = 100

In [13]:

%%sql
SELECT 
f.confidence, f.min_threshold, f.max_threshold,
density_filtered_alphafolds.*, 
alphafolds.summary.uniprotStart, 
alphafolds.summary.uniprotEnd, 
length(alphafolds.summary.uniprotSequence) AS uniprot_length
FROM density_filtered_alphafolds
JOIN density_filters  AS f USING (density_filter_id) 
JOIN alphafolds USING (uniprot_acc)
LIMIT 100;

confidence,min_threshold,max_threshold,density_filter_id,uniprot_acc,nr_residues_above_confidence,keep,pdb_file,uniprotStart,uniprotEnd,uniprot_length
70.0,100,500,1,A0A1W2PPM1,68,False,,1,405,407
70.0,100,500,1,A0A0U1RQI7,192,True,session1/density_filtered/AF-A0A0U1RQI7-F1-model_v4.pdb,1,1052,1054
70.0,100,500,1,A0A0C5B5G6,10,False,,1,16,18
70.0,100,500,1,A0A1W2PPK0,71,False,,1,400,402
70.0,100,500,1,A0A1B0GWH4,117,True,session1/density_filtered/AF-A0A1B0GWH4-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1B0GVZ6,54,False,,1,204,206
70.0,100,500,1,A0A1B0GTS1,116,True,session1/density_filtered/AF-A0A1B0GTS1-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1W2PPF3,124,True,session1/density_filtered/AF-A0A1W2PPF3-F1-model_v4.pdb,1,345,347
70.0,100,500,1,A0A1W2PQ73,86,False,,1,354,356
70.0,100,500,1,A0A087WUV0,283,True,session1/density_filtered/AF-A0A087WUV0-F1-model_v4.pdb,1,522,524
