# Run the workflow up to storing the results in the session database

In [None]:
from pathlib import Path

from protein_detective.uniprot import Query
from protein_detective.workflow import retrieve_structures

In [2]:
import logging

# Log verbosity for this notebook; change the level to see more detail:
#   WARNING - only warnings
#   INFO    - also the SPARQL queries
#   DEBUG   - also the raw results
logging.basicConfig(level=logging.WARNING)

In [3]:
# Search criteria handed to retrieve_structures further down.
query = Query(
    taxon_id="9606",  # Homo sapiens (the AlphaFold summary output further down reports taxId 9606)
    reviewed=True,  # presumably restricts the search to reviewed UniProt entries - confirm in the Query docs
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [None]:
# Directory for this session; session.db and the downloads/ folder seen in later outputs live under it.
session_dir = Path("session1")
session_dir

In [None]:
# Top-level `await` is allowed here because IPython runs notebook cells with autoawait.
# The call returns the path of the session database (see the output of this cell).
# NOTE(review): limit=10 presumably caps how many results are fetched - confirm in the retrieve_structures docs.
db_path = await retrieve_structures(query, session_dir, limit=10)
db_path

Downloading files: 100%|██████████| 4/4 [00:00<00:00, 14966.29it/s]
Fetching summaries: 100%|██████████| 10/10 [00:00<00:00, 25.36it/s]
Downloading files: 100%|██████████| 20/20 [00:00<00:00, 55701.25it/s]


PosixPath('session1/session.db')

### Query session database

In [6]:
import duckdb

# Load the `sql` IPython extension, which provides the %sql magic used in the cells below.
%load_ext sql
# Open the session database whose path was returned by retrieve_structures.
conn = duckdb.connect(db_path)
# Hand the open connection to the %sql magic and register it under the alias "duckdb".
%sql conn --alias duckdb

In [7]:
# List the stored proteins; the table has a single uniprot_acc column.
%sql SELECT * FROM proteins

uniprot_acc
A8MT69
A0A1B0GTS1
A0A0U1RQI7
A0A087WUV0
A0A1B0GWH4
A0A1W2PQ73
A0A0C5B5G6
A0A1W2PPK0
A0A1W2PPM1
A0A1B0GVZ6


In [8]:
# List the stored PDB entries with their method, resolution and downloaded file.
# NOTE(review): in the recorded output, 4DRA points at pdb4drb.ent.gz and 4DRB at pdb4dra.ent.gz -
# the pdb_id/pdb_file pairing looks swapped; verify how the workflow fills this table.
%sql SELECT * FROM pdbs

pdb_id,method,resolution,pdb_file
4E45,X-Ray_Crystallography,2.0,session1/downloads/pdb4e45.ent.gz
4DRA,X-Ray_Crystallography,2.4100000858306885,session1/downloads/pdb4drb.ent.gz
4E44,X-Ray_Crystallography,2.0999999046325684,session1/downloads/pdb4e44.ent.gz
4DRB,X-Ray_Crystallography,2.630000114440918,session1/downloads/pdb4dra.ent.gz


In [9]:
# Link table between proteins and PDB entries. The chain column has the form
# "<chain ids>=<first>-<last>" (e.g. A/B/C/D=1-107); presumably the residue range covered - confirm.
%sql SELECT * FROM proteins_pdbs

uniprot_acc,chain,pdb_id
A8MT69,A/C/F/H/K/M=1-110,4E45
A8MT69,A/B/C/D=1-107,4DRA
A8MT69,B/D=1-81,4E44
A8MT69,J/K/L/M/N/O=1-81,4DRB


In [11]:
# Show a single row only: the summary column holds a whole JSON document, so rows are very wide.
%sql SELECT * FROM alphafolds LIMIT 1

uniprot_acc,summary,pdb_file,pae_file
A0A1B0GTS1,"{""entryId"":""AF-A0A1B0GTS1-F1"",""gene"":""HSFX4"",""sequenceChecksum"":""C062984AED1EA2B9"",""sequenceVersionDate"":""2016-10-05"",""uniprotAccession"":""A0A1B0GTS1"",""uniprotId"":""HSFX4_HUMAN"",""uniprotDescription"":""Heat shock transcription factor, X-linked member 4"",""taxId"":9606,""organismScientificName"":""Homo sapiens"",""uniprotStart"":1,""uniprotEnd"":333,""uniprotSequence"":""MASQNTEQEYEAKLAPSVGGEPTSGGPSGSSPDPNPDSSEVLDRHEDQAMSQDPGSQDNSPPEDRNQRVVNVEDNHNLFRLSFPRKLWTIVEEDTFKSVSWNDDGDAVIIDKDLFQREVLQRKGAERIFKTDNLTSFIRQLNLYGFCKTRPSNSPGNKKMMIYCNSNFQRDKPRLLENIQRKDALRNTAQQATRVPTPKRKNLVATRRSLRIYHINARKEAIKMCQQGAPSVQGPSGTQSFRRSGMWSKKSATRHPLGNGPPQEPNGPSWEGTSGNVTFTSSATTWMEGTGILSSLVYSDNGSVMSLYNICYYALLASLSVMSPNEPSDDEEE"",""modelCreatedDate"":""2022-06-01"",""latestVersion"":4,""allVersions"":[1,2,3,4],""bcifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-model_v4.bcif"",""cifUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-model_v4.cif"",""pdbUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-model_v4.pdb"",""paeImageUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-predicted_aligned_error_v4.png"",""paeDocUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-predicted_aligned_error_v4.json"",""amAnnotationsUrl"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-aa-substitutions.csv"",""amAnnotationsHg19Url"":null,""amAnnotationsHg38Url"":""https://alphafold.ebi.ac.uk/files/AF-A0A1B0GTS1-F1-hg38.csv"",""isReviewed"":true,""isReferenceProteome"":true}",session1/downloads/AF-A0A1B0GTS1-F1-model_v4.pdb,session1/downloads/AF-A0A1B0GTS1-F1-predicted_aligned_error_v4.json


In [12]:
# Count the AlphaFold entries stored in the session database.
%sql SELECT count(*) FROM alphafolds

count_star()
10


In [15]:
# Pull individual fields out of the JSON summary column using dot notation.
# String fields such as gene come back still JSON-quoted (see the recorded output).
# NOTE(review): DuckDB's ->> operator (e.g. summary->>'gene') should return plain text instead - confirm.
%sql SELECT uniprot_acc, summary.taxId, summary.uniprotStart, summary.uniprotEnd, summary.gene FROM alphafolds

uniprot_acc,taxId,uniprotStart,uniprotEnd,gene
A0A1B0GTS1,9606,1,333,"""HSFX4"""
A0A0U1RQI7,9606,1,1052,"""KLF18"""
A0A087WUV0,9606,1,522,"""Unknown"""
A0A1B0GWH4,9606,1,333,"""HSFX3"""
A0A1W2PQ73,9606,1,354,"""ERFL"""
A0A0C5B5G6,9606,1,16,"""MT-RNR1"""
A0A1W2PPK0,9606,1,400,"""Unknown"""
A0A1W2PPM1,9606,1,405,"""CPHXL"""
A0A1B0GVZ6,9606,1,204,"""MBD3L2B"""
A0A1W2PPF3,9606,1,345,"""DUXB"""
