In [1]:
from pathlib import Path

from utils import load_data

In [2]:
# UniRef50 100k sample
# output_directory = Path("../private/uniref50_051121_rnd100k")

# ProtGPT2 100k generated
output_directory = Path("../private/protGPT2_100k_maxL150_minPPL72")

In [3]:
data = load_data(output_directory)
print(f"These columns can be queried: {', '.join(data.columns.values)}")

These columns can be queried: header, sequence, length, disorder, disorder_categorical, disorder_average, disorder_count, disorder_percent, metal, metal_count, metal_percent, small, small_count, small_percent, nucleic, nucleic_count, nucleic_percent, conservation, conservation_categorical, conservation_high_count, conservation_high_percent, conservation_low_count, conservation_low_percent, dssp3, helix_count, helix_percent, strand_count, strand_percent, helix_four_count, helix_four_percent, BPO_reference, BPO_terms, BPO_distance, CCO_reference, CCO_terms, CCO_distance, MFO_reference, MFO_terms, MFO_distance, subcellular_location, CATH_reference, CATH_superfamily, CATH_distance, transmembrane, signal_residue_count, signal_residue_percent, transmembrane_helix_count, transmembrane_helix_percent, transmembrane_strand_count, transmembrane_strand_percent


# Query 1
### Average length sequences with at least 30% helical content that are likely part of the outer cell membrane according to the Cellular Compartment Ontology (CCO)

I want sequences:
- longer than 200 residues and shorter than 400
- with more than 30% of residues involved in a helix
- annotated with the CCO `GO:0045203` term (integral component of cell outer membrane)

In [7]:
for sequence in data.query(
    '''
    length > 200 and length < 400 and \
    helix_percent > 0.3 and \
    CCO_terms.str.contains("GO:0072559")
    '''
).to_records():
    print(f"Header: {sequence.header}")
    print(f"Helical content: {sequence.helix_percent*100:0.2f}%")
    print(f"Distance to CCO annotated protein: {sequence.CCO_distance}")
    print(f"View on EMPP: https://embed.predictprotein.org/#/{sequence.sequence}")
    print("---------------")

Header: >seq25232, L=234, ppl=71.688
Helical content: 32.91%
Distance to CCO annotated protein: 1.271
View on EMPP: https://embed.predictprotein.org/#/MDKTSPENKKRLFLEELEARLVKDLEAVIRENFPVTRVDLSSRFVPLNHEDHNPKKCTMIGNFVSSFKDNDYPVAYVVLSRILPHFCGQRLFSNMGGNIEKILFSSQERVEVSEIFNASQKPNAIIFLDACHSGNMFRDLKKTDNVYILTGCSSAQYSWKASIHGYFTNELLHSIYSGLENTQGDTNNDGKITVSELHTYVKKTVIAKTKKMQTPQNWRYGIEGDFVLGECSIK
---------------


# Query 2
### Short sequences with transmembrane strands binding to small molecules

I want sequences:
- shorter than 100 residues
- with any transmembrane strand content
- binding to small molecules
- ordered by sequences longest sequences with most transmembrane and small molecule binding content

In [5]:
filtering_order = ['length', 'transmembrane_strand_percent', 'small_percent']


for sequence in data.query(
    '''
    length < 100 and \
    transmembrane_strand_count > 0 and \
    small_count > 0 
    '''
).sort_values(filtering_order, ascending=False).to_records():
    print(f"Header: {sequence.header}")
    print(f"Sequence length: {sequence.length}")
    print(f"Transmembrane strand content: {sequence.transmembrane_strand_percent*100:0.2f}%")
    print(f"Small-molecule binding content: {sequence.small_percent*100:0.2f}%")
    print(f"View on EMPP: https://embed.predictprotein.org/#/{sequence.sequence}")
    print("---------------")

Header: >seq86311, L=99, ppl=61.625
Sequence length: 99
Transmembrane strand content: 47.47%
Small-molecule binding content: 8.08%
View on EMPP: https://embed.predictprotein.org/#/MWGEERTAIGAASAYHLKGGWARSWIGYSYNKYRDSTTWDFDQNRHYVLAGLDFDLSRAWTLQTGINYTRGTYDAETPFYAADHNTQNARGSVGIAYRF
---------------
Header: >seq63070, L=98, ppl=58.219
Sequence length: 98
Transmembrane strand content: 43.88%
Small-molecule binding content: 4.08%
View on EMPP: https://embed.predictprotein.org/#/MAEVGLTYALPKQFTLNPSVGWQHQLNDTSPSALFSGGAGQEFTVNTTGMAPDSAVFGLGGSYTSSNGTDVSLGYRGELATDASDNSVSGLFTVRRPW
---------------
Header: >seq41706, L=94, ppl=48.688
Sequence length: 94
Transmembrane strand content: 30.85%
Small-molecule binding content: 18.09%
View on EMPP: https://embed.predictprotein.org/#/MKKLFLGLALVAFATTAQAQQRVKFSGGLILPMSDVDYGDFSSKNNVGAGVGVNFDVGLNDKFALGASVAYNYFGAKKDVTPKGGEDAFKIDYK
---------------
Header: >seq51154, L=90, ppl=43.344
Sequence length: 90
Transmembrane strand content: 35.56%
Small-molecule binding c