# Protein modeling, starting at a Uniprot ID

In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import csv
from itertools import compress
import requests
from xml.dom import minidom

import sys
sys.path.append("..")

from src.get_pfam_info import *
from src.get_uniprot_info import *
from src.utils import *
from src.protein_modeling import *

In [4]:
# Load up some proteins to test.
# NOTE(review): the path below is an empty string, so this cell raises
# FileNotFoundError as written -- point it at the CSV of protein IDs first.
# newline='' is what the csv module asks for when it is handed a file object.
with open('', 'r', newline='') as f:
    reader = csv.reader(f)
    # Only the first row of the file is used as the list of protein IDs,
    # so read just that row instead of materialising the whole file.
    proteins = next(reader)

# Step 1. Gather protein metadata associated with each UniProt ID
1. Available crystal structures.
2. Pfam annotations.
3. AA sequence.

In [5]:
# Accumulators filled by the loop below
crystal_annotation = []   # UniProt entries, each tagged with the protein sequence
pfam_annotation = []      # Pfam entries, each tagged with the protein sequence
sequences = []            # created here; nothing in this cell appends to it
protein_crystal = []      # one record per protein: ID, sequence, crystal Yes/No
errors = []               # one {protein: exception} dict per failed protein

# Query UniProt and Pfam for every protein ID
for protein in tqdm_notebook(proteins):
    try:
        uniprot_entries, sequence = get_uniprot_info(protein)

        # A protein is flagged as having a crystal structure when UniProt
        # returned at least one entry for it
        has_structure = len(uniprot_entries) > 0
        protein_crystal.append({'Protein': protein,
                                'Sequence': sequence,
                                'Crystal': 'Yes' if has_structure else 'No'})

        # Keep every UniProt entry, with the sequence attached
        for structure_entry in uniprot_entries:
            structure_entry["Sequence"] = sequence
            crystal_annotation.append(structure_entry)

        # Keep every Pfam entry, with the sequence attached
        for domain_entry in get_pfam_info(protein):
            domain_entry["Sequence"] = sequence
            pfam_annotation.append(domain_entry)

    except Exception as exc:
        # Best effort: record the failure and carry on with the next protein.
        # Anything appended before the exception was raised stays in the lists.
        errors.append({protein: exc})

HBox(children=(IntProgress(value=0, max=8058), HTML(value='')))




In [10]:
# Tabulate the Pfam records, keeping the identifying columns and leaving
# out the Sequence column that was attached to each record.
pfam_columns = ["UniProtKB", "Accession", "Identifier", "Start", "Stop"]
pfam_df = pd.DataFrame(pfam_annotation)[pfam_columns]
pfam_df.head()

Unnamed: 0,UniProtKB,Accession,Identifier,Start,Stop
0,Q9UQC1,PF00012,HSP70,1,151
1,P08670,PF00038,Filament,102,410
2,P08670,PF04732,Filament_head,6,101
3,P04406,PF00044,Gp_dh_N,4,105
4,P04406,PF02800,Gp_dh_C,157,314


In [12]:
# Tabulate the per-protein crystal status with a fixed column order.
crystal_columns = ["Protein", "Crystal", "Sequence"]
crystal_df = pd.DataFrame(protein_crystal)[crystal_columns]
crystal_df.head(10)

Unnamed: 0,Protein,Crystal,Sequence
0,Q9UQC1,No,MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAE...
1,P08670,Yes,MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRP...
2,P04406,Yes,MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM...
3,P62979,Yes,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
4,P07910,Yes,MASNVTNKTDPRSMNSRVFIGNLNTLVVKKSDVEAIFSKYGKIVGC...
5,Q15149,Yes,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...
6,P61513,No,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...
7,P14618,Yes,MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTG...
8,P0C0S8,Yes,MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGA...
9,P19338,No,MVKLAKAGKNQGDPKKMAPPPKEVEEDSEDEEMSEDEEDDSSGEEV...


In [6]:
# Summarise the collection run: failed proteins vs. rows in crystal_df.
# print() joins its arguments with a single space, so the text is unchanged.
print("Errors:", len(errors))
print("Successes:", len(crystal_df["Protein"]))

Errors: 51
Successes: 8009


# Step 2. Set clearly-defined rules about how to split FASTA sequences into modelable segments

# Rules for protein modeling

1. Case: Protein has incomplete (<95%) crystal coverage
    
    * Model entire protein

2. Case: pfam has an incomplete (<95%) crystal structure
    
    * Model entire pfam
    * Case: The 10 residues upstream OR the 10 residues downstream of the pfam do not contain another pfam or the start/end of the protein
        * Model the region from (start of the protein or end of the previous pfam) to (end of the protein or start of the next pfam)
    * Case: There are multiple pfams with the same identifier
        * Treat the concatenation of all of those pfams as a new pfam
    
* What happens if a region is too large to be modeled?
    * For now, just skip those and add them to a list to do later
 

# Step 3. Take chunks of each protein, as necessary

In [7]:
# Create protein-based lookup tables for all of the features.
#
# Each table is built from its own list. The three lists do not line up:
# crystal_annotation and pfam_annotation get one record per entry returned
# for a protein, while protein_crystal gets exactly one record per protein.
# Zipping them would stop at the shortest list and pair unrelated records,
# silently leaving proteins out of the lookups.
#
# When a protein has several records in a list, the last one wins, so each
# lookup value stays a single record.
crystal_annotation_lookup = {
    crystal["UniProtKB"]: crystal for crystal in crystal_annotation
}
pfam_annotation_lookup = {
    pfam["UniProtKB"]: pfam for pfam in pfam_annotation
}
has_crystal_lookup = {
    has_crystal["Protein"]: has_crystal for has_crystal in protein_crystal
}

In [None]:
# Dict format: {UniProtKB, fasta_header, fasta_sequence, model_in_QUARK(bool), model_in_ITASSER(bool), notes}
sequences_to_model = {}

# Use subfunction to get protein name, fasta header, sequence,
# and then manually add whether to model in QUARK or I-Tasser

protein_name = "P19338"

if has_crystal_lookup[protein_name]["Crystal"] == "No":
    # Protein does not have a crystal structure, so whole protein should be modeled.
    # The sequence is read from has_crystal_lookup: a protein without crystal
    # structures contributes no records to crystal_annotation, so it has no key
    # in crystal_annotation_lookup. The field name is "Sequence" (capital S).
    sequence = has_crystal_lookup[protein_name]["Sequence"]
    fasta_header = protein_name + "_" + "whole_protein"
    # Length cut-offs per modeling tool -- presumably each tool's maximum
    # input length; TODO confirm the 800 / 1400 values.
    model_in_QUARK = len(sequence) <= 800
    model_in_ITASSER = len(sequence) <= 1400
    notes = "Protein does not have any crystal structures, so model the whole protein"
else:
    # Placeholder for proteins that do have crystal structures.
    print(0)
    

In [None]:
# Scratch check of conditional-expression syntax; 1==1 is always true, so this shows "yep".
("yep" if 1==1 else "nope")

In [None]:
# Show the sequence stored on the crystal annotation record for one protein.
# Raises KeyError when the protein has no record in crystal_annotation_lookup.
protein_name = "P08670"
crystal_annotation_lookup[protein_name]["Sequence"]

In [None]:
# Is this protein present in the Pfam table?
# `in` on a pandas Series tests the index labels, so compare against .values.
'P19338' in pfam_df["UniProtKB"].values

In [None]:
# Preview the first protein IDs recorded in crystal_df.
crystal_df["Protein"].head()

In [None]:
# Is this ID present in the input protein list?
'P19338' in proteins

In [None]:
# Did this protein fail during metadata collection?
# errors holds {protein: exception} dicts, so test each dict's keys; comparing
# the bare ID against the list itself is always False.
any('P19338' in error for error in errors)

In [None]:
# Input proteins minus the number of recorded failures.
len(proteins) - len(errors)

In [None]:
# Number of rows in crystal_df.
len(crystal_df["Protein"])

In [None]:
# First ten entries of the input protein list.
proteins[0:10]

In [None]:
# First ten distinct protein IDs present in crystal_df.
crystal_df["Protein"].unique()[0:10]

In [None]:
# Is this protein present in crystal_df?
# `in` on a pandas Series tests the index labels, so compare against .values.
'P19338' in crystal_df["Protein"].values

In [None]:
# Protein ID stored under index label 9 of crystal_df.
crystal_df["Protein"][9]

In [None]:
# Sanity check: a value taken from the column must be found in the column.
# `in` on a pandas Series tests the index labels, so compare against .values.
crystal_df["Protein"][9] in crystal_df["Protein"].values

In [None]:
# Protein ID stored under index label 9 of crystal_df.
crystal_df["Protein"][9]

In [None]:
# Is this protein present in crystal_df?
# `in` on a pandas Series tests the index labels, so compare against .values.
'P19338' in crystal_df["Protein"].values

In [None]:
# Sanity check: a value taken from the column must be found in the column.
# `in` on a pandas Series tests the index labels, so compare against .values.
crystal_df["Protein"][9] in crystal_df["Protein"].values

In [None]:
# Rows of crystal_df whose Protein column equals "P19338" (value-based filter).
crystal_df[crystal_df["Protein"] == "P19338"]

In [None]:
# Crystal-status record for P19338; raises KeyError if the protein is not in the lookup.
has_crystal_lookup["P19338"]

In [None]:
# Preview the first rows of crystal_df.
crystal_df.head()

In [None]:
# Export the tables as dated CSV snapshots.
crystal_df.to_csv("crystal_df_07232019.csv")
# crystal_annot_df is not created by any visible cell, so build it here from
# the crystal annotation records before writing it out.
crystal_annot_df = pd.DataFrame(crystal_annotation)
crystal_annot_df.to_csv("crystal_annot_df_07232019.csv")
pfam_df.to_csv("pfam_df_07232019.csv")

In [None]:
# Module paths in an import statement are dotted, not slash-separated.
from src.protein_modeling import get_segments_to_model

In [None]:
# NOTE(review): `version` is not defined in any visible cell; unless one of the
# star imports provides it, this raises NameError -- confirm or remove.
version

In [None]:
# Show the Python interpreter version of the running kernel.
import sys
sys.version

In [9]:
# Scratch check: `in` on a plain list compares against the elements.
1 in [0,0,0,1,0]

True

In [13]:
# NOTE(review): the recorded run of this cell raised NameError, i.e. the name
# was not available in the kernel -- confirm where it should come from.
get_flanking_location()

NameError: name 'get_flanking_location' is not defined

In [16]:
# Inspect the first ten raw Pfam annotation records.
pfam_annotation[0:10]

[{'UniProtKB': 'Q9UQC1',
  'Accession': 'PF00012',
  'Identifier': 'HSP70',
  'Start': 1,
  'Stop': 151,
  'Sequence': 'MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAEAYLGYPVTNAVITVPAYFNDSQRQATKDAGVIAGLNVLRIINEPTAAAIAYGLDRTGKGERNVLIFDLGGGTFDVSILTIDDGIFEVKATAGDTHLGGEDFDNRQ'},
 {'UniProtKB': 'P08670',
  'Accession': 'PF00038',
  'Identifier': 'Filament',
  'Start': 102,
  'Stop': 410,
  'Sequence': 'MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRPSTSRSLYASSPGGVYATRSSAVRLRSSVPGVRLLQDSVDFSLADAINTEFKNTRTNEKVELQELNDRFANYIDKVRFLEQQNKILLAELEQLKGQGKSRLGDLYEEEMRELRRQVDQLTNDKARVEVERDNLAEDIMRLREKLQEEMLQREEAENTLQSFRQDVDNASLARLDLERKVESLQEEIAFLKKLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALRDVRQQYESVAAKNLQEAEEWYKSKFADLSEAANRNNDALRQAKQESTEYRRQVQSLTCEVDALKGTNESLERQMREMEENFAVEAANYQDTIGRLQDEIQNMKEEMARHLREYQDLLNVKMALDIEIATYRKLLEGEESRISLPLPNFSSLNLRETNLDSLPLVDTHSKRTLLIKTVETRDGQVINETSQHHDDLE'},
 {'UniProtKB': 'P08670',
  'Accession': 'PF04732',
  'Identifier': 'Filament_head',
  'Start': 6,
  'Stop': 101,
  'Sequence': 'MS

In [17]:
# First ten Pfam annotation records rendered as a DataFrame.
pd.DataFrame(pfam_annotation[0:10])

Unnamed: 0,UniProtKB,Accession,Identifier,Start,Stop,Sequence
0,Q9UQC1,PF00012,HSP70,1,151,MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAE...
1,P08670,PF00038,Filament,102,410,MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRP...
2,P08670,PF04732,Filament_head,6,101,MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRP...
3,P04406,PF00044,Gp_dh_N,4,105,MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM...
4,P04406,PF02800,Gp_dh_C,157,314,MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM...
5,P62979,PF00240,ubiquitin,3,74,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
6,P62979,PF01599,Ribosomal_S27,102,147,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
7,P07910,PF00076,RRM_1,18,81,MASNVTNKTDPRSMNSRVFIGNLNTLVVKKSDVEAIFSKYGKIVGC...
8,Q15149,PF00307,CH,179,283,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...
9,Q15149,PF00307,CH,295,401,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...


In [18]:
# Inspect the first ten raw Pfam annotation records.
pfam_annotation[0:10]

[{'UniProtKB': 'Q9UQC1',
  'Accession': 'PF00012',
  'Identifier': 'HSP70',
  'Start': 1,
  'Stop': 151,
  'Sequence': 'MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAEAYLGYPVTNAVITVPAYFNDSQRQATKDAGVIAGLNVLRIINEPTAAAIAYGLDRTGKGERNVLIFDLGGGTFDVSILTIDDGIFEVKATAGDTHLGGEDFDNRQ'},
 {'UniProtKB': 'P08670',
  'Accession': 'PF00038',
  'Identifier': 'Filament',
  'Start': 102,
  'Stop': 410,
  'Sequence': 'MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRPSTSRSLYASSPGGVYATRSSAVRLRSSVPGVRLLQDSVDFSLADAINTEFKNTRTNEKVELQELNDRFANYIDKVRFLEQQNKILLAELEQLKGQGKSRLGDLYEEEMRELRRQVDQLTNDKARVEVERDNLAEDIMRLREKLQEEMLQREEAENTLQSFRQDVDNASLARLDLERKVESLQEEIAFLKKLHEEEIQELQAQIQEQHVQIDVDVSKPDLTAALRDVRQQYESVAAKNLQEAEEWYKSKFADLSEAANRNNDALRQAKQESTEYRRQVQSLTCEVDALKGTNESLERQMREMEENFAVEAANYQDTIGRLQDEIQNMKEEMARHLREYQDLLNVKMALDIEIATYRKLLEGEESRISLPLPNFSSLNLRETNLDSLPLVDTHSKRTLLIKTVETRDGQVINETSQHHDDLE'},
 {'UniProtKB': 'P08670',
  'Accession': 'PF04732',
  'Identifier': 'Filament_head',
  'Start': 6,
  'Stop': 101,
  'Sequence': 'MS

In [20]:
# Convert the first ten records to a column -> {row index: value} mapping.
pd.DataFrame(pfam_annotation[0:10]).to_dict()

{'UniProtKB': {0: 'Q9UQC1',
  1: 'P08670',
  2: 'P08670',
  3: 'P04406',
  4: 'P04406',
  5: 'P62979',
  6: 'P62979',
  7: 'P07910',
  8: 'Q15149',
  9: 'Q15149'},
 'Accession': {0: 'PF00012',
  1: 'PF00038',
  2: 'PF04732',
  3: 'PF00044',
  4: 'PF02800',
  5: 'PF00240',
  6: 'PF01599',
  7: 'PF00076',
  8: 'PF00307',
  9: 'PF00307'},
 'Identifier': {0: 'HSP70',
  1: 'Filament',
  2: 'Filament_head',
  3: 'Gp_dh_N',
  4: 'Gp_dh_C',
  5: 'ubiquitin',
  6: 'Ribosomal_S27',
  7: 'RRM_1',
  8: 'CH',
  9: 'CH'},
 'Start': {0: 1,
  1: 102,
  2: 6,
  3: 4,
  4: 157,
  5: 3,
  6: 102,
  7: 18,
  8: 179,
  9: 295},
 'Stop': {0: 151,
  1: 410,
  2: 101,
  3: 105,
  4: 314,
  5: 74,
  6: 147,
  7: 81,
  8: 283,
  9: 401},
 'Sequence': {0: 'MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAEAYLGYPVTNAVITVPAYFNDSQRQATKDAGVIAGLNVLRIINEPTAAAIAYGLDRTGKGERNVLIFDLGGGTFDVSILTIDDGIFEVKATAGDTHLGGEDFDNRQ',
  1: 'MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRPSTSRSLYASSPGGVYATRSSAVRLRSSVPGVRLLQDSVDFSLADAINTEFKNTR

In [23]:
# Persist every accumulator list to disk with pickle.
# The files are binary pickles even though their names end in .txt.
import pickle

pickle_jobs = [
    ("pickle_07232019_crystal_annotation.txt", crystal_annotation),
    ("pickle_07232019_pfam_annotation.txt", pfam_annotation),
    ("pickle_07232019_sequences.txt", sequences),
    ("pickle_07232019_protein_crystal.txt", protein_crystal),
    ("pickle_07232019_errors.txt", errors),
]

for pickle_path, payload in pickle_jobs:
    with open(pickle_path, "wb") as fp:
        pickle.dump(payload, fp)

[1, 2, 3, 4]