In [6]:
import pandas as pd
from tqdm import tqdm_notebook
import csv
from itertools import compress
import requests
from xml.dom import minidom

from src.get_pfam_info import get_pfam_info
from src.get_uniprot_info import get_uniprot_info
from src.utils import filter_on_attribute
from src.protein_modeling import *

import pickle

with open("pickle_07232019_pfam_annotation.txt", "rb") as fp:   # Unpickling
    pfam_annotation = pickle.load(fp)
    
with open("pickle_07232019_crystal_annotation.txt", "rb") as fp:
    crystal_annotation = pickle.load(fp)
    
with open("pickle_07232019_protein_crystal.txt", "rb") as fp:
    protein_crystal = pickle.load(fp)
    

In [7]:
protein = 'P16403'
for pfam in pfam_annotation:
    if pfam["UniProtKB"] == protein:
        print(pfam)

{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}


In [22]:
protein = 'P16403'
lookup_uniprot_pfams = list_to_lookup(pfam_annotation, "UniProtKB")
lookup_uniprot_crystal = list_to_lookup(protein_crystal, "Protein")
lookup_uniprot_crystals = list_to_lookup(crystal_annotation, "UniProtKB")
lookup_uniprot_pfams[protein]

[{'UniProtKB': 'P16403',
  'Accession': 'PF00538',
  'Identifier': 'Linker_histone',
  'Start': 37,
  'Stop': 108,
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]

In [16]:
import csv
with open('../data/csv/proteins_07222019.csv', 'r') as f:
    reader = csv.reader(f)
    proteins = list(reader)[0]

In [17]:
import copy

segments_to_model = []
errors = []
indexing_errors = 0 
indexing_success = 0

for protein in proteins:
    try:
        uniprot_crystal_protein = lookup_uniprot_crystal[protein]
    except:
        errors.append(protein)
        continue
        
    full_sequence = uniprot_crystal_protein[0]["Sequence"]
    
    if uniprot_crystal_protein[0]["Crystal"] == "No":
        crystal_dicts = []
    else:
        crystal_dicts = lookup_uniprot_crystals[protein]
        
    try:
        pfam_dicts = copy.deepcopy(lookup_uniprot_pfams[protein])
        
    except Exception as e:
        pfam_dicts = []
        
    # Clean pfams to not be longer than the protein sequence
    # Choice: The pfam database is based on UniProt, so whenever pfam
    # thinks that it's pfams go longer than the sequence (derived from UniProt)
    # We should just truncate that pfam annotation
    # Update. Some of these are very off. Let's just model the whole protein in that case
    for pfam_dict in pfam_dicts:
        if (pfam_dict["Stop"] > len(full_sequence)) | (pfam_dict["Start"] > len(full_sequence)):
            pfam_dicts = []
            indexing_errors = indexing_errors + 1
            print(protein)
        else:
            indexing_success = indexing_success + 1
    
    for segment in get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts):
        segments_to_model.append(segment)

Q7Z4J2
Q9P2D7
Q14393
Q9GZS1
F5GX99
F5GX99
F5GX99
Q9UJ41
Q9UFH2


In [18]:
protein = 'P16403'
lookup_uniprot_pfams[protein]

[{'UniProtKB': 'P16403',
  'Accession': 'PF00538',
  'Identifier': 'Linker_histone',
  'Start': 37,
  'Stop': 108,
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]

In [20]:
import pandas as pd
seqs = pd.DataFrame(segments_to_model)

In [25]:
seqs[seqs.UniProtKB == "P16403"]

Unnamed: 0,UniProtKB,fasta_header,fasta_sequence,notes
169,P16403,>P16403_full_protein,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,Full protein (incomplete crystal coverage)
170,P16403,>P16403_PF00538_Linker_histone_37_108,GPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKL...,Full pfam


In [30]:
for segment_dict in segments_to_model:
    segment_length = len(segment_dict["fasta_sequence"])
    segment_dict["Length"] = segment_length
    
    if segment_length <= 800:
        segment_dict["Model_In_QUARK:"] = True
        segment_dict["Model_In_ITASSER:"] = True
        
    elif segment_length <= 1400:
        segment_dict["Model_In_QUARK:"] = False
        segment_dict["Model_In_ITASSER:"] = True
        
    else:
        segment_dict["Model_In_QUARK:"] = False
        segment_dict["Model_In_ITASSER:"] = False

In [44]:
seqs2 = pd.DataFrame(segments_to_model)

In [45]:
print(seqs2.shape)

(31374, 7)


In [46]:
print(seqs2.shape)

already_queued = ['O60812',
'Q15061',
'P02545',
'P02545',
'P04083',
'P07437',
'Q15061',
'P62979',
'P07910',
'P07910',
'Q13813',
'P08670',
'P08670',
'P0C0S8',
'P13639',
'P62979',
'P11142',
'P11142',
'P13639',
'P13639',
'P13639',
'P61513',
'P0C0S8',
'P13639',
'P19338',
'P19338',
'P22626',
'P23528',
'P43243',
'P08670',
'Q15149',
'Q15061',
'P60709',
'P61513',
'P13639',
'P07437',
'P68371',
'Q9Y5B9',
'P68104',
'P68104',
'P07437',
'P68371',
'P68371',
'P68104',
'Q08945',
'Q08945',
'Q5VTE0',
'Q08945',
'Q08945',
'Q13813',
'Q13813',
'Q13813',
'P45880',
'Q13813',
'Q13813',
'Q15061',
'P60174',
'P45880',
'Q13813',
'Q15149',
'Q15149',
'Q15149',
'Q15149',
'O60812',
'Q15149',
'Q58FF7',
'Q58FF7',
'Q58FF7',
'Q58FF7',
'P68104',
'Q5VTE0',
'Q5VTE0',
'Q5VTE0',
'Q9UQC1',
'P10809',
'Q9Y5B9',
'Q15149',
'Q15149',
'Q15149',
'Q15149']

seqs2 = seqs2[~seqs2['UniProtKB'].isin(already_queued)]


print(seqs2.shape)

(31374, 7)
(31205, 7)


In [47]:
seqs2.to_csv("new_sequences_07242019.csv")

In [32]:
seqs2[seqs2.UniProtKB == "P16403"]

Unnamed: 0,UniProtKB,fasta_header,fasta_sequence,notes,Model_In_QUARK:,Model_In_ITASSER:,Length
169,P16403,>P16403_full_protein,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,Full protein (incomplete crystal coverage),True,True,213
170,P16403,>P16403_PF00538_Linker_histone_37_108,GPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKL...,Full pfam,True,True,72


In [24]:
seqs

Unnamed: 0,UniProtKB,fasta_header,fasta_sequence,notes
0,Q9UQC1,>Q9UQC1_full_protein,MKHWPFQVINDGDKPKVQVSYKGETKAFYPEEISSMVLTKMKEIAE...,Full protein (incomplete crystal coverage)
1,P08670,>P08670_full_protein,MSTRSVSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRP...,Full protein (incomplete crystal coverage)
2,P08670,>P08670_PF04732_Filament_head_6_101,VSSSSYRRMFGGPGTASRPSSSRSYVTTSTRTYSLGSALRPSTSRS...,Full pfam
3,P62979,>P62979_full_protein,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...,Full protein (incomplete crystal coverage)
4,P62979,>P62979_PF01599_Ribosomal_S27_102_147,VLKYYKVDENGKISRLRRECPSDECGAGVFMASHFDRHYCGKCCLT,Full pfam
5,P62979,>P62979_PF01599_Ribosomal_S27_75_156,GGAKKRKKKSYTTPKKNKHKRKKVKLAVLKYYKVDENGKISRLRRE...,Pfam with flanking region
6,P07910,>P07910_full_protein,MASNVTNKTDPRSMNSRVFIGNLNTLVVKKSDVEAIFSKYGKIVGC...,Full protein (incomplete crystal coverage)
7,P07910,>P07910_PF00076_RRM_1_18_81,VFIGNLNTLVVKKSDVEAIFSKYGKIVGCSVHKGFAFVQYVNERNA...,Full pfam
8,Q15149,>Q15149_full_protein,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,Full protein (incomplete crystal coverage)
9,Q15149,>Q15149_PF00681_Plectin_2941_2981,RLLEAQIATGGVIDPVHSHRVPVDVAYRRGYFDEEMNRVLA,Full pfam


In [23]:
get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts)

[{'Protein': 'P16403',
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK',
  'Crystal': 'No'}]

In [10]:
import sys
if not '/home/julian/Dropbox (HMS)/DARPA Biostasis/Experiments/Proteomics/Human protein modeling/Automated_Protein_Annotation/src' in sys.path:
  sys.path += ['/home/julian/Dropbox (HMS)/DARPA Biostasis/Experiments/Proteomics/Human protein modeling/Automated_Protein_Annotation/src']

from protein_modeling import *
import pickle
import copy

with open("../data/pickle/pickle_07232019_pfam_annotation.txt", "rb") as fp:   # Unpickling
    pfam_annotation = pickle.load(fp)
    
with open("../data/pickle/pickle_07232019_crystal_annotation.txt", "rb") as fp:
    crystal_annotation = pickle.load(fp)
    
with open("../data/pickle/pickle_07232019_protein_crystal.txt", "rb") as fp:
    protein_crystal = pickle.load(fp)
    
lookup_uniprot_pfams = list_to_lookup(pfam_annotation, "UniProtKB")
lookup_uniprot_crystal = list_to_lookup(protein_crystal, "Protein")
lookup_uniprot_crystals = list_to_lookup(crystal_annotation, "UniProtKB")

protein = 'P68104'

uniprot_crystal_protein = lookup_uniprot_crystal[protein]
full_sequence = uniprot_crystal_protein[0]["Sequence"]
if uniprot_crystal_protein[0]["Crystal"] == "No":
    crystal_dicts = []
else:
    crystal_dicts = lookup_uniprot_crystals[protein]
        
try:
    pfam_dicts = copy.deepcopy(lookup_uniprot_pfams[protein])  
except Exception as e:
        pfam_dicts = []        

In [11]:
print(protein)
print(full_sequence)
print(pfam_dicts)
print(crystal_dicts)

P68104
MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEAAEMGKGSFKYAWVLDKLKAERERGITIDISLWKFETSKYYVTIIDAPGHRDFIKNMITGTSQADCAVLIVAAGVGEFEAGISKNGQTREHALLAYTLGVKQLIVGVNKMDSTEPPYSQKRYEEIVKEVSTYIKKIGYNPDTVAFVPISGWNGDNMLEPSANMPWFKGWKVTRKDGNASGTTLLEALDCILPPTRPTDKPLRLPLQDVYKIGGIGTVPVGRVETGVLKPGMVVTFAPVNVTTEVKSVEMHHEALSEALPGDNVGFNVKNVSVKDVRRGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAHIACKFAELKEKIDRRSGKKLEDGPKFLKSGDAAIVDMVPGKPMCVESFSDYPPLGRFAVRDMRQTVAVGVIKAVDKKAAGAGKVTKSAQKAQKAK
[{'UniProtKB': 'P68104', 'Accession': 'PF00009', 'Identifier': 'GTP_EFTU', 'Start': 5, 'Stop': 238, 'Sequence': 'MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEAAEMGKGSFKYAWVLDKLKAERERGITIDISLWKFETSKYYVTIIDAPGHRDFIKNMITGTSQADCAVLIVAAGVGEFEAGISKNGQTREHALLAYTLGVKQLIVGVNKMDSTEPPYSQKRYEEIVKEVSTYIKKIGYNPDTVAFVPISGWNGDNMLEPSANMPWFKGWKVTRKDGNASGTTLLEALDCILPPTRPTDKPLRLPLQDVYKIGGIGTVPVGRVETGVLKPGMVVTFAPVNVTTEVKSVEMHHEALSEALPGDNVGFNVKNVSVKDVRRGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAHIACKFAELKEKIDRRSGKKLEDGPKFLKSGDAAIVDMVPGKPMCVESFSD

In [12]:
get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts)

[{'UniProtKB': 'P68104',
  'fasta_header': '>P68104_full_protein',
  'fasta_sequence': 'MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEAAEMGKGSFKYAWVLDKLKAERERGITIDISLWKFETSKYYVTIIDAPGHRDFIKNMITGTSQADCAVLIVAAGVGEFEAGISKNGQTREHALLAYTLGVKQLIVGVNKMDSTEPPYSQKRYEEIVKEVSTYIKKIGYNPDTVAFVPISGWNGDNMLEPSANMPWFKGWKVTRKDGNASGTTLLEALDCILPPTRPTDKPLRLPLQDVYKIGGIGTVPVGRVETGVLKPGMVVTFAPVNVTTEVKSVEMHHEALSEALPGDNVGFNVKNVSVKDVRRGNVAGDSKNDPPMEAAGFTAQVIILNHPGQISAGYAPVLDCHTAHIACKFAELKEKIDRRSGKKLEDGPKFLKSGDAAIVDMVPGKPMCVESFSDYPPLGRFAVRDMRQTVAVGVIKAVDKKAAGAGKVTKSAQKAQKAK',
  'notes': 'Full protein (incomplete crystal coverage)'},
 {'UniProtKB': 'P68104',
  'fasta_header': '>P68104_PF00009_GTP_EFTU_5_238',
  'fasta_sequence': 'KTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEAAEMGKGSFKYAWVLDKLKAERERGITIDISLWKFETSKYYVTIIDAPGHRDFIKNMITGTSQADCAVLIVAAGVGEFEAGISKNGQTREHALLAYTLGVKQLIVGVNKMDSTEPPYSQKRYEEIVKEVSTYIKKIGYNPDTVAFVPISGWNGDNMLEPSANMPWFKGWKVTRKDGNASGTTLLEALDCILPP',
  'notes': 'Full pfam'},
 {'UniProtKB': 'P68104',
  'f