In [6]:
import pandas as pd
from tqdm import tqdm_notebook
import csv
from itertools import compress
import requests
from xml.dom import minidom

from src.get_pfam_info import get_pfam_info
from src.get_uniprot_info import get_uniprot_info
from src.utils import filter_on_attribute
from src.protein_modeling import *

import pickle

with open("pickle_07232019_pfam_annotation.txt", "rb") as fp:   # Unpickling
    pfam_annotation = pickle.load(fp)
    
with open("pickle_07232019_crystal_annotation.txt", "rb") as fp:
    crystal_annotation = pickle.load(fp)
    
with open("pickle_07232019_protein_crystal.txt", "rb") as fp:
    protein_crystal = pickle.load(fp)
    

In [7]:
protein = 'P16403'
for pfam in pfam_annotation:
    if pfam["UniProtKB"] == protein:
        print(pfam)

{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}


In [22]:
protein = 'P16403'
lookup_uniprot_pfams = list_to_lookup(pfam_annotation, "UniProtKB")
lookup_uniprot_crystal = list_to_lookup(protein_crystal, "Protein")
lookup_uniprot_crystals = list_to_lookup(crystal_annotation, "UniProtKB")
lookup_uniprot_pfams[protein]

[{'UniProtKB': 'P16403',
  'Accession': 'PF00538',
  'Identifier': 'Linker_histone',
  'Start': 37,
  'Stop': 108,
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]

In [9]:
import csv
with open('proteins_07222019.csv', 'r') as f:
    reader = csv.reader(f)
    proteins = list(reader)[0]

In [14]:
import copy

segments_to_model = []
errors = []
indexing_errors = 0 
indexing_success = 0

for protein in proteins:
    try:
        uniprot_crystal_protein = lookup_uniprot_crystal[protein]
    except:
        errors.append(protein)
        continue
        
    if(protein == 'P16403'):
        print(copy.deepcopy(lookup_uniprot_pfams[protein]))
        
    full_sequence = uniprot_crystal_protein[0]["Sequence"]
    
    if uniprot_crystal_protein[0]["Crystal"] == "No":
        crystal_dicts = []
    else:
        crystal_dicts = lookup_uniprot_crystals[protein]
        
    try:
        pfam_dicts = copy.deepcopy(lookup_uniprot_pfams[protein])
        
    except Exception as e:
        pfam_dicts = []
        
    if(protein == 'P16403'):
        print(copy.deepcopy(lookup_uniprot_pfams[protein]))
        
    # Clean pfams to not be longer than the protein sequence
    # Choice: The pfam database is based on UniProt, so whenever pfam
    # thinks that it's pfams go longer than the sequence (derived from UniProt)
    # We should just truncate that pfam annotation
    # Update. Some of these are very off. Let's just model the whole protein in that case
    for pfam_dict in pfam_dicts:
        if (pfam_dict["Stop"] > len(full_sequence)) | (pfam_dict["Start"] > len(full_sequence)):
            pfam_dicts = []
            indexing_errors = indexing_errors + 1
            print(protein)
        else:
            indexing_success = indexing_success + 1
            
    if(protein == 'P16403'):
        print(copy.deepcopy(lookup_uniprot_pfams[protein]))
    
    for segment in get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts):
        segments_to_model.append(segment)
        
    if(protein == 'P16403'):
        print(copy.deepcopy(lookup_uniprot_pfams[protein]))
        print(get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts))

[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]
[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]
[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPK

In [15]:
protein = 'P16403'
lookup_uniprot_pfams[protein]

[{'UniProtKB': 'P16403',
  'Accession': 'PF00538',
  'Identifier': 'Linker_histone',
  'Start': 37,
  'Stop': 108,
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK'}]

In [16]:
seqs = pd.DataFrame(segments_to_model)

In [18]:
seqs[seqs.UniProtKB == "P16403"]

Unnamed: 0,UniProtKB,fasta_header,fasta_sequence,notes
175,P16403,>P16403_full_protein,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,Full protein (incomplete crystal coverage)
176,P16403,>P16403_PF00538_Linker_histone_1_213,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,Pfam with flanking region
177,P16403,>P16403_PF00538_Linker_histone_1_213,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,Pfam with flanking region


In [20]:
protein = 'P16403'
uniprot_crystal_protein = lookup_uniprot_crystal[protein]
print(uniprot_crystal_protein[protein])

TypeError: list indices must be integers or slices, not str

In [23]:
get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts)

[{'Protein': 'P16403',
  'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK',
  'Crystal': 'No'}]

In [3]:
from protein_modeling import *
import pickle
import copy

with open("pickle_07232019_pfam_annotation.txt", "rb") as fp:   # Unpickling
    pfam_annotation = pickle.load(fp)
    
with open("pickle_07232019_crystal_annotation.txt", "rb") as fp:
    crystal_annotation = pickle.load(fp)
    
with open("pickle_07232019_protein_crystal.txt", "rb") as fp:
    protein_crystal = pickle.load(fp)
    
lookup_uniprot_pfams = list_to_lookup(pfam_annotation, "UniProtKB")
lookup_uniprot_crystal = list_to_lookup(protein_crystal, "Protein")
lookup_uniprot_crystals = list_to_lookup(crystal_annotation, "UniProtKB")

protein = 'P16403'

uniprot_crystal_protein = lookup_uniprot_crystal[protein]
full_sequence = uniprot_crystal_protein[0]["Sequence"]
if uniprot_crystal_protein[0]["Crystal"] == "No":
    crystal_dicts = []
else:
    crystal_dicts = lookup_uniprot_crystals[protein]
        
try:
    pfam_dicts = copy.deepcopy(lookup_uniprot_pfams[protein])  
except Exception as e:
        pfam_dicts = []        

ModuleNotFoundError: No module named 'protein_modeling'

In [2]:
print(protein)
print(full_sequence)
print(pfam_dicts)
print(crystal_dicts)

NameError: name 'protein' is not defined

In [3]:
get_segments_to_model(protein, full_sequence, pfam_dicts, crystal_dicts)

[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK', 'Notes': 'Full pfam'}]
[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK', 'Notes': 'Full pfam'}]
[{'UniProtKB': 'P16403', 'Accession': 'PF00538', 'Identifier': 'Linker_histone', 'Start': 37, 'Stop': 108, 'Sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPA

[{'UniProtKB': 'P16403',
  'fasta_header': '>P16403_full_protein',
  'fasta_sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK',
  'notes': 'Full protein (incomplete crystal coverage)'},
 {'UniProtKB': 'P16403',
  'fasta_header': '>P16403_PF00538_Linker_histone_37_108',
  'fasta_sequence': 'GPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLN',
  'notes': 'Full pfam'},
 {'UniProtKB': 'P16403',
  'fasta_header': '>P16403_PF00538_Linker_histone_1_213',
  'fasta_sequence': 'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITKAVAASKERSGVSLAALKKALAAAGYDVEKNNSRIKLGLKSLVSKGTLVQTKGTGASGSFKLNKKAASGEAKPKVKKAGGTKPKKPVGAAKKPKKAAGGATPKKSAKKTPKKAKKPAAATVTKKVAKSPKKAKVAKPKKAAKSAAKAVKPKAAKPKVVKPKKAAPKKK',
  'notes': 'Pfam with flanking region'}]