In [1277]:
import pandas as pd
from Bio.Seq import Seq
from Bio import Entrez
import os
from typing import List, Dict

In [1278]:
os.chdir('C:\\Users\\jcham\spanins')
os.getcwd()

'C:\\Users\\jcham\\spanins'

## Goal: Return CDS for 'sequence', given 'ncbi_sequence' and 'cds' (ncbi cds).
* Note: The spanin sequences in general are subsets of one another

# Data exploration and prep

### Table dimensions are 122x13

In [1279]:
data = pd.read_csv('category_I_sequence_diff.csv')
display(data.head(8), data.shape)

Unnamed: 0,host,host_taxid,phage_acc,protein_acc,protein_gi,cds,strand,function,spanin_type,property_feature,sequence,ncbi_sequence,same
0,Aeromonas,642,NC_020879.1,YP_007677913.1,472438133,126555..126789,NEGATIVE,ISPANIN,sep,TMD:7..29,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...,N
1,Aeromonas,642,NC_019543.1,YP_007010874.1,423262275,124571..124826,NEGATIVE,ISPANIN,sep,TMD:7..29,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...,N
2,Enterobacteria,547,AP011113,,26042140,,,ISPANIN,sep,TMD:5..24,MHVSNFTAGLLLLVIAFGGTSIILKNKVERLETSVVEITKTANENA...,,
3,Enterobacteria,547,NC_024142.1,YP_009031980.1,640884671,12841..13066,POSITIVE,ISPANIN,emb,TMD:10..27,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...,N
4,Campylobacter,194,NC_016562.1,YP_004956969.1,371671013,89202..89547,NEGATIVE,ISPANIN,ovl,TMD:15..34,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,N
5,Escherichia,561,EU078592,,40093751,,,ISPANIN,emb,TMD:4..26,MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKL...,,
6,Pseudomonas,286,NC_007805.1,YP_001293406.1,148912827,37910..38360,POSITIVE,ISPANIN,emb,,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,N
7,Salmonella,590,KC139515.1,AGF88018.1,451937721,17634..17832,NEGATIVE,ISPANIN,ovl,TMD:4..26,MTGLLARIKTGVLAALVFVVALFGVWRAGRTKGKQDQINNQNNDTL...,MALFGVWRAGRTKGKQDQINNQNNDTLREQANADKNVAEVHNEINK...,N


(122, 13)

### NaN rows
* 10 of the NaN rows do not have any ncbi_sequence data (probably weren't called at all by the submitter, but were called by rohit) **Need to manual pblast these probably?**
* The other 10 do have sequence data but are missing other data
* Rows 6, 8, 9, 15, 20, 28, and 32 are missing only the property_feature data. These rows should be manageable with the planned script
* Rows 45, 79 and 99 have sequence data but no cds or strand. Probably need to be dealt with manually. 
* All others remaining (2, 5, 25, 40, 41, 42, 74, 81, 114, 118) were not called by the ncbi submitter.

In [1280]:
data[data.isna().any(axis=1)]

Unnamed: 0,host,host_taxid,phage_acc,protein_acc,protein_gi,cds,strand,function,spanin_type,property_feature,sequence,ncbi_sequence,same
2,Enterobacteria,547,AP011113,,26042140,,,ISPANIN,sep,TMD:5..24,MHVSNFTAGLLLLVIAFGGTSIILKNKVERLETSVVEITKTANENA...,,
5,Escherichia,561,EU078592,,40093751,,,ISPANIN,emb,TMD:4..26,MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKL...,,
6,Pseudomonas,286,NC_007805.1,YP_001293406.1,148912827,37910..38360,POSITIVE,ISPANIN,emb,,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,N
8,Salmonella,590,NC_010392.1,YP_001700615.1,169257238,30405..30858,NEGATIVE,ISPANIN,emb,,MMFNWKTMFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLK...,MFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLKLANETIT...,N
9,Salmonella,590,NC_010393.1,YP_001700673.1,169257297,17493..17973,POSITIVE,ISPANIN,emb,,MMFNWKTMFVGLLLVSLIVAGRLANHYRNNAITYKEQRDTVTHRLT...,MFVGLLLVSLIVAGRLANHYRNNAITYKEQRDTVTHRLTLANATIT...,N
15,Escherichia,561,NC_024379.1,YP_009044294.1,658607293,36334..36736,POSITIVE,ISPANIN,emb,,MLEFLKRAAPWLLAAVMFAGGYHTANNKWEAKVNAEYTSNLKASED...,MFAGGYHTANNKWEAKVNAEYTSNLKASEDTRLAVQAEVNKVSKRF...,N
20,Burkholderia,32008,NC_009234.1,YP_001111073.1,134288770,32853..33276,NEGATIVE,ISPANIN,ovl,,MNLSRLMPWLALFALIALAASCQHGRALRAQLERATDDARRANRDA...,MPWLALFALIALAASCQHGRALRAQLERATDDARRANRDAQASAAV...,N
25,Yersinia,629,HE956707,,398313030,,,ISPANIN,ovl,TMD:10..27,MLTIPNKYKWAVMALLAAVSIGSLTLANHYRDSALTSQKALQEVTD...,,
28,Yersinia,629,AM076770,CAJ28448.1,164414553,18144..18336,POSITIVE,ISPANIN,ovl,,MLGKLKIAVMLMIAAVLAWKAGSWNGARVERSVQIAECNNRIEKLA...,MQIAECNNRIEKLAAELEAEKAKKKVEVTKSASKTKQSVLVATDSD...,N
32,Stenotrophomonas,40323,NC_023588.1,YP_009008371.1,589892004,9447..9939,POSITIVE,ISPANIN,emb,,MLYRALALAALVLATAGLFSYQQGRISRATTALDKANLDLAKARSE...,MLATAGLFSYQQGRISRATTALDKANLDLAKARSENAALTSSLKLA...,N


### Creating data2 DF, which drops irrelevant columns and 13 NaN rows
* ```data2``` = original data minus rows with NaN values for sequence, ncbi_sequence or cds columns
* ```data2``` contains 7 more rows than simply running dropna() on ```data``` (as some rows have NaN values in irrelevant columns)


In [1281]:
data2 = data[['phage_acc', 'strand', 'cds', 'sequence', 'ncbi_sequence']].copy()
data2 = data2.dropna(axis=0)
display(data2.head(3), data2.shape)

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence
0,NC_020879.1,NEGATIVE,126555..126789,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...
1,NC_019543.1,NEGATIVE,124571..124826,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...
3,NC_024142.1,POSITIVE,12841..13066,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...


(109, 5)

### Creating list of the protein sequences (rohit's spanin calls vs NCBI's)

In [1282]:
rohit_calls: List[str] = list(data2['sequence'])
ncbi_calls: List[str] = list(data2['ncbi_sequence'])
[v for v in ncbi_calls][:2]

['MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQMEKIAEKKPQLLEKRMNMGFQKLADQLQESTK',
 'MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKSPKATKQMEKIAEKKPQLLEKRMNMGFQKLADQLQESTK']

### Iterates over ncbi/rohit calls, checking if each sequence is a subset of the other, or if they are identical

In [1283]:
ncbi_subset_of_rohit: List[bool] = [v in rohit_calls[i] for i, v in enumerate(ncbi_calls)]
rohit_subset_of_ncbi: List[bool] = [v in ncbi_calls[i] for i, v in enumerate(rohit_calls)]
identical: List[bool] = [v == ncbi_calls[i] for i, v in enumerate(rohit_calls)]
print(' ncbi seqs contained in rohit seqs:', sum(ncbi_subset_of_rohit), 
      '\n', 'rohit seqs contained in ncbi seqs:', sum(rohit_subset_of_ncbi), 
      '\n', 'sequence calls are the same:', sum(identical),
      '\n', 'total seqs in data:', data2.shape[0])

 ncbi seqs contained in rohit seqs: 57 
 rohit seqs contained in ncbi seqs: 10 
 sequence calls are the same: 0 
 total seqs in data: 109


### Adding these subset seqs to a df

In [1284]:
data2['ncbi_subset_of_rohit'] = ncbi_subset_of_rohit
data2['rohit_subset_of_ncbi'] = rohit_subset_of_ncbi
print(data2['sequence'].loc[4], data2['ncbi_sequence'].loc[4])

MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTIEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF


# Functions

### Finds insertions/deletions between two strings
- **Only true for one row at the beginning of the data. so insertions/deletions were not as common as it first appeared..**

In [1285]:
def find_difference_two_strings(str1: str, str2: str):
    from difflib import ndiff
    return [li for li in ndiff(str1, str2) if li[0] != ' ']

In [1286]:
row4_rohit = data2['sequence'].loc[4]
row4_ncbi = data2['ncbi_sequence'].loc[4]

print(' rohit:', row4_rohit, '\n', 'ncbi: ', row4_ncbi, '\n difference:', find_difference_two_strings(row4_rohit, row4_ncbi))

 rohit: MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF 
 ncbi:  MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTIEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF 
 difference: ['+ I']


*May need this code to find the index where the insertions/deletions occur at as well.*

In [1287]:
# Attempting to find the index of the difference between two strings
from difflib import ndiff, SequenceMatcher
test_a = 'MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF'
test_b = 'MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEKQLTQNIKNSKKELEALKNYNNLTIEVFREKEVKYKEVLNNIKNIETKIQKLKLMRKDENETQYIIVNF'

display(SequenceMatcher(None, test_a, test_b).get_matching_blocks(), test_b[69])

[Match(a=0, b=0, size=69),
 Match(a=69, b=70, size=44),
 Match(a=113, b=114, size=0)]

'I'

In [1288]:
def check_if_begin_end_same(seq1: str, seq2:str) -> bool:
    '''
    Tells you if the first three letters and last three chars of a string are both the same. 
    If not, an insertion, deletion or frameshift is likely.
    '''
    if (seq1.startswith(seq2[:3]) and seq1.endswith(seq2[-3:])):
        return True
    else:
        return False

In [1289]:
# Running function on all spanin sequence data
data2['begin_end_same'] = [check_if_begin_end_same(rohit_calls[i], ncbi_calls[i]) for i, v in enumerate(rohit_calls)]
display(data2[data2['begin_end_same'] == True])

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same
4,NC_016562.1,NEGATIVE,89202..89547,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,False,False,True


### Find the *length* difference between two strings, then based on which string is a subset of one another, determine the new cds
* Need to parse sequence and ncbi_sequence against one another, checking if one is a subset of another.
* If so, determine the difference in lengths between the larger one and the smaller
* Multiply this difference by three. The starting CDS should be approximately that many bases before or after the NCBI CDS (or +1 the NCBI CDS due to messed up indexing?)

In [1290]:
def length_difference_two_strings(str1: str, str2: str) -> int:
    return abs(len(str1) - len(str2))

In [1291]:
def check_if_subset(str1: str, str2: str) -> bool:
    return str1 in str2

In [1292]:
def convert_ncbi_cds(cds: str) -> List[int]:
    """
    Splits ncbi cds data (Ex: '123..6789') into a list of two integers
    """
    return cds.split('..')

In [1293]:
# def find_new_cds(seq1_cds: str, seq1: str, seq2: str, strand: str) -> List[int]:
#     """
#     Finds a new cds given a previous cds, the old protein sequence, and a new protein sequence IFF one is a subset of the other.
#     This assumes the annotations are in the same reading frame.
#     It also assumes seq2's annotations are correct and the seq1 ones are not.
#     """
#     # Converts ncbi cds
#     cds: List[str] = seq1_cds.split('..')
#     cds: List[int] = [int(x) for x in cds]
    
#     # Finds difference in length between two seqs
#     length_difference: int = length_difference_two_strings(seq1, seq2)
    
#     # Determines which is a subset of the other
#     seq1_subset_of_seq2: bool = check_if_subset(seq1, seq2)
#     seq2_subset_of_seq1: bool = check_if_subset(seq2, seq1)
    
#     # Checks if they begin the same or end the same (first and last 3 characters)
#     begin_same: bool = seq1.startswith(seq2[:3])
#     end_same: bool = seq1.endswith(seq2[-3:])
    
#     # Determines how much to shift the cds
#     # Multiplies by 3, because cds indexes by nucleotides while passed seqs are protein seqs
#     cds_shift: int = length_difference*3
    
#     if strand == 'POSITIVE':
#         # seq1 = ncbi_seq and seq2 = rohit_seq
#         if seq1_subset_of_seq2:
#             if begin_same:
#                 # extends seq1 cds end to seq2 cds end
#                 cds[1] = cds[1] + cds_shift
#                 return cds

#             if end_same:
#                 # extends seq1 cds start to seq2 cds start
#                 cds[0] = cds[0] - cds_shift
#                 return cds

#         elif seq2_subset_of_seq1:
#             # reduces seq1 cds end to the seq2 cds end
#             if begin_same:
#                 cds[1] = cds[1] - cds_shift 
#                 return cds

#             if end_same:
#                 # reduces seq1 cds to match seq2 cds start
#                 cds[0] = cds[0] + cds_shift
#                 return cds
            
#     if strand == 'NEGATIVE':
#     # Flipped the signs for all. Need to verify that is correct.
#         if seq1_subset_of_seq2:
#             if begin_same:
#                 cds[1] = cds[1] - cds_shift
#                 return cds

#             if end_same:
#                 cds[1] = cds[1] + cds_shift
#                 return cds

#         elif seq2_subset_of_seq1:
#             if begin_same:
#                 cds[1] = cds[1] + cds_shift 
#                 return cds

#             if end_same:
#                 cds[0] = cds[0] - cds_shift
#                 return cds
        
#     if begin_same:
#         if end_same:
#             # Imperfectly checks if there are insertions or deletions
#             # Probably not worth writing the logic to handle this. Only one case in the data.
#             return [1,1]
    
#         # Probably need entirely different logic to handle this case
#     if (not begin_same and not end_same):
#         return [0,0]
    
#     else:
#         # these cases do not match because the ncbi_seq uses an uncommon start codon upstream of the one rohit selected (V, L or I mostly it looks like).
#         # Handling these by finding the difference in length between ncbi_sequence and sequence, and changing the new cds to match rohit's sequence
#         if strand == 'POSITIVE':
#             if end_same:
#                 cds[0] = cds[0] - cds_shift 
#                 return cds
#         if strand == 'NEGATIVE':
#             if end_same:
#                 cds[0] = cds[0] + cds_shift
#                 return cds
    
#     # failsafe to avoid returning None
#     return [2,2]

In [1294]:
def find_new_cds(seq1_cds: str, seq1: str, seq2: str, strand: str) -> List[int]:
    """
    Finds a new cds given a previous cds, the old protein sequence, and a new protein sequence IFF one is a subset of the other.
    This assumes the annotations are in the same reading frame.
    It also assumes seq2's annotations are correct and the seq1 ones are not.
    """
    # Converts ncbi cds
    cds: List[str] = seq1_cds.split('..')
    cds: List[int] = [int(x) for x in cds]
    
    # Finds difference in length between two seqs
    length_difference: int = length_difference_two_strings(seq1, seq2)
    
    # Determines which is a subset of the other
    seq1_subset_of_seq2: bool = check_if_subset(seq1, seq2)
    seq2_subset_of_seq1: bool = check_if_subset(seq2, seq1)
    
    # Checks if they begin the same or end the same (first and last 3 characters)
    begin_same: bool = seq1.startswith(seq2[:3])
    end_same: bool = seq1.endswith(seq2[-3:])
    
    # Determines how much to shift the cds
    # Multiplies by 3, because cds indexes by nucleotides while passed seqs are protein seqs
    cds_shift: int = length_difference*3
    
    if strand == 'POSITIVE':
        # seq1 = ncbi_seq and seq2 = rohit_seq
        if seq1_subset_of_seq2:
            if begin_same:
                # extends seq1 cds end to seq2 cds end
                cds[1] = cds[1] + cds_shift
                return cds

            if end_same:
                # extends seq1 cds start to seq2 cds start
                cds[0] = cds[0] - cds_shift
                return cds

        elif seq2_subset_of_seq1:
            # reduces seq1 cds end to the seq2 cds end
            if begin_same:
                cds[1] = cds[1] - cds_shift 
                return cds

            if end_same:
                # reduces seq1 cds to match seq2 cds start
                cds[0] = cds[0] + cds_shift
                return cds
            
    if strand == 'NEGATIVE':
    # Flipped the signs for all. Need to verify that is correct.
        if seq1_subset_of_seq2:
            if begin_same:
                cds[1] = cds[1] - cds_shift
                return cds

            if end_same:
                cds[1] = cds[1] + cds_shift
                return cds

        elif seq2_subset_of_seq1:
            if begin_same:
                cds[0] = cds[0] + cds_shift 
                return cds

            if end_same:
                cds[1] = cds[1] - cds_shift
                return cds
        
    if begin_same:
        if end_same:
            # Imperfectly checks if there are insertions or deletions
            # Probably not worth writing the logic to handle this. Only one case in the data.
            return [1,1]
    
        # Probably need entirely different logic to handle this case
    if (not begin_same and not end_same):
        return [0,0]
    
    else:
        # these cases do not match because the ncbi_seq uses an uncommon start codon upstream of the one rohit selected (V, L or I mostly it looks like).
        # Handling these by finding the difference in length between ncbi_sequence and sequence, and changing the new cds to match rohit's sequence
        if strand == 'POSITIVE':
            if end_same:
                cds[0] = cds[0] - cds_shift 
                return cds
        if strand == 'NEGATIVE':
            if end_same:
                cds[1] = cds[1] + cds_shift
                return cds
    
    # failsafe to avoid returning None
    return [2,2]

## Applying the function

In [1295]:
data2['new_cds'] = data2.apply(lambda x: find_new_cds(x['cds'], x['ncbi_sequence'], x['sequence'], x['strand']), axis=1)
data2.head(7)

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds
0,NC_020879.1,NEGATIVE,126555..126789,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...,True,False,False,"[126555, 126873]"
1,NC_019543.1,NEGATIVE,124571..124826,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...,True,False,False,"[124571, 124889]"
3,NC_024142.1,POSITIVE,12841..13066,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...,True,False,False,"[12559, 13066]"
4,NC_016562.1,NEGATIVE,89202..89547,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,False,False,True,"[1, 1]"
6,NC_007805.1,POSITIVE,37910..38360,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,False,False,False,"[37889, 38360]"
7,KC139515.1,NEGATIVE,17634..17832,MTGLLARIKTGVLAALVFVVALFGVWRAGRTKGKQDQINNQNNDTL...,MALFGVWRAGRTKGKQDQINNQNNDTLREQANADKNVAEVHNEINK...,False,False,False,"[17634, 17889]"
8,NC_010392.1,NEGATIVE,30405..30858,MMFNWKTMFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLK...,MFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLKLANETIT...,True,False,False,"[30405, 30879]"


In [1296]:
data2[data2['new_cds'].isnull()]

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds


In [1297]:
data.loc[62:66]

Unnamed: 0,host,host_taxid,phage_acc,protein_acc,protein_gi,cds,strand,function,spanin_type,property_feature,sequence,ncbi_sequence,same
62,Pseudomonas,286,NC_020203.1,YP_007392789.1,448245071,12285..12399,POSITIVE,OSPANIN,ovl,LIPO:17..20,MPWPKPLLIALPAAFLLASCSSSKPPVNVPPRPLPAALAQPCPTPV...,MTDDSPDATAIALKQLYDQYGVCAGLHWDTVRHLQKD,N
63,Pseudomonas,286,JN811560.1,AEY99484.1,374112688,12561..12675,POSITIVE,OSPANIN,ovl,LIPO:17..20,MPWPKPLLIALPAAFLLASCSSSKPPVNVPPRPLPAALAQPCPTPV...,MTDDSPDATAIALKQLYDQYGLCAGLHWDTVRHLQKD,N
64,Pseudomonas,286,NC_020198.1,YP_007392334.1,448244611,12392..12506,POSITIVE,OSPANIN,ovl,LIPO:17..20,MPWPKPLLIALPAAFLLASCSSSKPPVNVPPRPLPAALAQPCPTPV...,MTDDSPDATAIALKQLYDQYGLCAGLHWDTVRHLQKD,N
65,Pseudomonas,286,NC_020202.1,YP_007392731.1,448245012,12344..12458,POSITIVE,OSPANIN,ovl,LIPO:17..20,MPWPKPLLIALPAAFLLASCSSSKPPVNVPPRPLPAALAQPCQTPV...,MTDDSPDAAAIALKQLYDQYGACAGLHWDTVRHLQKD,N
66,Pseudomonas,286,NC_020200.1,YP_007392432.1,448244711,11426..11540,POSITIVE,OSPANIN,ovl,LIPO:17..20,MPWPKPLLIALPAAFLLASCSSSKPPVNVPPRPLPAALAQPCPTPV...,MTDDSPDAAAIALKQLYDQYGACAGLHWDTVRHFQKD,N


In [1298]:
list(data2['cds'].apply(lambda x: x.split('..')))

[['126555', '126789'],
 ['124571', '124826'],
 ['12841', '13066'],
 ['89202', '89547'],
 ['37910', '38360'],
 ['17634', '17832'],
 ['30405', '30858'],
 ['17493', '17973'],
 ['132794', '133061'],
 ['59169', '59394'],
 ['26697', '26961'],
 ['23912', '24308'],
 ['11928', '12216'],
 ['36334', '36736'],
 ['6803', '7103'],
 ['20330', '20630'],
 ['41067', '41313'],
 ['22226', '22775'],
 ['32853', '33276'],
 ['42438', '42759'],
 ['23635', '23845'],
 ['167032', '167455'],
 ['24156', '24534'],
 ['134824', '135121'],
 ['11233', '11458'],
 ['18144', '18336'],
 ['46829', '47042'],
 ['129054', '129321'],
 ['43580', '43853'],
 ['9447', '9939'],
 ['137726', '138050'],
 ['136877', '137234'],
 ['58896', '59115'],
 ['28053', '28608'],
 ['132260', '132545'],
 ['234503', '234857'],
 ['23239', '23539'],
 ['210927', '211197'],
 ['132642', '132858'],
 ['35414', '35684'],
 ['9381', '9528'],
 ['16184', '16514'],
 ['79273', '79471'],
 ['36139', '36415'],
 ['4906', '5065'],
 ['115946', '116180'],
 ['5604', '6030'

In [1299]:
from Bio import Entrez
from Bio import SeqIO


class GetPhageSeq:
    """
    Connnects to the NUCCORE database and retrieves phage sequence.
        Email : Email used by NCBI to notifiy with errors and problems
        acc : Accession use to query database
        db : database
    """
    def __init__(self,email,acc,db):
        self.email = email
        self.acc = acc
        self.db = db
        Entrez.email = self.email
    
    def get_and_return_sequence(self, ret_type):
        """
        Uses input params to retrieve the sequence from NCBI
        """

        sequence = Entrez.efetch(db=self.db, id=self.acc, rettype=ret_type, retmode="text")
        read_fasta = str(SeqIO.read(sequence,"fasta").seq)

        return read_fasta



# if __name__ == '__main__':

#     payload = {
#         "email" : "curtisross@tamu.edu",
#         "acc" : "NC_001416.1",
#         "db" : "nuccore"
#     }

    gps = GetPhageSeq(**payload).get_and_return_sequence(ret_type="fasta")
    #sequence = str(SeqIO.read(gps,"fasta").seq)


    # print(gps)
    # print(len(gps))
    # print(sequence)

In [1300]:
payload = {
        "email" : "curtisross@tamu.edu",
        "acc" : "NC_020879.1",
        "db" : "nuccore"
    }
from Bio.Seq import Seq

phage1 = Seq(GetPhageSeq(**payload).get_and_return_sequence(ret_type="fasta"))


# Phage "NC_007805.1" 

In [1301]:
payload3 = {
        "email" : "curtisross@tamu.edu",
        "acc" : "NC_007805.1",
        "db" : "nuccore"
    }
phage3pos_strand = Seq(GetPhageSeq(**payload3).get_and_return_sequence(ret_type="fasta"))

In [1302]:
str(phage3pos_strand[37910:38360].translate(to_stop=True))

'VVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQAQVQTDALARTYQAALQASHEENQLRRDAIGTGARVVYVKARCPAGGVHQAPGATGSADAGRAVLAAADGQVVSDLRAGVERRELMIAALRKHIAGLPRYCRR'

In [1303]:
str(phage3pos_strand[37889:38360].translate(to_stop=True))

'MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQAQVQTDALARTYQAALQASHEENQLRRDAIGTGARVVYVKARCPAGGVHQAPGATGSADAGRAVLAAADGQVVSDLRAGVERRELMIAALRKHIAGLPRYCRR'

In [1304]:
data2

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds
0,NC_020879.1,NEGATIVE,126555..126789,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...,True,False,False,"[126555, 126873]"
1,NC_019543.1,NEGATIVE,124571..124826,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...,True,False,False,"[124571, 124889]"
3,NC_024142.1,POSITIVE,12841..13066,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...,True,False,False,"[12559, 13066]"
4,NC_016562.1,NEGATIVE,89202..89547,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,False,False,True,"[1, 1]"
6,NC_007805.1,POSITIVE,37910..38360,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,False,False,False,"[37889, 38360]"
...,...,...,...,...,...,...,...,...,...
116,NC_021794.1,POSITIVE,41131..41452,MEKTSNTSKNFMIVILMMLFSIACFAQDKNYVSVHRDSLYNALLNI...,MMLFSIACFAQDKNYVSVHRDSLYNALLNISELKVENAFLRKKNDN...,True,False,False,"[41083, 41452]"
117,NC_021789.1,POSITIVE,44760..45078,MEKTSNTSKNFMIVILMMLFSIACFAQDKNYVSVHRDSLYNALLNI...,MLFSIACFAQDKNYVSVHRDSLYNALLNISELKVENAFLRKKNDNQ...,True,False,False,"[44709, 45078]"
119,NC_019540.1,NEGATIVE,20740..21175,MKRLRQVLLMLGMVLALSGCSAVTSAVTDRLAGGDKPAVGIDTEIV...,MLSRLCQKHLDLQGTTMKRLRQVLLMLGMVLALSGCSAVTSAVTDR...,False,True,False,"[20740, 21127]"
120,NC_024791.1,NEGATIVE,629..968,MTYVRNKMKSLLVLMGLVLALSTNGCSSLTPVEIAKDVLLPDQQSG...,MGLVLALSTNGCSSLTPVEIAKDVLLPDQQSGITVDTQIGDKEYAL...,True,False,False,"[629, 1010]"


In [1305]:
def test_phage_sequence(acc: str, cds: str, new: List[int], strand: str, rohitseq: str):
           
    payload = {
        "email" : "curtisross@tamu.edu",
        "acc" : f"{acc}",
        "db" : "nuccore"
    }
    
    # Imports genome as FASTA
    genome = Seq(GetPhageSeq(**payload).get_and_return_sequence(ret_type="fasta"))
    
    # Converts ncbi cds
    old_cds: List[str] = cds.split('..')
    old_cds: List[int] = [int(x) for x in old_cds]
        
    start: int = old_cds[0]
    end: int = old_cds[1]
    
    if strand == 'POSITIVE':
        if rohitseq == str(genome[new[0]:new[1]].translate(to_stop=True)):
            return True
        else:
            return str(find_difference_two_strings(rohitseq, str(genome[new[0]:new[1]].translate(to_stop=True))))
        
    if strand == 'NEGATIVE':
        if rohitseq == str(genome[new[0]:new[1]].reverse_complement().translate(to_stop=True)):
            return True
        else:
            # return rohitseq + '   ' + str(genome[new_cds[0]:new_cds[1]].reverse_complement().translate(to_stop=True))
            #return str(genome[126471:126789].reverse_complement().translate(to_stop=True))
            #return rohitseq == str(genome[new_cds[0]:new_cds[1]].reverse_complement().translate(to_stop=True))
            return find_difference_two_strings(rohitseq, str(genome[new[0]:new[1]].reverse_complement().translate(to_stop=True)))

In [1306]:
data2['verified'] = data2.apply(lambda x: test_phage_sequence(x['phage_acc'], x['cds'], x['new_cds'], x['strand'], x['sequence']), axis=1)

In [1307]:
data2.head()

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds,verified
0,NC_020879.1,NEGATIVE,126555..126789,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...,True,False,False,"[126555, 126873]",True
1,NC_019543.1,NEGATIVE,124571..124826,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...,True,False,False,"[124571, 124889]",True
3,NC_024142.1,POSITIVE,12841..13066,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...,True,False,False,"[12559, 13066]","['+ G', '+ I', '- K', '- M', '- L', '+ A', '+ ..."
4,NC_016562.1,NEGATIVE,89202..89547,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,False,False,True,"[1, 1]","[- M, - Q, - M, - F, - N, - F, - L, - F, - S, ..."
6,NC_007805.1,POSITIVE,37910..38360,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,False,False,False,"[37889, 38360]",True


In [1308]:
true = data2[data2['verified'] == True]
true

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds,verified
0,NC_020879.1,NEGATIVE,126555..126789,MLLTLSSLLSWLKSNALCIIIMVLMAIMMKNQHDEISTLKTSLESM...,MKNQHDEISTLKTSLESMKSFQTKSYENAKPVTEALLKSPKATKQM...,True,False,False,"[126555, 126873]",True
1,NC_019543.1,NEGATIVE,124571..124826,MLLTLSSLLSWLKSNALYIIIMVLMAIMMKNQHDEISTLKNSLESM...,MVLMAIMMKNQHDEISTLKNSLESMKSFQTKSYENAKPVTEALLKS...,True,False,False,"[124571, 124889]",True
6,NC_007805.1,POSITIVE,37910..38360,MRWVPWLVVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQ...,MVALVAALVFWRLDHVTAQRNDLQAAVEQSAETITAMAQQAQRDTQ...,False,False,False,"[37889, 38360]",True
7,KC139515.1,NEGATIVE,17634..17832,MTGLLARIKTGVLAALVFVVALFGVWRAGRTKGKQDQINNQNNDTL...,MALFGVWRAGRTKGKQDQINNQNNDTLREQANADKNVAEVHNEINK...,False,False,False,"[17634, 17889]",True
8,NC_010392.1,NEGATIVE,30405..30858,MMFNWKTMFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLK...,MFVGLLLVSLIVAGRLANHYRNNAITYKYQRDTATHNLKLANETIT...,True,False,False,"[30405, 30879]",True
...,...,...,...,...,...,...,...,...,...,...
116,NC_021794.1,POSITIVE,41131..41452,MEKTSNTSKNFMIVILMMLFSIACFAQDKNYVSVHRDSLYNALLNI...,MMLFSIACFAQDKNYVSVHRDSLYNALLNISELKVENAFLRKKNDN...,True,False,False,"[41083, 41452]",True
117,NC_021789.1,POSITIVE,44760..45078,MEKTSNTSKNFMIVILMMLFSIACFAQDKNYVSVHRDSLYNALLNI...,MLFSIACFAQDKNYVSVHRDSLYNALLNISELKVENAFLRKKNDNQ...,True,False,False,"[44709, 45078]",True
119,NC_019540.1,NEGATIVE,20740..21175,MKRLRQVLLMLGMVLALSGCSAVTSAVTDRLAGGDKPAVGIDTEIV...,MLSRLCQKHLDLQGTTMKRLRQVLLMLGMVLALSGCSAVTSAVTDR...,False,True,False,"[20740, 21127]",True
120,NC_024791.1,NEGATIVE,629..968,MTYVRNKMKSLLVLMGLVLALSTNGCSSLTPVEIAKDVLLPDQQSG...,MGLVLALSTNGCSSLTPVEIAKDVLLPDQQSGITVDTQIGDKEYAL...,True,False,False,"[629, 1010]",True


In [1309]:
false = data2[data2['verified'] != True]
false

Unnamed: 0,phage_acc,strand,cds,sequence,ncbi_sequence,ncbi_subset_of_rohit,rohit_subset_of_ncbi,begin_end_same,new_cds,verified
3,NC_024142.1,POSITIVE,12841..13066,MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQK...,MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEEL...,True,False,False,"[12559, 13066]","['+ G', '+ I', '- K', '- M', '- L', '+ A', '+ ..."
4,NC_016562.1,NEGATIVE,89202..89547,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,MQMFNFLFSFIKSNIIYILLGSLLAFTAYRYISLEKSNAILIENEK...,False,False,True,"[1, 1]","[- M, - Q, - M, - F, - N, - F, - L, - F, - S, ..."
14,NC_005884.1,POSITIVE,11928..12216,MYKIAIGLALGLALFSLWSYYSLQLTKQELATTKEALADRSKEVEF...,MATTKEALADRSKEVEFLNTSLQLRDKVALQTAEREGAINAQLQRI...,False,False,False,"[11841, 12216]","['- M', '+ L']"
16,GQ422450.1,POSITIVE,6803..7103,MSRIKAIIASVIICIIVCLSWAVNHYRDNAITYKEQRDKATSIIAD...,MQKRQRDVAELDARYTKELADANATIETLRADVSAGRKRLQVSATC...,True,False,False,"[6665, 7103]","['- M', '+ V']"
17,GQ422451.1,POSITIVE,20330..20630,MSRIKAIIASVIICIIVCLSWAVNHYRDNAITYKEQRDKATSIIAD...,MQKRQRDVAELDARYTKELADANATIETLRADVSAGRKRLQVSATC...,True,False,False,"[20192, 20630]","['- M', '+ V']"
21,NC_019926.1,POSITIVE,42438..42759,MRKIYVVIITTIVMAGLIWAFIATQVNTGVTSKRQEDALAVSEANV...,MAGLIWAFIATQVNTGVTSKRQEDALAVSEANVGIGKEAKDQGEQA...,True,False,False,"[42399, 42759]","['- M', '+ V']"
23,NC_016163,NEGATIVE,167032..167455,MDSILNNFVSSYKNIILAICIAIVVVVIGLFLNGIADNAKASTEWE...,MDSILNNFVSSYKNIILAICIAIVVVVIGLFLNGIADNAKASTEWE...,False,True,False,"[167035, 167455]",[+ L]
28,AM076770,POSITIVE,18144..18336,MLGKLKIAVMLMIAAVLAWKAGSWNGARVERSVQIAECNNRIEKLA...,MQIAECNNRIEKLAAELEAEKAKKKVEVTKSASKTKQSVLVATDSD...,False,False,False,"[18048, 18336]","['- M', '+ L']"
34,NC_024794.1,NEGATIVE,136877..137234,MLKVNPIYAIVAAFALVSTVTIVVLNNKVDSLNTELASVKETAKNN...,MLKVNPIYAIVAAFALVSTVTIVVLNNKVDSLNTELASVKETAKNN...,False,True,False,"[137081, 137234]",[+ D]
38,NC_023568.1,POSITIVE,234503..234857,MFSQNKIYVLLAVAAVFIGSIGYLKYENVSLEKDLVTQTAEVERLT...,MAVAAVFIGSIGYLKYENVSLEKDLVTQTAEVERLTGDNERLQTTI...,False,False,False,"[234473, 234857]","['- M', '+ V']"


## Phage 34 = 137084:137234, NC_024794.1 (did this one manually)

In [1310]:
payload46 = {
        "email" : "curtisross@tamu.edu",
        "acc" : "NC_023576.1",
        "db" : "nuccore"
    }
from Bio.Seq import Seq

phage46 = Seq(GetPhageSeq(**payload46).get_and_return_sequence(ret_type="fasta"))
# negative strand so need to rev_complement

print('', phage46[35414:35684].translate(), '\n', phage46[35369:35684].translate()),'\n', 'length:', len(phage46)

 MQNTPRILRHRKIQGLLSKLKSTKCPSGFRTKCPRWKAALIGLLLTLTAITSGCASKSTPQVSPSQISVDASLMVESNYTQKLLKVLSE* 
 LVATTPLTISGRLRSMQNTPRILRHRKIQGLLSKLKSTKCPSGFRTKCPRWKAALIGLLLTLTAITSGCASKSTPQVSPSQISVDASLMVESNYTQKLLKVLSE*


(None, '\n', 'length:', 39207)

In [1311]:
data2['sequence'][46], data2['ncbi_sequence'][46]

('MLSKLKSTKCPSGFRTKCPRWKAALIGLLLTLTAITSGCASKSTPQVSPSQISVDASLMVESNYTQKLLKVLSE',
 'MQNTPRILRHRKIQGLLSKLKSTKCPSGFRTKCPRWKAALIGLLLTLTAITSGCASKSTPQVSPSQISVDASLMVESNYTQKLLKVLSE')

In [1312]:
136877 + 204

137081

In [1313]:
136877 - 204

136673

In [1314]:
begin_7: bool = data2['sequence'][7].startswith(data2['ncbi_sequence'][7][:3])
end_7: bool = data2['sequence'][7].endswith(data2['ncbi_sequence'][7][-3:])
display(begin_7, end_7)

False

True

In [1315]:
payload14 = {
        "email" : "curtisross@tamu.edu",
        "acc" : "NC_005884.1",
        "db" : "nuccore"
    }
from Bio.Seq import Seq

phage14 = Seq(GetPhageSeq(**payload14).get_and_return_sequence(ret_type="fasta"))

print('', phage14[11928:12216].translate(to_stop=True), '\n', phage14[11841:12216].translate(to_stop=True),'\n', 'length:', len(phage14))

 LATTKEALADRSKEVEFLNTSLQLRDKVALQTAEREGAINAQLQRIYATVSKYKSQTNSASDQCLGLVPSPEFLEWVRRAEADSKDKSGAASSNK 
 LYKIAIGLALGLALFSLWSYYSLQLTKQELATTKEALADRSKEVEFLNTSLQLRDKVALQTAEREGAINAQLQRIYATVSKYKSQTNSASDQCLGLVPSPEFLEWVRRAEADSKDKSGAASSNK 
 length: 43783


In [1316]:
data2['sequence'][14], data2['ncbi_sequence'][14]

('MYKIAIGLALGLALFSLWSYYSLQLTKQELATTKEALADRSKEVEFLNTSLQLRDKVALQTAEREGAINAQLQRIYATVSKYKSQTNSASDQCLGLVPSPEFLEWVRRAEADSKDKSGAASSNK',
 'MATTKEALADRSKEVEFLNTSLQLRDKVALQTAEREGAINAQLQRIYATVSKYKSQTNSASDQCLGLVPSPEFLEWVRRAEADSKDKSGAASSNK')

In [1317]:
data2['sequence'][16], data2['ncbi_sequence'][16]

('MSRIKAIIASVIICIIVCLSWAVNHYRDNAITYKEQRDKATSIIADMQKRQRDVAELDARYTKELADANATIETLRADVSAGRKRLQVSATCPKSTTGASGMGDGESPRLTADAELNYYRLRSGIDRITAQVNYLQEYIRSQCLK',
 'MQKRQRDVAELDARYTKELADANATIETLRADVSAGRKRLQVSATCPKSTTGASGMGDGESPRLTADAELNYYRLRSGIDRITAQVNYLQEYIRSQCLK')

In [1318]:
display(data2['sequence'][3], data2['ncbi_sequence'][3])

'MKMLISKGWPYLLVVVLGATIYFWGNSNGQSTVQKKWDDQKVEDQKAMQKLQDKYNALQRNHSYEVGLLTSRLQTAESNYASELARVSSDYDSRMQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEELRATVRLRDSQLIELGKQIQADRKLFEQE'

'MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEELRATVRLRDSQLIELGKQIQADRKLFEQE'

In [1319]:
payload_3 = {
        "email" : "curtisross@tamu.edu",
        "acc" : "NC_024142.1",
        "db" : "nuccore"
    }
phage_3pos_strand = Seq(GetPhageSeq(**payload3).get_and_return_sequence(ret_type="fasta"))

## Genome was updated to NC_024142.2 therefore this one fails as the data is based on NC_024142.1

In [1320]:
# What is going on here? The other cases of POSITIVE strandk, ncbi_subset_of_rohit are working properly
phage_3pos_strand[12841:13066].translate()

Seq('LNSAALNSAAHSAVPGPEPIIPGYAFTWRPIVRVGDDDVTPLLTGEIEVDREEG...TDW', ExtendedIUPACProtein())

In [1321]:
display(data2['verified'][3])

"['+ G', '+ I', '- K', '- M', '- L', '+ A', '+ V', '- S', '- K', '- G', '- W', '- Y', '- L', '- L', '+ N', '+ T', '+ T', '+ P', '+ V', '+ D', '+ E', '- V', '- L', '+ T', '+ F', '+ T', '+ G', '+ V', '+ V', '- T', '- I', '- Y', '- F', '- G', '- N', '- S', '- N', '- G', '- Q', '- S', '- T', '- V', '- Q', '- K', '- K', '- W', '- D', '- D', '- Q', '- K', '- V', '- E', '- D', '- Q', '- K', '- A', '- M', '- Q', '- K', '- L', '- Q', '- D', '- K', '- Y', '- N', '- A', '- L', '- Q', '- R', '- N', '- H', '- S', '- Y', '- E', '- V', '- G', '- L', '- L', '- T', '- S', '- R', '- L', '- Q', '- T', '- A', '- E', '- S', '- N', '- Y', '- A', '- S', '- E', '- L', '- A', '- R', '- V', '- S', '- S', '- D', '- Y', '- D', '- S', '- R', '- M', '- Q', '- Q', '- S', '- E', '- R', '- R', '- A', '- S', '- V', '- Y', '- Q', '- A', '- E', '- A', '+ K', '+ F', '+ F', '+ D', '- F', '- E', '- C', '- R', '- S', '- L', '- A', '- S', '- H', '- A', '- A', '- R', '- L', '- D', '- N', '- S', '- L', '- E', '- E', '- G', '- R

In [1322]:
'MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEELRATVRLRDSQLIELGKQIQADRKLFEQE' == 'MQQSERRASVYKRQAEAGTFECRSLASHAARLDNSLEEGRRLVEELRATVRLRDSQLIELGKQIQADRKLFEQE'

True

In [1323]:
data2['verified'][1]

True

In [1324]:
data2['strand'][0] == 'NEGATIVE'

True