In [1]:
from Bio import SeqIO
import re
import pandas as pd
from collections import Counter

gly = re.compile('N[A-O,Q-Z][S,T]') # Regular expression for PNGS
potential_glycosylation_sites = [87,
                                 269,
                                 162,
                                 127,
                                 160,
                                 155,
                                 125,
                                 54,
                                 73] # List of "known" PNGS sites, not strictly necessary. These positions are indexed such that position 1 is the first residue after the leader sequence.
new_rows = []

In [2]:
for s in SeqIO.parse('HA_01_01_1971__01_01_2000_Cambodia_China_HongKong_Thailand_Vietnam.fasta', 'fasta'):

    # Split the description string up to get collection date, year, and strain name
    collection_date = s.description.split('|')[2]
    year = collection_date.split('-')[0].split()[0]
    strain = s.description.split('|')[1] 
    #new_rows.append([strain, int(year), collection_date])
    
    # Variable to skip a sequence if the year of isolation is unknown, or if sequence isn't full-length
    skip = False
    
    if len(s) == 565: 
        # If sequence is 565 residues long, then we don't need to worry about the insertion
        leader_trunc = s[17:] # Remove the leader sequence
        has_deletion = True
        len_leader_trunc = len(leader_trunc)
    elif len(s) == 566: 
        # If it's 566 residues long, then we just remove the inserted residue. 
        #More robust to pre-align the sequences.
        HA_trunc = s[17:]
        leader_trunc = HA_trunc[0:130] + HA_trunc[130:344]
        has_deletion = False
        len_leader_trunc = len(leader_trunc)
    else: 
        # Skip if not full-length
        skip=True

    if year == 'unknown':
        # Skip if year is unknown
        skip =True
    
    if not skip:
        g = 0
        glycosylated_sites = []
        for m in gly.finditer(str(leader_trunc.seq)):
            # This loop iterates over every potential PNGS found by the regex
            pos = m.start() 
            # Keep if a "known" PNGS
            if pos + 1 in potential_glycosylation_sites: # Remember that python indexes from 0
                glycosylated_sites.append(str(pos+1))
        glycosylated_sites.sort()
        new_rows.append([strain, int(year), collection_date, ','.join(glycosylated_sites), has_deletion, len_leader_trunc])

df = pd.DataFrame(new_rows, columns=['Strain', 'Year', 'Collection date', 'Glycosylated sites', 'Deletion', 'Seq len'])
df

Unnamed: 0,Strain,Year,Collection date,Glycosylated sites,Deletion,Seq len
0,A/Shanghai/2/1997,1997,1997-09-01,1255487,True,548
1,A/Shanghai/2/1997,1997,1997-08-01,1255487,True,548
2,A/Shanghai/8/1996,1996,1996-08-01,1251602695487,False,344
3,A/Shengzhen/227/1995,1995,1995-08-01,1251605487,False,344
4,A/Nanchang/16A/1999,1999,1999-08-01,1255487,True,548
5,A/Beijing/262/1995,1995,1995-01-01,1255487,True,548
6,A/Tientsin/78/1977,1977,1977-01-01,12715516026987,False,344
7,A/Nanchang/14/1996,1996,1996-01-01,1255487,True,548
8,A/Nanchang/9/1996,1996,1996-01-01,1255487,True,548
9,A/Nanchang/25/1996,1996,1996-01-01,1255487,True,548
