In [1]:
from __future__ import print_function
import re, gzip, time, itertools, io
import sys
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import Bio
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas_bokeh
from bokeh.io import output_file, show
pandas_bokeh.output_notebook()

In [2]:
# Parallel accumulators, one entry per FASTA record, all kept in file order.
identifiers, lengths, sequences, annotation = [], [], [], []

# The context manager closes the handle even if parsing fails part-way through.
with open('Homo_sapiens.GRCh38.pep.all.fa') as fasta_file:
    for header, residues in SimpleFastaParser(fasta_file):
        annotation.append(header)                      # full header line, parsed further later on
        identifiers.append(header.split(None, 1)[0])   # first whitespace-delimited token is the ID
        sequences.append(residues)
        lengths.append(len(residues))

In [3]:
# Assemble the parsed FASTA fields into a single DataFrame, one row per record.
# The dict's insertion order fixes the column order: ID, seq_length, sequence, annotation.
# The frame keeps pandas' default integer index; nothing is promoted to the index here.
fasta_columns = {
    "ID": identifiers,
    "seq_length": lengths,
    "sequence": sequences,
    "annotation": annotation,
}
df = pd.DataFrame(fasta_columns)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116646 entries, 0 to 116645
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ID          116646 non-null  object
 1   seq_length  116646 non-null  int64 
 2   sequence    116646 non-null  object
 3   annotation  116646 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.6+ MB


Unnamed: 0,ID,seq_length,sequence,annotation
0,ENSP00000488240.1,4,GTGG,ENSP00000488240.1 pep chromosome:GRCh38:CHR_HS...
1,ENSP00000451042.1,2,EI,ENSP00000451042.1 pep chromosome:GRCh38:14:224...
2,ENSP00000452494.1,4,TGGY,ENSP00000452494.1 pep chromosome:GRCh38:14:224...
3,ENSP00000451515.1,3,PSY,ENSP00000451515.1 pep chromosome:GRCh38:14:224...
4,ENSP00000487941.1,4,GTGG,ENSP00000487941.1 pep chromosome:GRCh38:7:1427...


In [4]:
# Pattern for one peptide FASTA header line. Each named group becomes a column
# when the pattern is applied with the pandas `.str` extract methods.
# Every field is mandatory: a header that lacks one of them, or whose coordinate
# block does not start with the literal `chromosome:`, does not match at all.
# Adjacent raw-string literals are joined by the parser into a single pattern.
p = re.compile(
    r'(?P<pep_ID>[^\s]+)\s+'                                    # peptide ID = first token
    r'(?P<seqtype>[^:\s]+):?.'                                  # sequence type, optional ':' and one separator char
    r'chromosome:(?P<version>[^:]+):'                           # assembly/version field
    r'(?P<chromosome>[^:]+):'                                   # region name
    r'(?P<start>[^:]+):'                                        # start coordinate (captured as text)
    r'(?P<end>[^:]+):'                                          # end coordinate (captured as text)
    r'(?P<strand>[^\s]+)\s+'                                    # strand, up to the next whitespace
    r'gene:(?P<gene_ID>[^\s]+)\s+'
    r'transcript:(?P<transcript_ID>[^\s]+)\s*'
    r'gene_biotype:(?P<gene_biotype>[^\s]+)\s*'
    r'transcript_biotype:(?P<transcript_biotype>[^\s]+)\s*'
    r'gene_symbol:(?P<gene_symbol>[^\s]+)\s*'
    r'description:(?P<description>[^[]+)\s'                     # free text up to the whitespace before '['
    r'\[Source:(?P<source>[^;]+);*'                             # source label inside the bracket
    r'Acc:(?P<accession>[^]]+)\]*'                              # accession, up to the closing ']'
)

In [5]:
# Split each FASTA header into one column per named group of `p`.
# `str.extract` returns exactly one row per input row (first match only, NaN
# where the header does not match), so its index always lines up with `df`.
# The previous `extractall(...).reset_index('match', drop=True)` produced a
# duplicated index as soon as any header matched more than once, which breaks
# the column-wise concat.
annotation_fields = df["annotation"].str.extract(p)

# Headers that did not match end up with empty strings in every parsed column.
df = pd.concat([df, annotation_fields], axis=1).fillna('')
df.head()

Unnamed: 0,ID,seq_length,sequence,annotation,pep_ID,seqtype,version,chromosome,start,end,strand,gene_ID,transcript_ID,gene_biotype,transcript_biotype,gene_symbol,description,source,accession
0,ENSP00000488240.1,4,GTGG,ENSP00000488240.1 pep chromosome:GRCh38:CHR_HS...,ENSP00000488240.1,pep,GRCh38,CHR_HSCHR7_2_CTG6,142847306,142847317,1,ENSG00000282253.1,ENST00000631435.1,TR_D_gene,TR_D_gene,TRBD1,T cell receptor beta diversity 1,HGNC Symbol,HGNC:12158
1,ENSP00000451042.1,2,EI,ENSP00000451042.1 pep chromosome:GRCh38:14:224...,ENSP00000451042.1,pep,GRCh38,14,22438547,22438554,1,ENSG00000223997.1,ENST00000415118.1,TR_D_gene,TR_D_gene,TRDD1,T cell receptor delta diversity 1,HGNC Symbol,HGNC:12254
2,ENSP00000452494.1,4,TGGY,ENSP00000452494.1 pep chromosome:GRCh38:14:224...,ENSP00000452494.1,pep,GRCh38,14,22449113,22449125,1,ENSG00000228985.1,ENST00000448914.1,TR_D_gene,TR_D_gene,TRDD3,T cell receptor delta diversity 3,HGNC Symbol,HGNC:12256
3,ENSP00000451515.1,3,PSY,ENSP00000451515.1 pep chromosome:GRCh38:14:224...,ENSP00000451515.1,pep,GRCh38,14,22439007,22439015,1,ENSG00000237235.2,ENST00000434970.2,TR_D_gene,TR_D_gene,TRDD2,T cell receptor delta diversity 2,HGNC Symbol,HGNC:12255
4,ENSP00000487941.1,4,GTGG,ENSP00000487941.1 pep chromosome:GRCh38:7:1427...,ENSP00000487941.1,pep,GRCh38,7,142786213,142786224,1,ENSG00000282431.1,ENST00000632684.1,TR_D_gene,TR_D_gene,TRBD1,T cell receptor beta diversity 1,HGNC Symbol,HGNC:12158


In [6]:
# Keep only peptides whose transcript AND gene are both annotated as protein_coding.
# Rows whose header failed to parse carry '' in these columns and are dropped here too.
is_protein_coding = (
    (df["transcript_biotype"] == "protein_coding")
    & (df["gene_biotype"] == "protein_coding")
)
df = df[is_protein_coding]

# The raw header is no longer needed once its fields are parsed into columns.
# `columns=` replaces the positional `axis` argument (`drop('annotation', 1)`),
# which is deprecated in pandas 1.3 and rejected by pandas 2.0.
df = df.drop(columns="annotation")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95092 entries, 762 to 116645
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  95092 non-null  object
 1   seq_length          95092 non-null  int64 
 2   sequence            95092 non-null  object
 3   pep_ID              95092 non-null  object
 4   seqtype             95092 non-null  object
 5   version             95092 non-null  object
 6   chromosome          95092 non-null  object
 7   start               95092 non-null  object
 8   end                 95092 non-null  object
 9   strand              95092 non-null  object
 10  gene_ID             95092 non-null  object
 11  transcript_ID       95092 non-null  object
 12  gene_biotype        95092 non-null  object
 13  transcript_biotype  95092 non-null  object
 14  gene_symbol         95092 non-null  object
 15  description         95092 non-null  object
 16  source             

In [7]:
# Load the per-peptide flag table from the working directory.
# Per the recorded output it has an `ID` column plus `GENCODE_basic` and
# `Ensembl_Canonical`, both of which are mostly null.
df2 = pd.read_csv("pep_canonical.csv")
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116646 entries, 0 to 116645
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ID                 116646 non-null  object
 1   GENCODE_basic      68747 non-null   object
 2   Ensembl_Canonical  23477 non-null   object
dtypes: object(3)
memory usage: 2.7+ MB


In [8]:
# Attach the flag columns from df2 to every peptide row; a left join keeps all rows of df.
df = df.merge(df2, on="ID", how="left")

In [9]:
# Discard peptides that have no GENCODE_basic value.
df = df.loc[df["GENCODE_basic"].notna()]

In [10]:
# Summarise the frame after the merge and the GENCODE_basic filter (row count, columns, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66823 entries, 0 to 95091
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  66823 non-null  object
 1   seq_length          66823 non-null  int64 
 2   sequence            66823 non-null  object
 3   pep_ID              66823 non-null  object
 4   seqtype             66823 non-null  object
 5   version             66823 non-null  object
 6   chromosome          66823 non-null  object
 7   start               66823 non-null  object
 8   end                 66823 non-null  object
 9   strand              66823 non-null  object
 10  gene_ID             66823 non-null  object
 11  transcript_ID       66823 non-null  object
 12  gene_biotype        66823 non-null  object
 13  transcript_biotype  66823 non-null  object
 14  gene_symbol         66823 non-null  object
 15  description         66823 non-null  object
 16  source              66

In [11]:
# Add a boolean `duplicate` column: False for the first row of each gene_ID, True for every later row.
df = df.assign(duplicate=df["gene_ID"].duplicated())

In [12]:
# Show the last rows of the frame, including the newly added `duplicate` column.
df.tail()

Unnamed: 0,ID,seq_length,sequence,pep_ID,seqtype,version,chromosome,start,end,strand,...,transcript_ID,gene_biotype,transcript_biotype,gene_symbol,description,source,accession,GENCODE_basic,Ensembl_Canonical,duplicate
95085,ENSP00000362196.1,783,MARLGNCSLTWAALIILLLPGSLEECGHISVSAPIVHLGDPITASC...,ENSP00000362196.1,pep,GRCh38,1,36466043,36483278,-1,...,ENST00000373104.5,protein_coding,protein_coding,CSF3R,colony stimulating factor 3 receptor,HGNC Symbol,HGNC:2439,GENCODE basic,,True
95086,ENSP00000355406.4,836,MARLGNCSLTWAALIILLLPGSLEECGHISVSAPIVHLGDPITASC...,ENSP00000355406.4,pep,GRCh38,1,36466044,36479519,-1,...,ENST00000361632.8,protein_coding,protein_coding,CSF3R,colony stimulating factor 3 receptor,HGNC Symbol,HGNC:2439,GENCODE basic,,True
95088,ENSP00000332180.5,783,MARLGNCSLTWAALIILLLPGSLEECGHISVSAPIVHLGDPITASC...,ENSP00000332180.5,pep,GRCh38,1,36466096,36479519,-1,...,ENST00000331941.6,protein_coding,protein_coding,CSF3R,colony stimulating factor 3 receptor,HGNC Symbol,HGNC:2439,GENCODE basic,,True
95090,ENSP00000507013.1,165,MDAPRRDMELLSNSLAAYAHIRANPESFGLYFVLGVCFGLLLTLCL...,ENSP00000507013.1,pep,GRCh38,1,36322030,36323645,-1,...,ENST00000490466.2,protein_coding,protein_coding,EVA1B,eva-1 homolog B,HGNC Symbol,HGNC:25558,GENCODE basic,True,False
95091,ENSP00000270824.1,165,MDAPRRDMELLSNSLAAYAHIRANPESFGLYFVLGVCFGLLLTLCL...,ENSP00000270824.1,pep,GRCh38,1,36322031,36324154,-1,...,ENST00000270824.1,protein_coding,protein_coding,EVA1B,eva-1 homolog B,HGNC Symbol,HGNC:25558,GENCODE basic,,True


In [13]:
# Count the non-null values of Ensembl_Canonical (value_counts leaves NaN out by default).
df.Ensembl_Canonical.value_counts()

True    21793
Name: Ensembl_Canonical, dtype: int64

In [14]:
# Tally rows that repeat an earlier gene_ID (True) against first occurrences (False).
df.duplicate.value_counts()

True     44912
False    21911
Name: duplicate, dtype: int64

In [15]:
# Partition the peptides by the Ensembl_Canonical flag.
# The column is object-typed with missing values, so compare against True
# explicitly; missing entries compare unequal and land in dfD.
canonical_mask = df["Ensembl_Canonical"].eq(True)
dfC = df[canonical_mask]     # flagged canonical
dfD = df[~canonical_mask]    # everything else

In [16]:
# Summarise the canonical subset (row count, columns, dtypes).
dfC.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21793 entries, 0 to 95090
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  21793 non-null  object
 1   seq_length          21793 non-null  int64 
 2   sequence            21793 non-null  object
 3   pep_ID              21793 non-null  object
 4   seqtype             21793 non-null  object
 5   version             21793 non-null  object
 6   chromosome          21793 non-null  object
 7   start               21793 non-null  object
 8   end                 21793 non-null  object
 9   strand              21793 non-null  object
 10  gene_ID             21793 non-null  object
 11  transcript_ID       21793 non-null  object
 12  gene_biotype        21793 non-null  object
 13  transcript_biotype  21793 non-null  object
 14  gene_symbol         21793 non-null  object
 15  description         21793 non-null  object
 16  source              21

In [17]:
# Summarise the non-canonical subset (row count, columns, dtypes).
dfD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45030 entries, 14 to 95091
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  45030 non-null  object
 1   seq_length          45030 non-null  int64 
 2   sequence            45030 non-null  object
 3   pep_ID              45030 non-null  object
 4   seqtype             45030 non-null  object
 5   version             45030 non-null  object
 6   chromosome          45030 non-null  object
 7   start               45030 non-null  object
 8   end                 45030 non-null  object
 9   strand              45030 non-null  object
 10  gene_ID             45030 non-null  object
 11  transcript_ID       45030 non-null  object
 12  gene_biotype        45030 non-null  object
 13  transcript_biotype  45030 non-null  object
 14  gene_symbol         45030 non-null  object
 15  description         45030 non-null  object
 16  source              4

In [18]:
# Among non-canonical rows, count how many were flagged as repeats of an earlier gene_ID.
dfD.duplicate.value_counts()

True     37847
False     7183
Name: duplicate, dtype: int64

In [19]:
# Keep only the non-canonical rows that were not flagged as a repeated gene_ID.
dfD = dfD[~dfD["duplicate"]]

In [20]:
# Stack the canonical rows (dfC) on top of the remaining non-canonical rows (dfD).
# The order matters: a later `gene_ID.duplicated()` call flags every occurrence after
# the first, so with dfC first it is the dfD row that gets flagged when a gene
# appears in both frames.
dfcomplete = pd.concat([dfC, dfD])

In [21]:
# Summarise the combined frame (row count, columns, dtypes).
dfcomplete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28976 entries, 0 to 95071
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  28976 non-null  object
 1   seq_length          28976 non-null  int64 
 2   sequence            28976 non-null  object
 3   pep_ID              28976 non-null  object
 4   seqtype             28976 non-null  object
 5   version             28976 non-null  object
 6   chromosome          28976 non-null  object
 7   start               28976 non-null  object
 8   end                 28976 non-null  object
 9   strand              28976 non-null  object
 10  gene_ID             28976 non-null  object
 11  transcript_ID       28976 non-null  object
 12  gene_biotype        28976 non-null  object
 13  transcript_biotype  28976 non-null  object
 14  gene_symbol         28976 non-null  object
 15  description         28976 non-null  object
 16  source              28

In [22]:
# Recompute the `duplicate` flag on the combined frame: True for every row whose
# gene_ID already appeared higher up in dfcomplete.
dfcomplete = dfcomplete.assign(duplicate=dfcomplete["gene_ID"].duplicated())

In [23]:
# Count first occurrences (False) against repeated gene_IDs (True) in the combined frame.
dfcomplete.duplicate.value_counts()

False    21911
True      7065
Name: duplicate, dtype: int64

In [24]:
# Retain a row if it is flagged canonical, or if it is the first row seen for its gene_ID.
is_canonical = dfcomplete["Ensembl_Canonical"].eq(True)
is_first_for_gene = ~dfcomplete["duplicate"]
dfcomplete = dfcomplete[is_canonical | is_first_for_gene]
dfcomplete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21911 entries, 0 to 94197
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  21911 non-null  object
 1   seq_length          21911 non-null  int64 
 2   sequence            21911 non-null  object
 3   pep_ID              21911 non-null  object
 4   seqtype             21911 non-null  object
 5   version             21911 non-null  object
 6   chromosome          21911 non-null  object
 7   start               21911 non-null  object
 8   end                 21911 non-null  object
 9   strand              21911 non-null  object
 10  gene_ID             21911 non-null  object
 11  transcript_ID       21911 non-null  object
 12  gene_biotype        21911 non-null  object
 13  transcript_biotype  21911 non-null  object
 14  gene_symbol         21911 non-null  object
 15  description         21911 non-null  object
 16  source              21

In [31]:
# Allowed region names: "1" through "22" plus "X" and "Y". Any other value in the
# `chromosome` column is filtered out.
chrlist = [str(number) for number in range(1, 23)] + ["X", "Y"]
on_listed_chromosome = dfcomplete["chromosome"].isin(chrlist)
dfcomplete = dfcomplete[on_listed_chromosome]

In [32]:
# Save the filtered annotation table as CSV; index=False leaves the row index out of the file.
# NOTE(review): the file name says 'HG30', while the FASTA written by this notebook is
# named 'HG38_...' — possibly a typo. Confirm with downstream consumers before renaming.
dfcomplete.to_csv('HG30_pep_annotation.csv', index=False)

In [33]:
output_path = 'HG38_pep_unique.fasta'

# Write one two-line FASTA record per retained peptide:
#   ><ID>\t<gene_symbol>
#   <sequence>
# The `with` block closes the file even if a write fails. The handle is
# deliberately not called `output_file`, because that name is imported from
# bokeh.io at the top of the notebook and rebinding it would shadow that function.
# Zipping the three columns walks the rows in order without per-row label lookups,
# so the loop does not depend on the index labels being unique.
with open(output_path, 'w') as fasta_out:
    for seq_id, gene_name, seq in zip(
        dfcomplete['ID'], dfcomplete['gene_symbol'], dfcomplete['sequence']
    ):
        fasta_out.write(">" + seq_id + "\t" + gene_name + "\n")
        fasta_out.write(seq + "\n")