-
Notifications
You must be signed in to change notification settings - Fork 0
/
LBB_transcript.py
78 lines (66 loc) · 2.6 KB
/
LBB_transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#this script uses blastx and protein file
# outputs statistics:
# Total number of basepairs
# Number of transcripts
# Number of "unigenes"
# Mean and median transcript length
# N50 explanation here
# calculates the # of transcripts that have blast hits
from Bio import SeqIO
import statistics as stat
import pandas as pd
import argparse
def get_args():
parser = argparse.ArgumentParser(description="this parses transcripts", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-in', '--infile', type=str, help='Insert the fasta pep file', required=True)
parser.add_argument('-blast', '--blastx', type=str, help='Insert the blastx file', required=True)
args = parser.parse_args()
INFILE = args.infile
BLASTX = args.blastx
return INFILE, BLASTX
INFILE, BLASTX = get_args()
ID_LIS = []
LEN_LIS = []
IDLIS = []
with open('stats.txt', 'w') as STATS:
for RECORD in SeqIO.parse(INFILE, 'fasta'):
LEN_LIS.append(len(RECORD.seq))
#THIS CALCULATES THE TOTAL BP LENGTH
print('length = ' + str(sum(LEN_LIS)) + '.')
STATS.write('full lengths = ' + str(sum(LEN_LIS)) + '.\n')
#THIS CALCULATES THE LENGTH OF THE TRANSCRIPT
print ('transcripts = ' + str(len(LEN_LIS)) + '.')
STATS.write('transcripts = ' + str(len(LEN_LIS)) + '.\n')
#THIS CALCULATES THE MEAN OF THE LENGTH OF THE TRANSCRIPTS
print('mean transcripts length = ' + str(stat.mean(LEN_LIS)) + '.')
STATS.write('mean transcritps length = ' + str(stat.mean(LEN_LIS)) + '.\n')
#THIS CALCULATES THE N50, THIS WAS COPIED FROM DARAY'S GITHUB
LENLIS = sorted(LEN_LIS)
HALF = sum(LEN_LIS)/2
TOTALENTRIES = len(LEN_LIS)
for ENTRY in range(1, TOTALENTRIES):
RUNNINGTOTAL = sum(LEN_LIS[0:ENTRY])
if RUNNINGTOTAL >= HALF:
print('N50 = ' + str(LEN_LIS[ENTRY-1]) + '.')
STATS.write('N50 = ' + str(LEN_LIS[ENTRY-1]) + '.\n')
break
for RECORD in SeqIO.parse(INFILE, 'fasta'):
ID_LIS.append(RECORD.id.rsplit('_',1))
###
##THIS CALCULATES THE UNIGENES FROM THE FILES
###
IDS = pd.DataFrame(ID_LIS, columns = ['uni', 'iso'])
COUNTUNI = len(IDS['uni'].unique())
print('unigenes = ' + str(COUNTUNI))
STATS.write('unigenes ' + str(COUNTUNI) + '.\n')
####
##PART TWO OF THE HW. THE IDEA HERE IS TO COUNT THE SEQS THAT HAVE HITS IN THE BLAST
###
#reading the file as a table and use the columns to count the hits
TRANSCRIPT_HITS = pd.read_table(BLASTX, sep='\t', usecols=[1,2], names=['I.D', 'unique'])
HITS = len(TRANSCRIPT_HITS.ID.unique())
print('UNIQUE HITS = ' + str(HITS) + '.')
#####
##PART THREE, NUMBER OF UNIQUE PROTEINS IN THE REFERENCE PROTEOME WITH BLAST HIT
#####
##count the number of times there is a hit in the blast file