Skip to content

Commit

Permalink
Merge pull request #1 from deprekate/working
Browse files Browse the repository at this point in the history
Working
  • Loading branch information
deprekate committed Sep 3, 2019
2 parents 4e652e8 + f406a9a commit a405a49
Show file tree
Hide file tree
Showing 28 changed files with 1,053 additions and 10,579 deletions.
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,12 @@ ENV/

# Rope project settings
.ropeproject

# pycharm
.idea/

# Repeat Finder that we make
source/repeatFinder

# Test organism data
Test_Organism
133 changes: 67 additions & 66 deletions PhiSpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,65 +3,60 @@
import sys
import re
import subprocess
try:
import argparse
except ImportError:
sys.path.append(os.path.dirname(os.path.realpath(__file__))+'/argparse-1.2.1')
import argparse
import argparse


def call_phiSpy(organismPath, output_dir, trainingFlag, INSTALLATION_DIR, evaluateOnly, threshold_for_FN, phageWindowSize, quietMode, keep):

sys.path.append(INSTALLATION_DIR+'source/')
import makeTrain
import makeTest
import classification
import evaluation
import unknownFunction

if ( not evaluateOnly ):

sys.stderr.write("Running PhiSpy on " + organismPath + "\n")
if (not evaluateOnly):
if trainingFlag == -1:
print('Making Train Set... (need couple of minutes)')
my_make_train_flag = makeTrain.call_make_train_set(organismPath,output_dir,INSTALLATION_DIR)
my_make_train_flag = makeTrain.call_make_train_set(organismPath, output_dir, INSTALLATION_DIR)
exit()
if (quietMode == 0):
print('Making Test Set... (need couple of minutes)')

my_make_test_flag = makeTest.call_make_test_set(organismPath,output_dir,INSTALLATION_DIR)
my_make_test_flag = makeTest.call_make_test_set(organismPath, output_dir, INSTALLATION_DIR)
if (my_make_test_flag == 0):
print('The input organism is too small to predict prophages. Please consider large contig (having at least 40 genes) to use PhiSpy.')
return
if (quietMode == 0):
print('Start Classification Algorithm')
classification.call_classificaton(organismPath,output_dir,trainingFlag,INSTALLATION_DIR)

classification.call_classification(organismPath, output_dir, trainingFlag, phageWindowSize, INSTALLATION_DIR)
if (quietMode == 0):
print('Done with classification Algorithm')
classification.call_classificaton(organismPath,output_dir,trainingFlag,INSTALLATION_DIR)
if (quietMode == 0):
print('Done with classification Algorithm')

###### added in this version 2.2 #####
if (trainingFlag == 0):
if (quietMode == 0):
print('As training flag is zero, considering unknown functions')
unknownFunction.consider_unknown(output_dir)
######################################

if (quietMode == 0):
print('Start evaluation...')
evaluation.call_start_end_fix(output_dir,organismPath,INSTALLATION_DIR,threshold_for_FN, phageWindowSize)
evaluation.call_start_end_fix(output_dir, organismPath, INSTALLATION_DIR, threshold_for_FN, phageWindowSize)
if (quietMode == 0):
print('Done!!!')

def print_list(INSTALLATION_DIR):
printstr = ''
try:
f = open(INSTALLATION_DIR+"data/trainingGenome_list.txt","r")
f = open(INSTALLATION_DIR + "data/trainingGenome_list.txt", "r")
except:
print('cannot find list')
for line in f:
line = line.strip()
temp = re.split('\t',line)
temp = re.split('\t', line)
if int(temp[3]) == 1:
printstr = printstr +temp[0]+' '+temp[2]+'\n'
printstr = printstr + temp[0] + ' ' + temp[2] + '\n'
print(printstr)
f.close()

Expand All @@ -70,99 +65,105 @@ def start_propgram(argv):
subprocess.call("type Rscript", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
except OSError:
sys.exit("The R programming language is not installed")

INSTALLATION_DIR = argv[0]
if '/' in argv[0]:
INSTALLATION_DIR = INSTALLATION_DIR[0:len(INSTALLATION_DIR)-INSTALLATION_DIR[::-1].find('/')]
INSTALLATION_DIR = INSTALLATION_DIR[0:len(INSTALLATION_DIR) - INSTALLATION_DIR[::-1].find('/')]
else:
INSTALLATION_DIR = './'

args_parser = argparse.ArgumentParser(description="phiSpy is a program for identifying prophages from among microbial genome sequences", epilog="(c) 2008-2017 Sajia Akhter, Katelyn McNair, Rob Edwards, San Diego State University, San Diego, CA")
args_parser.add_argument('-m', '--make_training_data', type=bool, default=False, const=True, nargs='?', help='Create training data from a set of annotated genome files')
args_parser.add_argument('-t', '--training_set', default=0, type=int, help='Choose a training set from the list of training sets.')
args_parser.add_argument('-l', '--list', type=bool, default=False, const=True, nargs='?', help='List the available training sets and exit')
args_parser.add_argument('-c', '--choose', type=bool, default=False, const=True, nargs='?', help='Choose a training set from a list (overrides -t)')
args_parser.add_argument('-e', '--evaluate', type=bool, default=False, const=True, nargs='?', help='Run in evaluation mode -- does not generate new data, but reruns the evaluation')
args_parser.add_argument('-n', '--number', default=5, type=int, help='Number of consecutive genes in a region of window size that must be prophage genes to be called')
args_parser.add_argument('-w', '--window_size', default=30, type=int, help='Window size of consecutive genes to look through to find phages')
args_parser.add_argument('-i', '--input_dir', required=True, help='The input directory that holds the genome')
args_parser.add_argument('-o', '--output_dir', required=True, help='The output directory to write the results')
args_parser.add_argument('-qt', '--quiet', type=bool, default=False, const=True, nargs='?', help='Run in quiet mode')
args_parser.add_argument('-k', '--keep', type=bool, default=False, const=True, nargs='?', help='Do not delete temp files')

args_parser = argparse.ArgumentParser(
description="phiSpy is a program for identifying prophages from among microbial genome sequences",
epilog="(c) 2008-2018 Sajia Akhter, Katelyn McNair, Rob Edwards, San Diego State University, San Diego, CA")
args_parser.add_argument('-m', '--make_training_data', action='store_true',
help='Create training data from a set of annotated genome files')
args_parser.add_argument('-t', '--training_set', default=0, type=int,
help='Choose a training set from the list of training sets.')
args_parser.add_argument('-l', '--list', type=bool, default=False, const=True, nargs='?',
help='List the available training sets and exit')
args_parser.add_argument('-c', '--choose', type=bool, default=False, const=True, nargs='?',
help='Choose a training set from a list (overrides -t)')
args_parser.add_argument('-e', '--evaluate', type=bool, default=False, const=True, nargs='?',
help='Run in evaluation mode -- does not generate new data, but reruns the evaluation')
args_parser.add_argument('-n', '--number', default=5, type=int,
help='Number of consecutive genes in a region of window size that must be prophage genes to be called')
args_parser.add_argument('-w', '--window_size', default=30, type=int,
help='Window size of consecutive genes to look through to find phages')
args_parser.add_argument('-i', '--input_dir', help='The input directory that holds the genome')
args_parser.add_argument('-o', '--output_dir', help='The output directory to write the results')
args_parser.add_argument('-qt', '--quiet', type=bool, default=False, const=True, nargs='?',
help='Run in quiet mode')
args_parser.add_argument('-k', '--keep', type=bool, default=False, const=True, nargs='?',
help='Do not delete temp files')
args_parser = args_parser.parse_args()

if (args_parser.list):
print_list(INSTALLATION_DIR)
sys.exit(0)

if not args_parser.input_dir and not args_parser.output_dir:
print(sys.argv[0] + " [-h for help] [-l to list training sets] OPTIONS")
print("Input and output directories are required")
sys.exit(0)
output_dir = args_parser.output_dir
organismPath = args_parser.input_dir
trainingFlag = args_parser.training_set


output_dir = output_dir.strip()
if output_dir[len(output_dir)-1]!='/':
output_dir = output_dir+'/'
if output_dir[len(output_dir) - 1] != '/':
output_dir = output_dir + '/'
try:
f = open(output_dir+'testing.txt','w')
f = open(output_dir + 'testing.txt', 'w')
except:
try:
os.makedirs(output_dir)
f = open(output_dir+'testing.txt','w')
f = open(output_dir + 'testing.txt', 'w')
except:
print("Cannot create the output directory or write file in the output directory",output_dir)
print("Cannot create the output directory or write file in the output directory", output_dir)
return
f.close()
os.system("rm "+output_dir+'testing.txt')

os.system("rm " + output_dir + 'testing.txt')
organismPath = organismPath.strip()
if organismPath[len(organismPath)-1]=='/':
organismPath = organismPath[0:len(organismPath)-1]


if organismPath[len(organismPath) - 1] == '/':
organismPath = organismPath[0:len(organismPath) - 1]
# this is a hack to add in ability to make train set
if args_parser.make_training_data:
call_phiSpy(organismPath,output_dir,-1,INSTALLATION_DIR,args_parser.evaluate, args_parser.number, args_parser.window_size,args_parser.quiet,args_parser.keep)
exit()

#
try:
f_dna = open(organismPath+'/contigs','r')
f_dna = open(organismPath + '/contigs', 'r')
f_dna.close()
except:
print("Cannot open",organismPath+'/contigs')
print("Cannot open", organismPath + '/contigs')
return
try:
f = open(organismPath+'/Features/peg/tbl','r')
f = open(organismPath + '/Features/peg/tbl', 'r')
f.close()
except:
print("Cannot open",organismPath+'/Features/peg/tbl')
print("Cannot open", organismPath + '/Features/peg/tbl')
return
try:
f = open(organismPath+'/assigned_functions','r')
f = open(organismPath + '/assigned_functions', 'r')
f.close()
except:
print("Cannot open",organismPath+'/assigned_functions')
#return
print("Cannot open", organismPath + '/assigned_functions')
# return
try:
f = open(organismPath+'/Features/rna/tbl','r')
f = open(organismPath + '/Features/rna/tbl', 'r')
f.close()
except:
print("Cannot open",organismPath+'/Features/rna/tbl')
#return

print("Cannot open", organismPath + '/Features/rna/tbl')
# return
if (args_parser.choose):
while (1):
print_list(INSTALLATION_DIR)
temp = raw_input("Please choose the number for a closely related organism we can use for training, or choose 0 if you don't know: ")
temp = raw_input(
"Please choose the number for a closely related organism we can use for training, or choose 0 if you don't know: ")
try:
trainingFlag = int(temp)
except:
continue
if trainingFlag<0 or trainingFlag>30:
if trainingFlag < 0 or trainingFlag > 30:
continue
break
print('')
call_phiSpy(organismPath,output_dir,trainingFlag,INSTALLATION_DIR,args_parser.evaluate, args_parser.number, args_parser.window_size,args_parser.quiet,args_parser.keep)
call_phiSpy(organismPath, output_dir, trainingFlag, INSTALLATION_DIR, args_parser.evaluate, args_parser.number,
args_parser.window_size, args_parser.quiet, args_parser.keep)

start_propgram(sys.argv)
58 changes: 50 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@ iii. assigned_functions file: organism_directory/assigned_functions or organism_
iv. tbl file for rna: organism_directory/Features/rna/tbl


# REQUIRED INPUT OPTION
_Note:_
The assigned functions file may not be in the RAST genome directory. You can create it from proposed_functions and proposed_non_ff_functions or you can use [this perl script](/home/redwards/Dropbox/GitHubs/EdwardsLab/RAST/make_assigned_functions.pl) to create an assigned_functions file for you.

# REQUIRED INPUT OPTIONS

The program will take 1 command line input.

Expand All @@ -105,15 +108,54 @@ For the help menu use the -h option:

# OUTPUT FILES

There are 2 output files, located in output directory.
There are 3 output files, located in output directory.

1. prophage.tbl: This file has two columns separated by tabs [id, location].
The id should be in the format: fig|genomeid.pp.number, where genome id is the id of the genome that is being processed,
and number is a sequential number of the prophage (starting at 1).
Location should be in the format: contig_start_stop that encompasses the prophage.
The id is in the format: pp_number, where number is a sequential number of the prophage (starting at 1).
Location is be in the format: contig_start_stop that encompasses the prophage.

2. prophage_tbl.txt: This is a tab seperated file. The file contains all the genes of the genome. The tenth colum represents the status of a gene. If this column is 1 then the gene is a phage like gene; otherwise it is a bacterial gene.
This file has 16 columns:(i) fig_no: the id of each gene; (ii) function: function of the gene; (iii) contig; (iv) start: start location of the gene; (v) stop: end location of the gene; (vi) position: a sequential number of the gene (starting at 1); (vii) rank: rank of each gene provided by random forest; (viii) my_status: status of each gene based on random forest; (ix) pp: classification of each gene based on their function; (x) Final_status: the status of each gene, if this column is 1 then the gene is a phage like gene, otherwise it is a bacterial gene; (xi) start of attL; (xii) end of attL; (xiii) start of attR; (xiv) end of attR; (xv) sequence of attL; (xvi) sequence of attR.
2. prophage_tbl.tsv: This is a tab seperated file. The file contains all the genes of the genome. The tenth colum represents the status of a gene. If this column is 1 then the gene is a phage like gene; otherwise it is a bacterial gene.

This file has 16 columns:(i) fig_no: the id of each gene; (ii) function: function of the gene; (iii) contig; (iv) start: start location of the gene; (v) stop: end location of the gene; (vi) position: a sequential number of the gene (starting at 1); (vii) rank: rank of each gene provided by random forest; (viii) my_status: status of each gene based on random forest; (ix) pp: classification of each gene based on their function; (x) Final_status: the status of each gene. For prophages, this column has the number of the prophage as listed in prophage.tbl above; If the column contains a 0 we believe that it is a bacterial gene. If we can detect the _att_ sites, the additional columns will be: (xi) start of _attL_; (xii) end of _attL_; (xiii) start of _attR_; (xiv) end of _attR_; (xv) sequence of _attL_; (xvi) sequence of _attR_.


3. prophage_coordinates.tsv: This file has the prophage ID, contig, start, stop, and potential _att_ sites identified for the phages.


# EXAMPLE DATA

We have provided two different example data sets.

* _Streptococcus pyogenes_ M1 GAS which has a single genome contig. The genome contains four prophages.

To analyse this data, you can use:

```
python2.7 PhiSpy.py -t 25 -i Test_Organism/160490.1/ -o Test_Organism/160490.1.output
```

And you should get a prophage table that has this information:

| Prophage number | Contig | Start | Stop |
| --- | --- | --- | --- |
pp_1 | NC_002737 | 529631 | 604720
pp_2 | NC_002737 | 778642 | 846824
pp_3 | NC_002737 | 1191309 | 1255536
pp_4 | NC_002737 | 1607352 | 1637214

* _Salmonella enterica_ serovar Enteritidis LK5

This is an early draft of the genome (the published sequence has a single contig), but this draft has 1,410 contigs and some phage like regions.

If you run PhiSpy on this draft genome with the default parameters you will not find any prophage because they are all filtered out for not having enough genes. By default, PhiSpy requires 30 genes in a prophage. You can alter that stringency on the command line, and for example reducing the phage gene window size to 10 results in 3 prophage regions being identified.

```
python2.7 PhiSpy.py -t 21 -w 10 -i Test_Organism/272989.13/ -o Test_Organism/272989.13.output
```

You should get a prophage table that has this information:

| Prophage number | Contig | Start | Stop |
| --- | --- | --- | --- |
pp_1 | Contig_2300_10.15 | 1630 | 10400
pp_2 | Contig_2294_10.15 | 175 | 11290
pp_3 | Contig_2077_10.15 | 318 | 12625
Binary file modified Test_Organism.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
PhiSpy 3.2
PhiSpy 3.3
20 changes: 0 additions & 20 deletions argparse-1.2.1/LICENSE.txt

This file was deleted.

15 changes: 0 additions & 15 deletions argparse-1.2.1/MANIFEST.in

This file was deleted.

16 changes: 0 additions & 16 deletions argparse-1.2.1/NEWS.txt

This file was deleted.

Loading

0 comments on commit a405a49

Please sign in to comment.