In [None]:
#!/usr/bin/python

In [None]:
import sys
import numpy as np
import argparse

In [None]:
def assemblystats(lines, xmax=10000, step=100):
    """Return summary statistics for an assembly from "count length" lines.

    Each line must hold two whitespace-separated integers: the number of
    contigs followed by their length (the format produced by ``uniq -c``).

    Parameters
    ----------
    lines : iterable of str
        Input lines, one "count length" pair per line.
    xmax : int
        Largest contig length covered by the length distribution.
    step : int
        Bin width of the length distribution.

    Returns
    -------
    tuple
        ``(assemblysize, N50, ncontigs, largestcontig, smallestcontig,
        mediancontig, sizes)`` where

        * ``assemblysize`` is the sum of ``length * count`` over all lines,
        * ``N50`` is the length at which the cumulative number of bases,
          counted from the longest contig down, first reaches half of
          ``assemblysize``,
        * ``ncontigs`` is the sum of all counts,
        * ``mediancontig`` is the length of the contig at position
          ``ncontigs // 2`` when all contigs are ordered longest first,
        * ``sizes`` is an int64 array with ``xmax // step + 1`` entries;
          ``sizes[i]`` is the number of bases in contigs whose length is
          at least ``i * step``.

    Raises
    ------
    ValueError
        If a line cannot be parsed as two integers, or if ``lines`` is empty.
    """

    lengthcounts = list()

    for line in lines:
        try:
            count, length = line.split()
            count, length = int(count), int(length)

        except ValueError:
            message = 'Cant parse line as "int, int": ' + line
            raise ValueError(message) from None

        lengthcounts.append((length, count))

    if not lengthcounts:
        # Without this guard the indexing below fails with a bare IndexError.
        raise ValueError('No contig lengths given, cannot compute statistics.')

    # Longest contigs first: N50 and the median are found by walking down.
    lengthcounts.sort(reverse=True)

    # np.int was removed from NumPy; a fixed 64-bit type also avoids
    # platform-dependent overflow of the base counts.
    sizes = np.zeros((xmax//step + 1), dtype=np.int64)

    assemblysize = sum(length * count for length, count in lengthcounts)
    ncontigs = sum(count for length, count in lengthcounts)
    largestcontig = lengthcounts[0][0]
    smallestcontig = lengthcounts[-1][0]
    mediancontig = None
    N50 = None

    medianrank = ncontigs // 2  # 0-based rank of the median contig
    contigsseen = 0
    basesseen = 0

    # Length distribution, N50 and count-weighted median in a single pass
    for length, count in lengthcounts:
        # Slicing clips at the array end, so contigs longer than xmax
        # simply contribute to every bin.
        sizes[:(length // step) + 1] += length * count

        basesseen += length * count
        if N50 is None and basesseen >= assemblysize / 2:
            N50 = length

        # Each line stands for `count` contigs, so the median has to be
        # located by cumulative contig count, not by line index.
        contigsseen += count
        if mediancontig is None and contigsseen > medianrank:
            mediancontig = length

    return assemblysize, N50, ncontigs, largestcontig, smallestcontig, mediancontig, sizes

In [3]:
if __name__ == '__main__':
    # Command-line entry point: reads "count length" lines from stdin and
    # prints summary statistics followed by the length distribution.
    # The usage text shows pipelines that produce the expected input;
    # FASTA header lines start with '>', hence the "^>" pattern.
    usage = """SPADES assembly: grep "^>" contigs.fna | cut -d _ -f 5 | sort -nr | uniq -c | assembly.py -m MAX -s STEP
       MEGAHIT assembly: grep "^>" contigs.fna | cut -d = -f 4 | sort -nr | uniq -c | assembly.py -m MAX -s STEP
    """

    # Parse input
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=usage)

    # Optional arguments controlling the length distribution
    parser.add_argument('-m', type=int, default=10000, help='max contig size',
                        dest='maxsize', metavar='maxsize')
    parser.add_argument('-s', type=int, default=100, help='contig step size',
                        dest='step', metavar='step')

    # Parse and validate the options before touching stdin, so that -h and
    # bad options are reported without first reading the whole input.
    args = parser.parse_args()

    if args.step > args.maxsize:
        raise ValueError('Step must not be greater than maxsize')

    if args.step < 1 or args.maxsize < 1:
        raise ValueError('Step and maxsize must be positive integers')

    lines = list()
    for nline, line in enumerate(sys.stdin):
        # Guard against being fed something other than "uniq -c" output.
        if nline == 100000:
            raise ValueError('More than 100k lines, wrong input, exiting.')
        lines.append(line)

    # No input at all: show how the script is meant to be used.
    if len(lines) == 0:
        parser.print_help()
        sys.exit()

    (assemblysize, N50, ncontigs, largestcontig, smallestcontig, mediancontig,
    sizes) = assemblystats(lines, args.maxsize, args.step)

    print('Size:', assemblysize, sep='\t')
    print('N50:', N50, sep='\t')
    print('Number of contigs:', ncontigs, sep='\t')
    print('Smallest contig:', smallestcontig, sep='\t')
    print('Largest contig:', largestcontig, sep='\t')
    print('Median contig:', mediancontig, sep='\t', end='\n\n')

    # Iterate over the bins that actually exist. Deriving the row count from
    # range(0, maxsize + step, step) yields one row too many whenever maxsize
    # is not a multiple of step, which indexed past the end of sizes.
    for i, total in enumerate(sizes):
        print(i * args.step, total, sep='\t')

NameError: name 'argparse' is not defined