In [1]:
from __future__ import division
import numpy as np
import argparse
import vcf
import itertools
import sys
import multiprocessing
import logging
import seaborn as sns
import pandas as pd
from glob import glob
from tools.fileOps import *
from tools.procOps import *
from tools.bio import *
from phase_lib import *

In [2]:
def construct_feature_vector(df, cutoff, min_cover):
    """Collect the positions that pass both the coverage and ratio thresholds.

    Returns the set of ``loc`` values taken from the rows of ``df`` whose
    ``coverage`` is at least ``min_cover`` and whose ``ratio`` is at least
    ``cutoff``.
    """
    enough_coverage = df.coverage >= min_cover
    high_ratio = df.ratio >= cutoff
    passing = df[enough_coverage & high_ratio]
    return set(passing['loc'])

def convert_pileup(pileup_recs, pileup_converter):
    """Run the Perl pileup converter over pileup records and load its output.

    pileup_recs: iterable of pileup lines (strings without a trailing newline).
    pileup_converter: path to the Perl script, invoked as
        ``perl <script> <input file> 0 <output file>``.

    Returns whatever ``load_pileup`` produces for the converter's output file.
    """
    with TemporaryFilePath() as tmp, TemporaryFilePath() as tmp2:
        with open(tmp, 'w') as outf:
            for rec in pileup_recs:
                outf.write(rec + '\n')
        cmd = ['perl', pileup_converter, tmp, 0, tmp2]
        # The converter communicates through the tmp2 file, so the return value
        # of run_proc is not kept.
        # NOTE(review): stdout/stderr are discarded, so a failing converter will
        # only show up as an error while loading tmp2 -- confirm this is intended.
        run_proc(cmd, stderr='/dev/null', stdout='/dev/null')
        # Load inside the with-block, while the temporary paths are still in scope.
        return load_pileup(tmp2)


def load_pileup(pileup_path):
    """Load a whitespace-delimited table from ``pileup_path`` into a DataFrame.

    The first line of the file supplies the column names and every following
    line becomes one row. Values are left as strings; callers convert them.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(pileup_path) as handle:
        rows = [line.split() for line in handle]
    return pd.DataFrame(rows[1:], columns=rows[0])


def ref_count(s):
    """Return the value stored in row ``s`` under the label named by ``s.ref``."""
    ref_base = s.ref
    return s[ref_base]


bases = set('ATGC')
def alt_count(s):
    """Add up the values of row ``s`` for every base in ``bases`` except ``s.ref``."""
    total = 0
    for base in bases:
        if base != s.ref:
            total += s[base]
    return total


def make_pileup(bam):
    """Run ``samtools mpileup`` on ``bam`` and return the result of ``call_proc_lines``."""
    return call_proc_lines(['samtools', 'mpileup', bam])


def parse_converted_pileup(df, seq):
    """Turn a converted pileup table into per-position base counts and alt ratio.

    df: table with a ``loc`` column plus count columns ``A``/``a``, ``C``/``c``,
        ``G``/``g`` and ``T``/``t`` (values may be strings). The input frame is
        not modified.
    seq: indexable sequence; ``seq[loc - 1]`` supplies the reference base.

    Returns a new DataFrame with columns ``loc`` (shifted by -1), ``ref``,
    ``A``, ``C``, ``G``, ``T`` (upper- and lower-case counts summed),
    ``coverage``, ``ref_count``, ``alt_count`` and ``ratio``
    (``alt_count / (alt_count + ref_count)``). Rows whose reference base is
    not in ``bases`` are dropped.
    """
    # Work on a copy: the original assigned into the caller's frame, so running
    # the function twice on the same input shifted ``loc`` a second time.
    parsed = df.copy()
    # ``astype(int)`` works on both Python 2 and 3; ``np.array(map(int, ...))``
    # wraps a lazy map object on Python 3 and the subtraction then fails.
    parsed['loc'] = parsed['loc'].astype(int) - 1
    parsed['ref'] = [seq[i] for i in parsed['loc']]
    # Explicit copy so the column assignments below do not write into a slice.
    parsed = parsed[parsed['ref'].isin(bases)].copy()
    for base in ('A', 'C', 'G', 'T'):
        parsed[base] = pd.to_numeric(parsed[base]) + pd.to_numeric(parsed[base.lower()])
    parsed = parsed[['loc', 'ref', 'A', 'C', 'G', 'T']].copy()
    parsed['coverage'] = parsed[['A', 'T', 'G', 'C']].sum(axis=1)
    parsed['ref_count'] = parsed.apply(ref_count, axis=1)
    parsed['alt_count'] = parsed.apply(alt_count, axis=1)
    parsed['ratio'] = 1.0 * parsed.alt_count / (parsed.alt_count + parsed.ref_count)
    return parsed


In [3]:
# Input BAM that make_pileup hands to `samtools mpileup`.
bam = '/hive/users/cbosworth/imputation/H9/A1_c8_realigned_96kb.sorted.bam'
# NOTE(review): not used in the visible cells; presumably the source of the
# `seq` argument of parse_converted_pileup -- confirm.
consensus_fasta = '/hive/users/ifiddes/notch2nl_berkeley_data/imputation_pipeline/copy_number/consensus.fa'
# Perl script that is run on the pileup file (see convert_pileup).
pileup_converter = '/cluster/home/ifiddes/pileup2base/pileup2base.pl'
# NOTE(review): presumably the `cutoff` / `min_cover` thresholds for
# construct_feature_vector (ratio >= cutoff, coverage >= min_cover) -- confirm.
cutoff = 0.95
min_cover = 10

# Result of running `samtools mpileup` on the BAM; iterated line by line below.
pileup_recs = make_pileup(bam)

In [6]:
# Scratch version of convert_pileup's body, using fixed file names in the
# current working directory instead of TemporaryFilePath.
tmp = 'tmp'
tmp2 = 'tmp2'
# Write one pileup record per line to the converter's input file.
with open(tmp, 'w') as outf:
    for l in pileup_recs:
        outf.write(l + '\n')
    
# Same command line as convert_pileup: perl <script> <input> 0 <output>.
cmd = ['perl', pileup_converter, tmp, 0, tmp2]
# NOTE: run_proc returned None here (see the TypeError raised by `r[0]` in the
# next cell); convert_pileup reads the result from the tmp2 file instead.
r = run_proc(cmd, stderr='/dev/null', stdout='/dev/null')

In [7]:
# `r` is None because run_proc does not return the converter's output; indexing
# it raised "TypeError: 'NoneType' object has no attribute '__getitem__'".
# The converter writes its result to the tmp2 file, so inspect that instead.
load_pileup(tmp2).head()