In [1]:
#default_exp utils

In [32]:
#export
import os
import subprocess
import shutil
import socket
import pwd
import glob
from itertools import groupby
import gzip

import numpy as N
import pandas as PD

#from prefect import task, Flow, Parameter, unmapped, case
from prefect.engine import signals
#from prefect.tasks import Task

import paramiko

from dask_rnaseq.hdict import *

def mkdirs(pdir: str, mode=0o775) -> None:
    """Create directory ``pdir`` (and any missing parents) with permission ``mode``.

    Does nothing when ``pdir`` already exists. While the directories are being
    created the process umask is temporarily set to the complement of ``mode``
    so the requested permission bits are not masked away; the previous umask
    is always restored afterwards.

    Args:
        pdir: path of the directory to create.
        mode: permission bits passed to ``os.makedirs``.

    Raises:
        signals.FAIL: if the directory cannot be created (``OSError``).
    """
    if os.path.exists(pdir):
        return
    # Set the umask *before* entering the try block so `oumask` is always
    # bound when the finally clause restores it.
    oumask = os.umask(0o777 & ~mode)  # e.g. 0o002 for mode 0o775
    try:
        os.makedirs(pdir, exist_ok=True, mode=mode)
        print(f"Directory {pdir} created successfully")
    except OSError as error:
        # The signal has to be raised; merely constructing it hides the failure.
        raise signals.FAIL(message=f"Directory {pdir} can not be created") from error
    finally:
        os.umask(oumask)
        

def make_params(samples, base):
    """Build one parameter set per sample by merging it into the common ``base``.

    ``base`` is wrapped in an ``HBox`` when it is not one already. For each
    merged parameter set, an empty ``did`` is replaced by the last
    '/'-separated component of ``odir`` and an empty ``idir`` is replaced by
    the directory of ``r1``.

    Returns a list with one merged parameter object per entry of ``samples``.
    """
    if not isinstance(base, HBox):
        base = HBox(base)

    def _with_defaults(sample):
        merged = base.merge(sample)
        if merged.did == '':
            # fall back to the output directory's final path component
            merged.did = merged.odir.split('/')[-1]
        if merged.idir == '':
            # fall back to the directory holding read 1
            merged.idir = os.path.dirname(merged.r1)
        return merged

    return list(map(_with_defaults, samples))


#################### User/Host ####################################################
def get_username():
    """Return the login name recorded in the password database for the current uid."""
    entry = pwd.getpwuid(os.getuid())
    return entry.pw_name

def get_hostname():
    """Return the host name of the machine running this process."""
    hostname = socket.gethostname()
    return hostname

#################### TASK related utils ###########################################
# file related
def deletefiles(files):
    """Unlink every path in ``files`` that currently exists; skip the rest."""
    # filter() is lazy, so each existence check happens right before its unlink
    for existing in filter(os.path.exists, files):
        os.unlink(existing)


# subprocess
def subprocesscall(cmd, stdout=None):
    """Echo ``cmd``, run it through the shell and fail on a non-zero exit code.

    Args:
        cmd: shell command line to execute.
        stdout: optional path; when given, the command's standard output is
            written to that file (opened in write mode).

    Raises:
        signals.FAIL: if the command returns a non-zero exit status.
    """
    for banner_line in ('SHELL CMD ----------------------------------',
                        cmd,
                        '--------- ----------------------------------'):
        print(banner_line)

    if stdout is not None:
        with open(stdout, 'w') as sink:
            status = subprocess.call(cmd, stdout=sink, shell=True)
    else:
        status = subprocess.call(cmd, shell=True)

    if status == 0:
        return
    raise signals.FAIL(message=f'{cmd} returned non zero result {status}')

# ssh
def remote_remove_files(host, files, sshuser):
    """Delete ``files`` on ``host`` over SFTP and report what could not be removed.

    Args:
        host: remote host to connect to.
        files: remote paths to delete.
        sshuser: user name for the SSH connection.

    Returns:
        List of paths whose removal failed with an I/O error. Paths that did
        not exist on the remote side are logged but not reported as remaining.
        An empty ``files`` returns ``[]`` without opening a connection.
    """
    if len(files) == 0:
        return []
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    remain = []
    sftp = None  # stays None if connecting fails, so cleanup can skip it
    try:
        ssh.connect(host, username=sshuser)
        sftp = ssh.open_sftp()
        for f in files:
            try:
                sftp.remove(f)
                print(f'REMOTE CLEAN:{host}:{f}')
            except FileNotFoundError:
                # Must precede IOError: FileNotFoundError is a subclass of
                # OSError (== IOError) and would otherwise never be reached.
                print(f'REMOTE NO FILE:{host}:{f}')
            except IOError:
                remain.append(f)
                print(f'ERROR REMOTE CLEAN:{host}:{f}')
    finally:
        if sftp is not None:
            sftp.close()
        ssh.close()
    return remain
def sftp_mkdirs(sftp, rabsdir):
    """Make sure remote directory ``rabsdir`` exists, creating parents as needed.

    Existence is probed with ``sftp.chdir``, so when the directory already
    exists the SFTP session's working directory is changed to it.

    Args:
        sftp: an open SFTP client (needs ``chdir`` and ``mkdir``).
        rabsdir: remote directory path.

    Returns:
        True if the directory had to be created, otherwise None.
    """
    try:
        sftp.chdir(rabsdir)  # succeeds only if the directory exists
    except IOError:
        parent = os.path.dirname(rabsdir.rstrip('/'))
        # Recurse into the parent first; stop when there is no parent left so
        # a relative or root path cannot recurse forever.
        if parent and parent != rabsdir:
            sftp_mkdirs(sftp, parent)
        sftp.mkdir(rabsdir)  # parents exist now, create the missing leaf
        return True
def sftp_put(sftp, src, dst, host):
    """Upload local file ``src`` to remote path ``dst``, creating its directory.

    Args:
        sftp: an open SFTP client.
        src: local source path.
        dst: remote destination path.
        host: remote host name; used only in the log messages.

    Raises:
        Any error raised while creating the remote directory or uploading is
        logged and re-raised unchanged.
    """
    try:
        # make sure parent directory exists
        sftp_mkdirs(sftp, os.path.dirname(dst))
        sftp.put(src, dst)
        print(f'REMOTE PUT:{host}:{dst}')
    except Exception:
        print(f'FAILED REMOTE PUT:{host}:{dst}')
        raise
def remote_put_files(host, srcdsts, sshuser, overwrite=False):
    """Upload several files to ``host`` over a single SFTP session.

    Args:
        host: remote host to connect to.
        srcdsts: iterable of ``(local_src, remote_dst)`` pairs.
        sshuser: user name for the SSH connection.
        overwrite: when False (default), a destination that already exists on
            the remote side (``sftp.stat`` succeeds) is left untouched.

    Returns:
        None. An empty ``srcdsts`` returns without opening a connection.
    """
    if len(srcdsts) == 0:
        return
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    sftp = None  # stays None if connecting fails, so cleanup can skip it
    try:
        ssh.connect(host, username=sshuser)
        sftp = ssh.open_sftp()
        for src, dst in srcdsts:
            if overwrite:
                sftp_put(sftp, src, dst, host)
            else:
                try:
                    # check remote file
                    sftp.stat(dst)
                    print(f'REMOTE ALREADY EXISTST:{host}:{dst}')
                except IOError:
                    # stat failed -> destination is absent, so upload it
                    sftp_put(sftp, src, dst, host)
    finally:
        if sftp is not None:
            sftp.close()
        ssh.close()


In [26]:
# Scratch cleanup of the mkdirs test directory; tolerate it being absent so
# the cell does not raise on a fresh checkout.
shutil.rmtree('./tests/a', ignore_errors=True)

In [28]:
# test mkdirs mode
import shutil
# Start from a clean slate; a missing directory is fine, so let rmtree ignore
# errors instead of hiding everything behind a bare except.
shutil.rmtree('./tests/a', ignore_errors=True)
mkdirs('./tests/a/b')
# Both the leaf and the intermediate directory must be drwxrwxr-x (0o775).
assert(os.stat('./tests/a/b').st_mode==0o40775)
assert(os.stat('./tests/a').st_mode==0o40775)

Directory ./tests/a/b created successfully


In [33]:
# export

# FASTQ paired reader
#from io import StringIO
def FASTQP(r1, r2):
    """Lazily iterate over paired FASTQ records from files ``r1`` and ``r2``.

    Both files are opened in text mode; gzip is used for both when ``r1``
    ends with '.gz', otherwise the built-in ``open`` is used.

    Yields:
        ``(rec1, rec2)`` where each record is a 4-tuple of consecutive lines
        (line endings kept). Iteration stops at the shorter file; a trailing
        group of fewer than four lines is dropped.
    """
    # Bind the opener to its own name: assigning to `open` inside the function
    # would make it a local and break the plain-text (non-gzip) case.
    opener = gzip.open if r1.endswith('.gz') else open
    with opener(r1, 'rt') as f1, opener(r2, 'rt') as f2:
        recs1 = zip(*[f1] * 4)  # read 4 lines at a time
        recs2 = zip(*[f2] * 4)
        for rec1, rec2 in zip(recs1, recs2):
            yield (rec1, rec2)

def FASTQP_buffered(r1, r2):
    """Iterate over paired FASTQ records after reading both files into memory.

    Both files are opened in text mode; gzip is used for both when ``r1``
    ends with '.gz', otherwise the built-in ``open`` is used. Each file is
    read completely with ``readlines()`` and closed before any record is
    yielded.

    Yields:
        ``(rec1, rec2)`` where each record is a 4-tuple of consecutive lines
        (line endings kept). Iteration stops at the shorter file; a trailing
        group of fewer than four lines is dropped.
    """
    # Bind the opener to its own name: assigning to `open` inside the function
    # would make it a local and break the plain-text (non-gzip) case.
    opener = gzip.open if r1.endswith('.gz') else open
    # Context managers close the handles, which were previously leaked.
    with opener(r1, 'rt') as fh1:
        lines1 = iter(fh1.readlines())
    with opener(r2, 'rt') as fh2:
        lines2 = iter(fh2.readlines())
    recs1 = zip(*[lines1] * 4)  # 4 lines per record
    recs2 = zip(*[lines2] * 4)
    for rec1, rec2 in zip(recs1, recs2):
        yield (rec1, rec2)
                
def FASTQP_readall(r1, r2):
    """Return every paired record produced by ``FASTQP(r1, r2)`` as a list."""
    return list(FASTQP(r1, r2))



In [35]:
# export

# Complement of each base; characters missing from the table are kept as-is.
CDIC = {
    'A': 'T',
    'T': 'A',
    'G': 'C',
    'C': 'G',
    'N': 'N',
}


def revcomp(s):
    """Return the reverse complement of sequence ``s``."""
    return comp(s[::-1])


def comp(s):
    """Return the complement of ``s`` without reversing it."""
    return ''.join(CDIC.get(base, base) for base in s)

def detect_overlap(s1, s2, edge=8):
    """Detect the overlap between read ``s1`` and the reverse complement of ``s2``.

    The first and last ``edge`` bases of each read are reverse-complemented
    and searched for in the other read; which of the four probes hit decides
    the overlap type.

    Args:
        s1: first read sequence.
        s2: second (mate) read sequence.
        edge: number of bases taken from each read end as the search probe.

    Returns:
        Tuple ``(ovl, tot, sl1, sl2)``:
          * ``ovl`` - overlap length (0 when none is detected),
          * ``tot`` - combined length after accounting for the overlap,
          * ``sl1`` - ``(start, stop)`` of the overlap within ``s1``,
          * ``sl2`` - ``(start, stop)`` of the overlap within ``s2``;
        both slices are ``(-1, -1)`` when no overlap is detected.
    """
    # Probe positions (-1 when absent). p1* index into s2, p2* index into s1.
    p1s = s2.find(revcomp(s1[:edge]))    # start of s1 located in s2
    p1e = s2.find(revcomp(s1[-edge:]))   # end of s1 located in s2
    p2s = s1.find(revcomp(s2[:edge]))    # start of s2 located in s1
    p2e = s1.find(revcomp(s2[-edge:]))   # end of s2 located in s1

    # Defaults: no overlap.
    ovl = 0
    tot = len(s1) + len(s2)
    sl1 = -1, -1
    sl2 = -1, -1

    if p1s >= 0 and p1e >= 0:  # s1 contained in s2
        ovl = len(s1)
        tot = len(s2)
        sl1 = 0, len(s1)
        sl2 = p1e, p1s + edge
    elif p2s >= 0 and p2e >= 0:  # s2 contained in s1
        ovl = len(s2)
        tot = len(s1)
        sl1 = p2e, p2s + edge
        sl2 = 0, len(s2)
    elif p1s >= 0 and p1e < 0 and p2s >= 0 and p2e < 0:  # end (5') overlap
        o1 = p2s + edge
        o2 = p1s + edge
        # The two estimates can disagree (e.g. polyA/polyT matches); only
        # accept the overlap when they are consistent.
        if o1 == o2:
            ovl = o1
            tot = len(s1) + len(s2) - ovl
            sl1 = 0, o1
            sl2 = 0, o2
    elif p1s < 0 and p1e >= 0 and p2s < 0 and p2e >= 0:  # front (3') overlap
        # This branch used to test `p2s < 0 and p2s >= 0`, which can never be
        # true; the end probe of s2 (p2e) is what must be present here.
        o1 = len(s1) - p2e
        o2 = len(s2) - p1e
        if o1 == o2:
            ovl = o1
            tot = len(s1) + len(s2) - ovl
            # p2e is a coordinate in s1 and p1e a coordinate in s2.
            sl1 = p2e, len(s1)
            sl2 = p1e, len(s2)
    return ovl, tot, sl1, sl2


In [17]:
# Scratch cell: IPython `?` help lookup for os.chmod (interactive-only syntax, not valid plain Python).
os.chmod?