# Testing dirichlet process prior w/ and w/o buffering/ordering


In [27]:
import os
import time
import subprocess

#msbayes paths
MSBAYES_ROOTDIR="/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/"
MSBAYES_EXECDIR=MSBAYES_ROOTDIR+"src/"

# Binaries
MSBAYES_BIN=MSBAYES_EXECDIR+"msbayes.pl"

# Results directories
MSBAYES_DATADIR=MSBAYES_ROOTDIR+"data/"

# Dirichlet
DIRICHLET_OUT=MSBAYES_DATADIR+"dirichlet/"
DIRICHLET_PRIORS_DIR=DIRICHLET_OUT+"priors/"
DIRICHLET_CONF_DIR=DIRICHLET_OUT+"conf/"
DIRICHLET_RESULTS_DIR=DIRICHLET_OUT+"results/"

os.chdir(MSBAYES_EXECDIR)

In [80]:
## Define the msbayes priors command
def do_priors(num, sorting="7", outname, conf_file):
    os.chdir(MSBAYES_EXECDIR)
    cmd = MSBAYES_BIN \
        + " -s " + sorting \
        + " -r " + str(num) \
        + " -c " + conf_file \
        + " -o " + outname
    try:
        print(cmd)
        time.sleep(2)
        os.system(cmd)
        #subprocess.check_output([MSBAYES_BIN, "-h"],
        #                        stderr=subprocess.STDOUT)
    except Exception as inst:
        print(inst)

In [56]:
## Generate a chunk of the reference table
## This takes the better part of a day, so don't run it unless you're _sure_
## you want it. If you need to kill open a term and `killall -9 perl`
DIRICHLET_CONF=DIRICHLET_CONF_DIR+"conf_dirichlet_buffer0.txt"
DIR_BUFF0_PRIORS_DIR=DIRICHLET_PRIORS_DIR+"buffer0/"

PRIORS_SIZE=3000000
NPROC=10
CHUNK_SIZE=PRIORS_SIZE/NPROC
print("chunk size = "+str(CHUNK_SIZE))
for i in range(NPROC):
    outfile = DIR_BUFF0_PRIORS+"dirichlet-unsorted-"+str(i)+".prior"
    print(outfile)

    p = multiprocessing.Process(target=do_priors, args=(CHUNK_SIZE,outfile, DIRICHLET_CONF))
    p.start()
    time.sleep(2)


chunk size = 300000
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/dirichlet-unsorted-0.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/dirichlet-unsorted-1.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/src/msbayes.pl -s 0 -r 300000 -c /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet.txt -o /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/dirichlet-unsorted-0.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/dirichlet-unsorted-2.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/src/msbayes.pl -s 0 -r 300000 -c /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet.txt -o /Volumes/WorkDrive/msbayes

In [78]:
## Make the dirichlet buffering files and priors
BUFFER_VALUES = ["0.1", "0.05", "0.01"]
buffer_conf_dict={}
for buff in BUFFER_VALUES:
    print(DIRICHLET_CONF.split("buffer")[-1])
    DIR_BUFF_CONF=DIRICHLET_CONF.split("0")[0]+buff+".txt"
    print("Making - " + DIR_BUFF_CONF)
    with open(DIRICHLET_CONF, 'r') as infile:
        lines = infile.readlines()
        with open(DIR_BUFF_CONF, 'w') as outfile:
            for line in lines:
                if "bufferTauClasses = 0" in line:
                    outfile.write("bufferTauClasses = " + buff)
                else:
                    outfile.write(line)
        buffer_conf_dict[buff]=DIR_BUFF_CONF

0.txt
Making - /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet_buffer0.1.txt
0.txt
Making - /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet_buffer0.05.txt
0.txt
Making - /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet_buffer0.01.txt


In [None]:
## Generate a chunk of the reference table
## This takes the better part of a day, so don't run it unless you're _sure_
## you want it. If you need to kill open a term and `killall -9 perl`
#BUFFER_VALUES = ["0.01", "0.05", "0.1"]
BUFFER_VALUES = ["0.05", "0.1"]

for buff in BUFFER_VALUES:
    
    DIR_BUFF_PRIORS_DIR=DIRICHLET_PRIORS_DIR+"buffer"+buff+"/"
    directory=DIR_BUFF_PRIORS_DIR
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    PRIORS_SIZE=3000000
    NPROC=10
    CHUNK_SIZE=PRIORS_SIZE/NPROC
    ## Don't sort
    SORTING="0"
    print("chunk size = "+str(CHUNK_SIZE))
    for i in range(NPROC):
        outfile = DIR_BUFF_PRIORS_DIR+"dirichlet-buffer"+buff+"-unsorted-"+str(i)+".prior"
        print(outfile)

        p = multiprocessing.Process(target=do_priors, args=(CHUNK_SIZE, SORTING, outfile, buffer_conf_dict[buff]))
        p.start()
        time.sleep(2)
    ## Join the main process ot the last spawned process and wait for it to end before doing the next set of buffers.
    p.join()

chunk size = 300000
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/buffer0.05/dirichlet-buffer0.05-unsorted-0.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/buffer0.05/dirichlet-buffer0.05-unsorted-1.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/src/msbayes.pl -s 0 -r 300000 -c /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet_buffer0.05.txt -o /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/buffer0.05/dirichlet-buffer0.05-unsorted-0.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/buffer0.05/dirichlet-buffer0.05-unsorted-2.prior
/Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/src/msbayes.pl -s 0 -r 300000 -c /Volumes/WorkDrive/msbayes-buffering/hickerla

In [8]:
## For each of the buffer values go through and concatenate all the
## priors files into one giant file
## Takes about 5 minutes per directory
#BUFFER_VALUES = ["0.05", "0.1", "0"]
BUFFER_VALUES = ["0"]

for buff in BUFFER_VALUES:
    print("Doing - "+buff)
    
    DIR_BUFF_PRIORS_DIR=DIRICHLET_PRIORS_DIR+"buffer"+buff+"/"
    NCHUNKS=10
    MASSIVE_PRIOR=DIR_BUFF_PRIORS_DIR+"buffer"+buff+".prior"
    with open(MASSIVE_PRIOR, 'w') as outfile:
        for i in range(NCHUNKS):
            with open(DIR_BUFF_PRIORS_DIR+"dirichlet-buffer"+buff+"-unsorted-"+str(i)+".prior", 'r') as infile:
                if i == 0:
                    ## Write the header only once
                    outfile.write(infile.readline())
                else:
                    ## Read the header and swallow it
                    wat = infile.readline()
                ## Copy the rest of the file in 1024kb chunks
                while True:
                    data = infile.read(1024)
                    if not data:
                        break
                    outfile.write(data)

Doing - 0


# Do the PODS

In [30]:
import time
#BUFFER_VALUES = ["0.05", "0.1", "0"]
BUFFER_VALUES = ["0.01"]

## Go to the msbayes working directory
os.chdir(MSBAYES_EXECDIR)

NPODS = 100
STEP = 5
SORTING="0"
for buff in BUFFER_VALUES:
    print("Doing - "+buff)

    # Find the priors file
    DIR_BUFF_PRIORS_DIR=DIRICHLET_PRIORS_DIR+"buffer"+buff+"/"
    MASSIVE_PRIOR=DIR_BUFF_PRIORS_DIR+"buffer"+buff+".prior"
    
    # And the conf file
    DIRICHLET_CONF=DIRICHLET_CONF_DIR+"conf_dirichlet_buffer0.txt"
    DIR_BUFF_CONF=DIRICHLET_CONF.split("0")[0]+buff+".txt"

    cmd = "./bu_doPODS.sh" \
        + " -s " + SORTING\
        + " -p " + MASSIVE_PRIOR\
        + " -c " + DIR_BUFF_CONF\
        + " -n " + str(STEP)
    print(cmd)

    ## Do it in chunks so we can track progress
    count = 0
    while count < NPODS:
        try:
            print("On step {}\t{}".format(count, time.strftime('%X')))
            time.sleep(2)
            os.system(cmd)
            count += STEP
        except Exception as inst:
            print(inst)
    
    # Make the results directory
    MY_RESULTS_DIR=DIRICHLET_RESULTS_DIR+"buffer"+buff+"/"
    directory=MY_RESULTS_DIR
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    ## When you're done, save the output files
    results_files = ["results_psi_omega_et_tol.out", "results_psi.out", \
                     "results_omega_unadjusted.out", "results_omega.out", "all_pods.obs"]
    for res in results_files:
        os.rename(res, MY_RESULTS_DIR+res)

Doing - 0.01
./bu_doPODS.sh -s 0 -p /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/priors/buffer0.01/buffer0.01.prior -c /Volumes/WorkDrive/msbayes-buffering/hickerlab-repository/msbayes-buffering/data/dirichlet/conf/conf_dirichlet_buffer0.01.txt -n 5
On step 0	12:33:32
On step 5	15:05:24
On step 10	17:37:17
On step 15	20:06:00
On step 20	22:33:24
On step 25	01:01:16
On step 30	03:28:10
On step 35	05:55:26
On step 40	08:22:00
On step 45	10:48:44
On step 50	13:15:25
On step 55	15:42:01
On step 60	18:08:46
On step 65	20:35:32
On step 70	23:03:01
On step 75	01:30:14
On step 80	03:58:27
On step 85	06:25:37
On step 90	08:53:18
On step 95	11:20:35


In [16]:
import time
time.strftime('%X %x %Z')

'17:29:51 04/24/16 EDT'