In [None]:
import pandas as pd
import numpy as np 
import sys 
import matplotlib.pyplot as plt
import time 
%matplotlib inline

# Simulation output under analysis. The commented-out alternative ran cleanly;
# the active file is the one suspected of triggering the bug.
#filename = '/home/kp/Desktop/flow_a-l_0.3-time_1000.0hr.dat'
filename = '/home/kp/Desktop/flow_a-l_0.6-time_1000.0hr.dat'
# Derived event files share the input's base name (the last 4 characters,
# i.e. the '.dat' suffix, are dropped before the new suffix is appended).
entrainpath = '{}-entrains.dat'.format(filename[:-4])
depositpath = '{}-deposits.dat'.format(filename[:-4])

In [None]:
# Use pandas to load the file for analysis: read_csv is faster than np.loadtxt
# and supports chunked iteration, so the whole file never has to fit in memory.
nrows = 5000000 # upper bound on the number of data rows read (5 million)
#linecount = num_lines = sum(1 for line in open(filename))-2 # this takes time. 
#print('{} lines to process '.format(linecount))
chunksize = int(50000) # number of rows yielded per chunk
#print('{} chunks to iterate'.format(float(linecount)/chunksize))
def loadfile(filename):
    """Open a chunked reader over a whitespace-delimited simulation output file.

    The first two lines of the file are skipped and only the first three
    columns are kept. The reader yields DataFrames of at most `chunksize`
    rows and stops after `nrows` rows in total; both limits are read from the
    module-level variables of the same name at call time.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    pandas TextFileReader
        Iterator over DataFrame chunks with three unnamed columns.
    """
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
    return pd.read_csv(filename, skiprows = 2, usecols = [0,1,2], header = None,
                   sep = r'\s+', index_col = False, iterator = True, chunksize = chunksize, nrows=nrows)
# Open a chunked reader on the raw simulation output.
# NOTE(review): the functions in the visible cells open their own readers, so
# this handle looks unused here - confirm before relying on it.
file = loadfile(filename)

In [None]:
def discern_ed(entrainpath, depositpath, filename):
    """Extract every entrainment and deposition event from a simulation file.

    The file is read in chunks via `loadfile`. Each chunk unpacks into three
    columns, of which the second (m) and third (t) are used. A row where m is
    exactly one lower than in the preceding row is written to `entrainpath`;
    a row where m is exactly one higher is written to `depositpath`. Each
    output line holds `t m` for the row at which the change is observed, so m
    is the value after the event. Lines are written with np.savetxt and carry
    no header.

    Transitions that straddle two chunks are detected by remembering the last
    m of the previous chunk. The very first row of the file has no predecessor
    and is therefore never reported as an event.

    Parameters
    ----------
    entrainpath : str
        Output path for entrainment events (overwritten).
    depositpath : str
        Output path for deposition events (overwritten).
    filename : str
        Simulation output file passed to `loadfile`.
    """
    # background on storing data in files:
    # https://www.pythonforthelab.com/blog/introduction-to-storing-data-in-files/
    m_prev = None # last m of the previous chunk; None until a chunk has been read

    # the context manager closes both outputs even if a chunk fails to parse
    with open(entrainpath, 'wb') as entrainfile, open(depositpath, 'wb') as depositfile:
        for f in loadfile(filename):
            _, m, t = f.values.T # unpack the values from the chunk

            # transition between the previous chunk's last row and this chunk's first row
            if m_prev is not None:
                if m[0] == m_prev + 1: # deposition across the chunk boundary
                    np.savetxt(depositfile, np.column_stack((t[0], m[0])))
                elif m[0] == m_prev - 1: # entrainment across the chunk boundary
                    np.savetxt(entrainfile, np.column_stack((t[0], m[0])))
            m_prev = m[-1] # boundary value for the next chunk

            # transitions inside the chunk: step[i] = m[i+1] - m[i], reported at row i+1
            step = np.diff(m)
            t_inner, m_inner = t[1:], m[1:]
            erodemask = step == -1
            depositmask = step == 1

            np.savetxt(entrainfile, np.column_stack((t_inner[erodemask], m_inner[erodemask])))
            entrainfile.flush() # keep partial results on disk during long runs
            np.savetxt(depositfile, np.column_stack((t_inner[depositmask], m_inner[depositmask])))
            depositfile.flush()

discern_ed(entrainpath,depositpath,filename)

In [None]:
def mstats(filename, chunksize=50000, max_rows=None):
    """Estimate the probability of each m value within mean +/- 3 std.

    The file is streamed three times in chunks: once for the mean of m, once
    for its standard deviation, and once to count how often each integer m in
    [round(mean - 3*std), round(mean + 3*std)] occurs. The counts are
    normalised by their own sum, so the probabilities add up to 1 over the
    retained window, not over the whole file. The table is also written to
    '<filename minus its last 4 characters>-mstats.dat' with np.savetxt.

    Sums and row counts are accumulated across chunks, so a short final chunk
    is weighted by its actual size.

    Parameters
    ----------
    filename : str
        Whitespace-delimited simulation output; the first two lines are
        skipped and m is taken from the second column.
    chunksize : int, optional
        Rows per chunk (default 50000).
    max_rows : int, optional
        Maximum number of rows to read. When None, the module-level `nrows`
        is used.

    Returns
    -------
    numpy.ndarray
        Array of shape (k, 2): column 0 holds the m values, column 1 their
        probabilities.

    Raises
    ------
    ValueError
        If the file yields no data rows.
    """
    if max_rows is None:
        max_rows = nrows # fall back to the notebook-wide row limit

    def m_chunks():
        # a fresh reader per pass; yields the m column of each chunk
        reader = pd.read_csv(filename, skiprows = 2, usecols = [0,1,2], header = None,
                             sep = r'\s+', index_col = False, iterator = True,
                             chunksize = chunksize, nrows = max_rows)
        for chunk in reader:
            yield chunk.values[:, 1]

    # pass 1: mean, weighted by the true number of rows
    total = 0.0
    count = 0
    for m in m_chunks():
        total += m.sum()
        count += m.size
    if count == 0:
        raise ValueError('no data rows found in {}'.format(filename))
    mean_m = total/count

    # pass 2: population standard deviation about that mean
    sq_dev = 0.0
    for m in m_chunks():
        sq_dev += ((m-mean_m)**2).sum()
    std_m = np.sqrt(sq_dev/count)

    # m values to consider: from mean(m)-3*std(m) to mean(m)+3*std(m)
    mvals = np.arange(int(round(mean_m-3*std_m)), int(round(mean_m+3*std_m))+1, 1)

    # pass 3: occurrences of each candidate m value
    counts = np.zeros(len(mvals), dtype=np.int64)
    for m in m_chunks():
        counts += np.array([(m == mv).sum() for mv in mvals], dtype=np.int64)

    pmvals = counts/counts.sum()
    data = np.array((mvals, pmvals)).T
    mstatsfile = filename[:-4]+'-mstats.dat'

    np.savetxt(mstatsfile, data)

    return data

In [None]:
# Unpack the two columns returned by mstats: the m values and their probabilities.
m, pm = mstats(filename).T

In [None]:
# Probability of each m value as computed by mstats.
plt.plot(m, pm)
plt.xlabel('m')
plt.ylabel('P(m)'); # trailing ';' suppresses the text repr in the notebook

In [None]:
tmax = 500*3600 # maximum return time to allow for
dt = 0.5 # bin size
bins = np.arange(0, tmax, dt) # histogram bin edges
t0 = time.time()

def _event_times(chunks, level):
    """Return the times of all rows in `chunks` whose m value equals `level`."""
    picked = [np.array([])] # float seed keeps the result float64 even with no chunks
    for chunk in chunks:
        times, levels = chunk.values.T
        picked.append(times[levels == level])
    return np.concatenate(picked) # single concatenate instead of one per chunk

def conditional_rt(entrainfile, depositfile, mstar, bins = bins, dt = 0.5, tmax = tmax):
    """Compute the return-time exceedance curve conditional on elevation mstar.

    A departure is a deposit row whose m equals mstar+1; a return is an
    entrain row whose m equals mstar. The two series are trimmed so that the
    first event is a departure and the last is a return, surplus events are
    dropped so both have equal length, and the return times are the
    element-wise differences returns - departures.

    Parameters
    ----------
    entrainfile, depositfile : iterable
        Iterables of two-column chunks exposing `.values` (t in column 0,
        m in column 1), e.g. the readers returned by `load`. They are
        consumed by this call.
    mstar : number
        Elevation to condition on.
    bins : array-like, optional
        Histogram bin edges for the return times.
    dt : float, optional
        Bin width used to integrate the density; expected to match `bins`.
    tmax : number, optional
        Unused; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        Array of length len(bins)-1 holding 1 - (cumulative density * dt) per
        bin, i.e. the probability that the return time exceeds that bin.
        All zeros when there is no usable departure/return pair or when no
        return time falls inside `bins`.
    """
    returns = _event_times(entrainfile, mstar)        # times of returns to mstar
    departures = _event_times(depositfile, mstar + 1) # times of departures from mstar
    no_data = np.zeros(len(bins)-1, dtype=float)

    # Drop returns that precede the first departure and departures that follow
    # the last return. The length guards stop the loops once either series is
    # exhausted instead of indexing into an empty array.
    while len(returns) and len(departures) and returns[0] < departures[0]:
        returns = returns[1:]
    while len(returns) and len(departures) and departures[-1] > returns[-1]:
        departures = departures[:-1]
    if len(returns) == 0 or len(departures) == 0:
        return no_data

    # The series now starts with a departure and ends with a return, but it may
    # still hold k surplus departures at the front (k > 0) or |k| surplus
    # returns at the back (k < 0); drop them so the arrays pair up.
    k = len(departures) - len(returns)
    if k > 0:
        departures = departures[k:]
    if k < 0:
        returns = returns[:k]

    crt = returns - departures # conditional return times

    # Normalise by hand (same arithmetic as np.histogram(density=True)) so an
    # empty histogram yields zeros instead of NaN from a 0/0 division.
    counts, edges = np.histogram(crt, bins = bins)
    total = counts.sum()
    if total == 0:
        return no_data
    H = counts / np.diff(edges) / total
    F = 1 - H.cumsum()*dt # each val is the probability that TR exceeds the bin in question
    return F

In [None]:
# the entire unconditional return time analysis sequence

def load(filename, chunksize = 5000):
    """Open chunked readers over the entrainment and deposition event files.

    The event files are located by replacing the last 4 characters of
    `filename` with '-entrains.dat' and '-deposits.dat'.

    Parameters
    ----------
    filename : str
        Path of the original simulation output file.
    chunksize : int, optional
        Number of rows per chunk yielded by each reader.

    Returns
    -------
    (entrainfile, depositfile)
        Two pandas readers yielding DataFrame chunks. A reader can only be
        iterated once; call `load` again for a fresh pass.
    """
    entrainpath = filename[:-4]+'-entrains.dat' # generate the filenames for entrainment and deposition
    depositpath = filename[:-4]+'-deposits.dat'
    # header=None: the event files are written by np.savetxt without a header
    # line, so the first line is data and must not be consumed as column names.
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    entrainfile = pd.read_csv(entrainpath, sep = r'\s+', header = None,
                              index_col = None, iterator = True, chunksize = chunksize)
    depositfile = pd.read_csv(depositpath, sep = r'\s+', header = None,
                              index_col = None, iterator = True, chunksize = chunksize)
    return entrainfile, depositfile # return iterables for each file

In [None]:
def cdfcompute(filename, tmax=500*3600, dt=0.5, chunksize=1000000):
    """Compute and save the unconditional return-time exceedance curve.

    For every (m, p) row of '<base>-mstats.dat' the conditional curve from
    `conditional_rt` is computed and the curves are summed with weights p.
    The event files are re-opened through `load` for each m value because the
    chunked readers can only be consumed once.

    The result is written twice under '<base>-rtcdf.dat': with np.save (which
    appends '.npy' to the name) and with np.savetxt. Timing information for
    the computation and for each save is printed.

    Parameters
    ----------
    filename : str
        Path of the original simulation output; '<base>' is this path without
        its last 4 characters.
    tmax : number, optional
        Upper limit of the return-time bins.
    dt : float, optional
        Bin width.
    chunksize : int, optional
        Rows per chunk when streaming the event files.

    Returns
    -------
    (bins, cdf)
        Bin midpoints and the weighted curve, both of length
        len(np.arange(0, tmax, dt)) - 1.
    """
    tt0 = time.time()
    bins = np.arange(0, tmax, dt) # bin edges

    # get the statistics of m
    cdffilename = filename[:-4]+'-rtcdf.dat'
    mstatsfilename = filename[:-4]+'-mstats.dat'
    # ndmin=2 keeps the column unpacking valid when the table has a single row
    m, pm = np.loadtxt(mstatsfilename, ndmin=2).T

    # accumulate the conditional curves, weighted by the probability of each m
    cdf = np.zeros(len(bins)-1,dtype=float)
    for p,mv in zip(pm,m): # iterate through all m values in question
        entrainfile, depositfile = load(filename,chunksize=chunksize) # fresh readers for every pass
        # pass the binning explicitly so it cannot drift from the defaults
        # captured when conditional_rt was defined
        Fmv = conditional_rt(entrainfile, depositfile, mv, bins=bins, dt=dt, tmax=tmax)
        cdf+=p*Fmv

    # final output: bin centres and the weighted curve corresponding to them
    bins = (bins[1:]+bins[:-1])/2.0 # compute the midpoints of the bins

    data = np.array((bins,cdf)).T
    print(time.time()-tt0, ' total time up to saving at chunksize ', chunksize)

    print ('---------------------------------------')
    t0 = time.time()
    np.save(cdffilename, data) # binary copy, lands in cdffilename + '.npy'
    print(time.time()-t0, ' numpy version 1 ')

    print ('---------------------------------------')
    t0 = time.time()
    np.savetxt(cdffilename,data) # plain-text copy
    print(time.time()-t0, 'numpy version 2')


    return bins, cdf

In [None]:
# Run the full analysis; note that this rebinds the notebook-level `bins` to
# the values returned by cdfcompute.
bins, cdf = cdfcompute(filename)

In [None]:
# Scratch arithmetic. NOTE(review): presumably one tenth of a measured 368 s
# runtime, used for the speed-up remark in the next cell - confirm.
368/10

In [None]:
# Re-run of the analysis; observation recorded here: a large chunksize gives
# roughly a 10% speed-up.
bins, cdf = cdfcompute(filename)

In [None]:
# Runtime estimate: flow condition a has 277 million lines; at about 100 s per
# million lines the total is 277*100 s, converted here to hours.
277*100/3600 # roughly 7.7 hrs

In [None]:
# How many GB is 50,000,000 lines? Linear scaling of 21.0 over 277e6 lines.
# NOTE(review): 21.0 is presumably the full file size in GB - confirm.
21.0/277e6*50e6 # should be able to chunk at 50 million lines ... 

In [None]:
# Memory experiment: allocate a 2 x 200-million array of random floats and
# report its size in GB. The original note reads "should be able to chunk at
# 100 million lines... with 8GB or 16G", but the array built here has 200e6
# columns.
array = np.random.random(size=(2,int(200e6)))

array.nbytes/1e9

In [None]:
# Boolean-mask selection on the large test array; the result is echoed as the cell output.
array[array<0.2]

In [2]:
# Compute-time bounds for the full analysis.
# Input size: 277 million lines at least, 700 million lines at most.
# Compute time also scales linearly with the number of m values, which ranges from 100 to 700.

# therefore max compute time is 
print(7*700*100/3600/24) # days, assuming changes in saving overhead are small 

# while min compute time is about 
print(277*100/3600/24) # days

5.671296296296297
0.32060185185185186


In [None]:
# Reload the table written by np.save in cdfcompute. np.save appends '.npy' to
# the target name, hence the double extension. The path is derived from
# `filename` so it always matches the file selected at the top of the notebook.
bins, cdf = np.load(filename[:-4] + '-rtcdf.dat.npy').T

In [None]:
# Log-log view of the weighted exceedance curve against the bin midpoints;
# the y-range is clipped to [1e-3, 1].
plt.loglog(bins, cdf)
plt.xlabel('return time')
plt.ylabel('P(TR > t)')
plt.ylim(1e-3,1)

In [None]:
# Same log-log plot as the preceding cell (identical code); y-range clipped to [1e-3, 1].
plt.loglog(bins, cdf)
plt.ylim(1e-3,1)

In [None]:
# Script version of the unconditional return-time analysis (same steps as
# cdfcompute, with load's default chunksize and a plain-text save only).
t0 = time.time()
# define the parameters for analysis
tmax = 500*3600 # maximum return time to allow for
dt = 0.5 # bin size
bins = np.arange(0, tmax, dt) # bin edges

# get the statistics of m
cdffilename = filename[:-4]+'-rtcdf.dat'
mstatsfilename = filename[:-4]+'-mstats.dat'
# ndmin=2 keeps the column unpacking valid when the table has a single row
m, pm = np.loadtxt(mstatsfilename, ndmin=2).T

# accumulate the conditional curves, weighted by the probability of each m
cdf = np.zeros(len(bins)-1,dtype=float)
for p,mv in zip(pm,m): # iterate through all m values in question
    entrainfile, depositfile = load(filename) # fresh readers: each one can only be consumed once
    # pass this cell's binning explicitly instead of relying on the defaults
    # captured when conditional_rt was defined
    Fmv = conditional_rt(entrainfile, depositfile, mv, bins=bins, dt=dt, tmax=tmax)
    cdf+=p*Fmv

# final output: bin centres and the weighted curve corresponding to them
#cdf = (conditionals*pm.reshape(-1, 1)).sum(axis = 0) # compute the unconditional cdf
bins = (bins[1:]+bins[:-1])/2.0 # compute the midpoints of the bins

data = np.array((bins,cdf)).T
np.savetxt(cdffilename,data)
print(time.time()-t0)

In [None]:
# Log-log plot of the curve computed in the preceding cell against the bin midpoints.
plt.loglog(bins,cdf)

In [None]:

# Timing run of the unconditional return-time analysis; the result is kept in
# memory only (the save is commented out).
t0 = time.time()
# define the parameters for analysis
tmax = 500*3600 # maximum return time to allow for
dt = 0.5 # bin size
bins = np.arange(0, tmax, dt) # bin edges

# get the statistics of m
cdffilename = filename[:-4]+'-rtcdf.dat'
mstatsfilename = filename[:-4]+'-mstats.dat'
# ndmin=2 keeps the column unpacking valid when the table has a single row
m, pm = np.loadtxt(mstatsfilename, ndmin=2).T

# accumulate the conditional curves, weighted by the probability of each m
cdf = np.zeros(len(bins)-1,dtype=float)
for p,mv in zip(pm,m): # iterate through all m values in question
    entrainfile, depositfile = load(filename) # fresh readers: each one can only be consumed once
    # pass this cell's binning explicitly instead of relying on the defaults
    # captured when conditional_rt was defined
    Fmv = conditional_rt(entrainfile, depositfile, mv, bins=bins, dt=dt, tmax=tmax)
    cdf+=p*Fmv

# final output: bin centres and the weighted curve corresponding to them
#cdf = (conditionals*pm.reshape(-1, 1)).sum(axis = 0) # compute the unconditional cdf
bins = (bins[1:]+bins[:-1])/2.0 # compute the midpoints of the bins

data = np.array((bins,cdf)).T
#np.savetxt(cdffilename,data)
print(time.time()-t0)