# In the main branch, this file should not be executed and should have no parameters set.
# In the development branch, it must be fully executed and must include tests.

GOAL: generate a CSV result file from raw results files compressed in a GZ file.

In [None]:
import subprocess as __cmd
import numpy as __np
import pandas as __pd
import os as __os
import sys as __sys

## Parameters

In [None]:
# Set the path of the folder holding the GZ file that contains the raw
# results files (the path must end with a slash) and the GZ file name.
# "Raw" means coming directly from the simulations.
#
# Example:
#
# path = '/home/laercio/Dropbox/pesquisa/2017-voterModel/'
# path += 'code/resultados_e_analises/'
# path += '002__bubble_filtering__8_neighbors_squared_network/'
# path += 'raw_results/'     # Path too long to fit on one screen line.
#
# The file name
# gz = 'test.tar.gz'


# The path of the folder holding the gz file (must end with a slash,
# because file names are appended to it by plain string concatenation).
path = '/home/laercio/Dropbox/pesquisa/2017-voterModel/'
path += 'code/resultados_e_analises/'
path += '002__bubble_filtering__8_neighbors_squared_network/'
path += 'raw_results/'     # Built in pieces: the full path does not fit on one screen line.

# The GZ file name; the output CSV name is derived from it by dropping
# its last 7 characters (the '.tar.gz' suffix).
gz = 'test.tar.gz'

<br>
For standard usage, the code from here on should not be modified.

## Functions

In [None]:
# To collect the results from every raw result file, the idea is to
# read the simulation parameters from the file name, read the results
# stored in the file, build a row (a list) from both, and append that
# row to a big list. In the end, a pandas data frame is generated
# from the big list.

def _parse_field(fileName, startTag, endTag, cast):
    """Return the value written between two markers of a file name.

    The value is the text that follows the first occurrence of
    ``startTag`` and precedes the next occurrence of ``endTag``,
    converted with ``cast`` (e.g. ``int`` or ``float``).

    Raises ValueError if either marker is missing or if the text
    between them cannot be converted.
    """
    start = fileName.find(startTag)
    if start == -1:
        raise ValueError(
            "Marker %r not found in file name %r." % (startTag, fileName))
    start += len(startTag)

    # Search for the closing marker only after the opening one.
    end = fileName.find(endTag, start)
    if end == -1:
        raise ValueError(
            "Marker %r not found after %r in file name %r."
            % (endTag, startTag, fileName))

    return cast(fileName[start:end])


def get_results(resultsPath, programNickname):
    """Collect every raw result file of a folder into one data frame.

    Only the files of ``resultsPath`` whose names start with
    ``programNickname`` and end with ``'_m'`` are read. The simulation
    parameters are taken from the file name:

    * ``N``   - integer between ``'N_'`` and ``'__q'``;
    * ``q``   - float between ``'q_'`` and the next ``'__'``;
    * ``net`` - integer between ``'net_'`` and ``'_rep'``, or -1 when
      the name has no ``'net_'`` marker;
    * ``rep`` - integer between ``'rep_'`` and ``'_m'``.

    Each file must hold three values (mT, m2T, m4T) or four values
    (mT, m2T, m4T, mT_noMod). When the fourth value is absent,
    ``mT_noMod`` and ``std_noMod`` are NaN.

    Parameters
    ----------
    resultsPath : str
        Folder with the raw result files (a trailing slash is
        accepted but not required).
    programNickname : str
        Prefix of the file names to read; ``''`` matches every file.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns N, q, net, rep, mT, m2T,
        m4T, mT_noMod, std_ and std_noMod, sorted by N, q, net and
        rep. The index is not reset after sorting.

    Raises
    ------
    ValueError
        If a file name lacks an expected marker, or if a file does not
        contain exactly three or four values.
    """
    # Get the sorted list of results files names.
    fileNames = sorted(
        name for name in __os.listdir(resultsPath)
        if name.startswith(programNickname) and name.endswith('_m')
    )

    dataMatrix = []

    for fileName in fileNames:

        n = _parse_field(fileName, 'N_', '__q', int)
        q = _parse_field(fileName, 'q_', '__', float)

        # The network label is optional in the file name.
        if 'net_' in fileName:
            net = _parse_field(fileName, 'net_', '_rep', int)
        else:
            net = -1

        rep = _parse_field(fileName, 'rep_', '_m', int)

        # os.path.join works with or without a trailing slash in
        # resultsPath. atleast_1d turns the 0-d array that loadtxt
        # returns for a single-value file into a 1-d array, so the
        # length check below reports the problem properly.
        data = __np.atleast_1d(
            __np.loadtxt(__os.path.join(resultsPath, fileName)))

        if len(data) == 4:
            mT, m2T, m4T, mT_noMod = data
            std = __np.sqrt(m2T - mT**2)
            std_noMod = __np.sqrt(m2T - mT_noMod**2)

        elif len(data) == 3:
            mT, m2T, m4T = data
            mT_noMod = __np.nan
            std = __np.sqrt(m2T - mT**2)
            std_noMod = __np.nan
        else:
            raise ValueError(
'Each raw result file should contain just three or four values.')

        dataMatrix.append([n, q, net, rep, mT, m2T, m4T, mT_noMod,
                           std, std_noMod])

    df = __pd.DataFrame(dataMatrix,
                columns=['N', 'q', 'net', 'rep', 'mT', 'm2T', 'm4T',
                         'mT_noMod', 'std_', 'std_noMod'])

    df.sort_values(['N', 'q', 'net', 'rep'], inplace=True)

    return df

## Main

In [None]:
# Temporary working folder. It must end with a slash, because the
# commands below build file names by plain string concatenation.
path_temp = '/home/laercio/temp1977/'

In [None]:
# Create the temporary folder. getoutput() returns the text printed by
# the command and does not raise when the command fails (for example,
# when the folder already exists), so inspect the displayed output.
__cmd.getoutput("mkdir %s" % path_temp)

In [None]:
# Copy the gz file from `path` into the temporary folder, so the
# original archive is left untouched by the following steps.
__cmd.getoutput("cp %s%s %s" % (path, gz, path_temp))

In [None]:
# Uncompress the results inside the temporary folder
# (-z: gzip, -x: extract, -f: archive file, -C: destination folder).
__cmd.getoutput("tar -zxf %s%s -C %s" % (path_temp, gz, path_temp))

In [None]:
# Delete the gz copy and the execInfo.txt file from the temporary
# folder; -v makes rm report each removed file, which is printed here.
print(__cmd.getoutput("rm -v %s%s" % (path_temp, gz)))
print(__cmd.getoutput("rm -v %sexecInfo.txt" % path_temp))

In [None]:
# Create a results data frame from the raw results files in the
# temporary folder. The empty nickname matches every file name prefix.
# %time is an IPython magic that reports how long the call takes.
%time results = get_results(path_temp, '')

In [None]:
# Count the number of entries in the temporary folder
# (ls -1 prints one name per line; wc -l counts the lines).
__cmd.getoutput("ls %s -1 | wc -l" % path_temp)

In [None]:
# Sanity check: the number of rows of the results data frame must be
# equal to the number of files counted above.
len(results)

In [None]:
# Reset the results data frame index to 0..n-1 in the current (sorted)
# row order; drop=True discards the old index instead of keeping it
# as a column.
results.reset_index(drop=True, inplace=True)

In [None]:
# Save the results data frame as a CSV file in the same folder as the
# gz file. gz[:-7] drops the last 7 characters of the file name, i.e.
# the '.tar.gz' suffix, so 'test.tar.gz' produces 'test.csv'.
results.to_csv(path + gz[:-7] + '.csv')

In [None]:
# Delete the temporary folder and all of its content (rm -r is
# recursive), now that the CSV file has been written.
__cmd.getoutput("rm -r %s" % path_temp)

## Testing

In [None]:
# Testing this code consists of copying a test.tar.gz file to the
# indicated path, running the notebook, and checking that a data frame
# with the results was created.
#
# The outputs above were produced when the notebook was tested.