In [None]:
import collections
import json
import os
import pandas as pd

Goal: take the acurite export files from v2 and create easily reviewed summaries and easily loaded monthly files.  

Output is split into directories for each property listed in `properties`. 
This decouples the files from the properties they record, allowing analysis of each property independently. 
In the property directory, the output is in a subdirectory for the `major` version of the output. 

# Collect files for cleaning (generic process)

Cleaning compares every file in the `inputDir` directory against `cleanedFiles`, which records each file that has been completely processed and the version of this notebook that last processed it. 

Depending on the settings of the current versions and the booleans `minorRerun` and `allRerun`, the system will not reprocess files that have been successfully cleaned. 

The versioning method is informed by [semantic versioning](https://en.wikipedia.org/wiki/Software_versioning#Degree_of_compatibility). When the major version of this cleaning notebook changes, the next run should load all the files and reprocess them, placing the output in a new subdirectory for each property. Future development may use the minor number for changes that are compatible with existing consumers of the cleaned files (for example, more data in the report or additional export files). The patch version may be used to keep track of development. 

## Merging details

The process loads each file into a dataframe, renames the columns to `inputCols` and drops all the columns in `dropCols`. Duplicate data -- all columns match -- is dropped.

### Assumptions

- That the file suffix `fileType` is an exact match (case sensitive)
- That the files are comma separated value files that can be directly read into a data frame.
- That the files have headers.


In [None]:
# ---------------------------------------------------------------------#
# Collect files for cleaning: initialization
# ---------------------------------------------------------------------#
# Version of this cleaning notebook, as major.minor.patch:
#   minor - downstream notebooks can use previous minor versions.
#   patch - only internal processing changes that should be tracked.
major, minor, patch = 0, 0, 0

# Rerun switches:
#   minorRerun - should files be reprocessed if the minor version is
#                different?
#   allRerun   - should all files be reprocessed no matter what versions?
minorRerun, allRerun = False, True

# Where the raw export files live and how they are recognised.
# Alternate input location kept from the original cell: '../Acurite.v2'
inputDir = "../DeleteMe/"
fileType = ".csv"

# JSON record of the files that have already been cleaned.
cleanedFilesRecord = "./acuritev2.json"

# Placeholder columns X00..X15 are discarded after the rename.
dropCols = ["X{:02d}".format(i) for i in range(16)]

# Column names applied, in order, to every input file.
inputCols = [
    "Name", "X00", "Timestamp",
    "Temperature_F", "Humidity_pct", "Dew_Point_F", "Heat_Index_F",
    "X01", "X02",
    "Pressure_inHg",
    "X03", "X04", "X05", "X06", "X07", "X08", "X09",
    "X10", "X11", "X12", "X13", "X14", "X15",
]


# Execution timestamp for this run: timezone-aware UTC, rendered as an ISO
# 8601 string. Timestamp.now(tz="UTC") is used instead of the deprecated
# Timestamp.utcnow(); both give a UTC-aware timestamp.
now = pd.Timestamp.now(tz="UTC").isoformat()

# Status records which version of this notebook processed a file, and when.
Status = collections.namedtuple(
    'Status',
    ['majorStatus', 'minorStatus', 'patchStatus', 'timestampStatus'],
)

# The status stamped on every file processed during this run.
currentStatus = Status(major, minor, patch, now)
def statusStmt(myStatus):
    """Format a status as ``"major.minor.patch timestamp"``.

    Parameters
    ----------
    myStatus : sequence
        A four-item status (major, minor, patch, timestamp), such as a
        ``Status`` namedtuple.

    Returns
    -------
    str
        The dotted version followed by a space and the timestamp stored
        in ``myStatus``.
    """
    # Use the timestamp carried by the status itself. Using the run's
    # global timestamp here would label every previously cleaned file
    # with the current time instead of the time it was processed.
    version = ".".join(str(part) for part in myStatus[:3])
    return version+" "+str(myStatus[3])

# Start the run report with the version and timestamp of this run; later
# cells append to this string. (The leftover debugging print of the
# report's type has been removed.)
report = "Status: "+statusStmt(currentStatus)+".\n\n"

# Cleaning the files


- Persist important details
  - first and last observations of sensors


In [None]:
# ---------------------------------------------------------------------#
# Cleaning the files: initialization
# ---------------------------------------------------------------------#
# Path of a JSON file for sensor history.
# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably meant to persist the first and last observations of each
# sensor -- confirm before relying on it.
sensorRecord = '../sensorHistory.json'



# Persisting the cleaned data 

- Create output directories, if needed, labeled with the major version.
- Convert the timestamp to a standard (ISO 8601) format.
- Separate on the `timestampCol` column into `period` blocks.
- A block is considered complete if there is a record in the first and last `subPeriod` in the period.
- Output with a new ISO standard timestamp `isoTimestampCol`
- Generate a run report.

In [None]:
# ---------------------------------------------------------------------#
# Persisting the cleaned data: initialization
# ---------------------------------------------------------------------#
# Root under which the output directories are created.
outputPath = "../"

# One general output directory is made for each of these properties.
properties = ["Temperature", "Humidity", "Pressure"]

# Timestamp columns: the input column (formatted like
# "2019/11/16 12:00 AM") and the name of the ISO-formatted output column.
timestampCol, isoTimestampCol = "Timestamp", "ISO_Timestamp"

# The data is split into `period` blocks; `subPeriod` is the unit used to
# judge whether a block is complete.
period, subPeriod = "month", "day"


In [None]:
# ---------------------------------------------------------------------#
# Collect files for cleaning: execution
# ---------------------------------------------------------------------#

# newFiles lists all the available input files, initially: every entry of
# `inputDir` whose name ends with `fileType` (case-sensitive match).
# The names are sorted because os.listdir returns entries in arbitrary
# order; sorting keeps the processing order and the report stable between
# runs. A generator is used so the builtin `input` is no longer shadowed.
newFiles = sorted(
    name for name in os.listdir(inputDir) if name.endswith(fileType)
)

report = report+str(len(newFiles))+" "+fileType+" files in the "+inputDir+" directory.\n\n"

# cleanedFiles: a dictionary where the key is the input file name and
# the value is a namedtuple Status of major, minor, patch, and
# execution timestamp.
#
# Only the "record file does not exist yet" case is handled here. Any
# other failure (unreadable file, malformed JSON, malformed status entry)
# propagates with its own traceback, so the former bare `except:` that
# only printed and re-raised is not needed.
try:
    with open(cleanedFilesRecord) as f:
        cleanedFiles = json.load(f)
except FileNotFoundError:
    # First run: nothing has been cleaned yet.
    cleanedFiles = {}
    report = report+"No cleaned files.\n"
else:
    report = report+str(len(cleanedFiles))+" files already processed:\n"
    for ff in cleanedFiles:
        # JSON stores each status as a plain list; rebuild the namedtuple.
        cleanedFiles[ff] = Status(*cleanedFiles[ff])
        report = report+ff+": \n\t"+statusStmt(cleanedFiles[ff])+"\n"
    
# If a file has been cleaned by an acceptable version of the processing
# script, it will be removed from the new files list.
#
# A previously cleaned file is treated as up to date when:
#   - it was cleaned under the current major version (the output lives in
#     a directory per major version), and
#   - `minorRerun` is off, or it was cleaned under a minor version at
#     least as new as the current one.
# Setting `allRerun` bypasses the check entirely.

#TODO put in stop processing controls
if allRerun:
    report = report+'\n\nNonetheless, all files will be rerun.\n\n'
else:
    # Build a new list rather than removing from `newFiles` while
    # iterating over it.
    pendingFiles = []
    for f in newFiles:
        previous = cleanedFiles.get(f)
        upToDate = (
            previous is not None
            and previous[0] == major
            and (not minorRerun or previous[1] >= minor)
        )
        if not upToDate:
            pendingFiles.append(f)
    newFiles = pendingFiles

# List the files that will be processed, whichever branch was taken.
newFilesReport = "".join("\t"+f+"\n" for f in newFiles)

report = report+"The following "+str(len(newFiles))+" files are included in the cleaning:\n"
report = report+newFilesReport

# Import each new file into a dataframe, relabel the columns, drop the
# undesired columns and add to a list.
newFilesDF = []

for f in newFiles:
    # os.path.join works whether or not `inputDir` ends with a separator.
    df = pd.read_csv(os.path.join(inputDir, f))
    df.columns = inputCols
    df = df.drop(columns=dropCols)
    cleanedFiles[f] = currentStatus
    newFilesDF.append(df)

# Concatenate the list of dataframes into one frame, removing dupes.
# pd.concat raises on an empty list, so when there is nothing to process
# an empty frame with the retained columns is used instead.
if newFilesDF:
    inputDF = pd.concat(newFilesDF, ignore_index=True)
else:
    inputDF = pd.DataFrame(columns=[c for c in inputCols if c not in dropCols])

report = report+"\nThere are "+str(len(inputDF))+" rows in the sum of all the files. "
inputDF = inputDF.drop_duplicates()
report = report+"After removing duplicates, there are "+str(len(inputDF))+" rows.\n"

# inputDF is the concatenation of all dataframes.
print(report)

In [None]:
# ---------------------------------------------------------------------#
# Persisting the cleaned data: execution
# ---------------------------------------------------------------------#
# Convert the timestamp column `timestampCol` to the timestamp format 
## inputDF[timestampCol] = pd.to_datetime(inputDF[timestampCol])

# TODO How to split into period blocks

# TODO How to verify period is complete

# By convention, the output is stored one directory up from this
# cleaning notebook in a directory named for the property measured,
# inside a subdirectory labelled with the major version.
versionDir = 'cleaned.v{}'.format(major)

# Paths are made absolute first because the os.makedirs documentation
# indicates a risk with using "../":
# https://docs.python.org/3/library/os.html?highlight=os%20makedirs#os.makedirs
outputDir = {
    prop: os.path.abspath(outputPath+prop)+"/"+versionDir
    for prop in properties
}

# Create each directory if it is missing; existing ones are left alone.
for target in outputDir.values():
    os.makedirs(target, exist_ok=True)

    
# TODO make final report
# Write the report to a text file named for the UTC date and time of the
# run, e.g. run20191116T0930.txt.
# In strftime, %m is the month and %M is the minute; the date part
# previously used %M and so put minutes where the month belongs.
# Timestamp.now(tz="UTC") replaces the deprecated Timestamp.utcnow().
reportStamp = pd.Timestamp.now(tz="UTC").strftime('%Y%m%dT%H%M')
with open('run'+reportStamp+'.txt', 'w') as f:
    f.write(report)

In [None]:
# ---------------------------------------------------------------------#
# Collect files for cleaning: persist success
# ---------------------------------------------------------------------#
# LAST CELL
# Record the completed files.
# Serialize before opening the record for writing: opening with 'w'
# truncates the file, so a serialization failure would otherwise leave an
# empty or partial record in place of the previous one.
cleanedFilesJson = json.dumps(cleanedFiles)
with open(cleanedFilesRecord, 'w') as outfile:
    outfile.write(cleanedFilesJson)