## Modules / Library

In [1]:
#Import modules / library
import os
import hashlib
import numpy as np
import matplotlib.pyplot as plt
import random
import string



## Initial Function List

In [2]:
#Function List

#For exploding file names into readable array (with index)
def explodeArray(fileName, replaceParams, splitParams):
    """Remove every occurrence of replaceParams from fileName, then split the rest on splitParams.

    Returns the list of pieces produced by the split.
    """
    return fileName.replace(replaceParams, '').split(splitParams)

#Return md5 checksum from specific file path
def md5(fname):
    """Return the hexadecimal MD5 digest of the file stored at fname.

    The file is opened in binary mode and consumed in 4096-byte pieces.
    """
    import hashlib
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        while True:
            piece = stream.read(4096)
            #An empty read marks the end of the file
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()

#Return first and second index of file in the exploded file array
def searchFileIndex(coorName, explodedFileArray):
    """Locate the rows of explodedFileArray whose field 3 equals coorName.

    Parameters:
        coorName: value compared against element [3] of every row.
        explodedFileArray: sequence of rows, each indexable at position 3.

    Returns:
        (firstIndex, secondIndex): index of the first matching row and index
        of the last matching row after it. A position without a match is
        reported as 0, so (0, 0) is returned when nothing matches.
    """
    #None (not 0) marks "no match yet": 0 is a valid row index, and using it as
    #the sentinel let a first match at row 0 be overwritten by the next match
    firstIndex = None
    secondIndex = 0
    for i, row in enumerate(explodedFileArray):
        if row[3] != coorName:
            continue
        if firstIndex is None:
            firstIndex = i
        else:
            secondIndex = i
    #Keep the historical "0 when not found" contract for callers
    if firstIndex is None:
        firstIndex = 0
    return firstIndex, secondIndex

#Return file size from specific file path
def fileSize(filePath):
    """Return the st_size value reported by os.stat for filePath."""
    import os
    return os.stat(filePath).st_size

#Return last modified mtime from specific file path
def lastModified(path):
    """Return the last-modification time (st_mtime from os.stat) of the file at path."""
    import os
    #Use the function's own parameter; the previous body read an undefined
    #name 'filePath' and therefore ignored the argument
    return os.stat(path).st_mtime

#Replace something (identifier) with blank
def replaceWithBlank(contents, identifier):
    """Return contents with every occurrence of identifier removed."""
    blank = ''
    cleaned = contents.replace(identifier, blank)
    return cleaned

#Shortening long precision coordinates to precision used in crawled data
def coordShortener(longCoord):
    """Round longCoord to four decimal places."""
    decimals = 4
    return round(longCoord, decimals)

#Generate array of date from initial date for x number of day(s)
def hourDateGenerator(year, month, day, numbOfDays):
    """Return one datetime per hour, starting at midnight of the given date.

    The list holds 24*numbOfDays entries, each one hour after the previous.
    """
    from datetime import datetime, timedelta
    start = datetime(year, month, day)
    totalHours = 24 * numbOfDays
    return [start + timedelta(hours=offset) for offset in range(totalHours)]

#Return Date With Hours Offset from Initial Date
def dateOffsetHours(year,month, day, hoursOffset):
    """Return midnight of the given date shifted by hoursOffset hours."""
    from datetime import datetime, timedelta
    base = datetime(year, month, day)
    shift = timedelta(hours=hoursOffset)
    return base + shift

#Generate alphanumerical random text for specific length
def getRandomString(length):
    """Return a string of `length` characters drawn from ASCII letters and digits.

    Characters are picked one at a time with random.choice.
    """
    import random
    import string
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

#Show folder entries in specific folder path
def folderEntries(folderPath):
    """Return the names of all entries that os.scandir yields for folderPath."""
    import os
    with os.scandir(folderPath) as listing:
        return [item.name for item in listing]
    
#Check if there are duplicate entries in Array, and return clean array of No Duplicate data
def checkAndReturnNoDuplicatesArray(someArray):
    """Split someArray into first occurrences and repeated occurrences.

    Returns (unique, repeats): unique keeps each distinct item once, in order
    of first appearance; repeats collects every later re-occurrence in order.
    """
    unique = []
    repeats = []
    for item in someArray:
        #An item already kept once goes to the repeats list instead
        destination = repeats if item in unique else unique
        destination.append(item)
    return unique, repeats

def openFile(filePath):
    """Read the file at filePath in text mode and return its whole content as one string."""
    with open(filePath, 'r') as handle:
        return handle.read()






## Advanced Function

In [3]:
#Advanced Function

#Exploit file name to extract latitude and longitude
def exploitLatLong(fileName):
    """Extract the two comma-separated coordinate strings embedded in a crawled file name.

    Parameters:
        fileName: bracket-delimited file name; it is split with
            explodeArray(fileName, ']', '[').

    Returns:
        (first, second): the two text values found on either side of the
        comma in the coordinate field, returned as strings.

    Raises:
        IndexError: if the name has fewer than four bracket fields or the
            coordinate field holds no comma.
    """
    #Break the name into its bracketed fields
    fields = explodeArray(fileName, ']', '[')
    #Field 3 is the coordinate field read by searchFileIndex and
    #convertToCoordinatesArrays - NOTE(review): confirm against real file names.
    #The previous body called an undefined 'splitToArray' on the whole list.
    coordinates = explodeArray(fields[3], '', ',')
    return coordinates[0], coordinates[1]

#Convert an array of file names into two arrays of coordinates (first occurrence & repeated occurrence) + an array of all exploded file names
def convertToCoordinatesArrays(fileNameArray):
    """Explode every file name and sort its coordinate field into first-seen / seen-again lists.

    Returns (firstSeen, seenAgain, exploded):
        firstSeen: field 3 of each exploded name, kept the first time it appears.
        seenAgain: field 3 values that were already present in firstSeen.
        exploded: the exploded form of every file name, in input order.
    """
    firstSeen = []
    seenAgain = []
    exploded = []
    for name in fileNameArray:
        #Split the name into its bracketed fields
        parts = explodeArray(name, ']', '[')
        exploded.append(parts)
        coordinate = parts[3]
        target = seenAgain if coordinate in firstSeen else firstSeen
        target.append(coordinate)
    return firstSeen, seenAgain, exploded

#Return crawled file statistics as a dict (including the coordinates that need to be retried)
def crawledFileStatistics(folderPath, arrayOfFiles, explodedArrayOfFiles, arrayOfCoordinates, refFileSize, onLandFileSize=24185):
    """Collect MD5 / file-size statistics for the two crawled samples of every coordinate.

    For each coordinate the two matching files are located with
    searchFileIndex, then hashed (md5) and measured (fileSize). A coordinate
    is reported for retry when its two samples differ in MD5 or when either
    sample's size reaches refFileSize.

    Parameters:
        folderPath: prefix concatenated directly in front of each file name.
        arrayOfFiles: file names, index-aligned with explodedArrayOfFiles.
        explodedArrayOfFiles: exploded file names (field 3 is matched against
            each coordinate).
        arrayOfCoordinates: coordinates to analyse.
        refFileSize: sizes greater than or equal to this are flagged as anomalies.
        onLandFileSize: exact size in bytes counted as an on-land file in
            sample 1; defaults to 24185, the value that used to be hard-coded.

    Returns:
        dict with the inputs (keys starting with '_input') and the computed
        statistics. 'percentageOnLand' is 0.0 when no coordinate was analysed.
    """
    md5CC1 = []
    md5CC2 = []
    fileSize1 = []
    fileSize2 = []

    #Fill MD5 and file size data from the 2 samples for later analysis
    for coordinate in arrayOfCoordinates:
        #The previous body called an undefined 'searchFile'
        sResultIdx1, sResultIdx2 = searchFileIndex(coordinate, explodedArrayOfFiles)
        fileName1 = folderPath + arrayOfFiles[sResultIdx1]
        fileName2 = folderPath + arrayOfFiles[sResultIdx2]
        md5CC1.append(md5(fileName1))
        md5CC2.append(md5(fileName2))
        fileSize1.append(fileSize(fileName1))
        fileSize2.append(fileSize(fileName2))

    #Indexes whose two samples have different MD5
    differentMD5 = [i for i, (first, second) in enumerate(zip(md5CC1, md5CC2)) if first != second]

    #Indexes with a file size anomaly; sample 1 must read the fileSize1 list
    #(the previous body subscripted the fileSize function itself)
    fileSizeAnomaly1 = [i for i, size in enumerate(fileSize1) if size >= refFileSize]
    fileSizeAnomaly2 = [i for i, size in enumerate(fileSize2) if size >= refFileSize]

    #Union of the MD5 and file size findings, without duplicates, keeping order:
    #MD5 differences first, then sample 1 anomalies, then sample 2 anomalies
    indexToRetry = differentMD5.copy()
    for idx in fileSizeAnomaly1 + fileSizeAnomaly2:
        if idx not in indexToRetry:
            indexToRetry.append(idx)

    #Conversion from index-based error reporting to coordinates to retry
    coordinatesToRetry = [arrayOfCoordinates[idx] for idx in indexToRetry]

    #Count on-land coordinates (identified by their exact file size in sample 1)
    onLandCount = fileSize1.count(onLandFileSize)

    #Guard the percentage against an empty coordinate list
    percentageOnLand = onLandCount/len(fileSize1)*100 if fileSize1 else 0.0

    result = {
        '_inputFolderPath': folderPath,
        '_inputArrayOfFiles': arrayOfFiles,
        '_inputExplodedArrayOfFiles': explodedArrayOfFiles,
        '_inputArrayOfCoordinates': arrayOfCoordinates,
        '_inputRefFileSize': refFileSize,
        'dataLength': len(arrayOfFiles),
        'coordinatesLength': len(arrayOfCoordinates),
        'dataOnLand': onLandCount,
        'percentageOnLand': percentageOnLand,
        'md5_1': md5CC1,
        'md5_2': md5CC2,
        'fileSize_1': fileSize1,
        'fileSize_2': fileSize2,
        'diffMD5_idx': differentMD5,
        'fileSizeAnomaly_1': fileSizeAnomaly1,
        'fileSizeAnomaly_2': fileSizeAnomaly2,
        'indexToRetry': indexToRetry,
        'coordinatesToRetry': coordinatesToRetry,
    }

    return result

#Return True or False for a specific file (based on its file size)
def ifProper(folderPath, fileName):
    """Tell whether the file folderPath+fileName is below the 24185-byte limit.

    Returns False when the file size is 24185 bytes or more, True otherwise.
    """
    #Merge address and measure the file
    sizeInBytes = fileSize(folderPath + fileName)
    return not sizeInBytes >= 24185
    
#Return File Metadata from inputted FileName & Folder location
def convertToMetadata(folderPath, fileName, startYear=2020, startMonth=10, startDay=1, numberOfDays=15):
    """Build the metadata dict (including the parsed data values) for one crawled file.

    The file content is stripped of its header and of the per-line
    "coordinates + timestamp" prefix for every hour in the date window; what
    remains on each line is converted to float.

    Parameters:
        folderPath: prefix concatenated directly in front of fileName.
        fileName: crawled file name; the coordinates are taken from it with
            exploitLatLong.
        startYear, startMonth, startDay: first day of the crawled window
            (defaults are the previously hard-coded 2020-10-01).
        numberOfDays: length of the crawled window in days (default 15).

    Returns:
        dict with a single key '<latitude>,<longitude>' mapping to the file's
        metadata; 'tidesData' holds the parsed float values.

    Raises:
        ValueError: if the coordinates taken from the file name are not numeric.

    NOTE(review): the '*_crawl' keys deliberately keep the original swapped
    assignment (lat_crawl <- longitude, long_crawl <- latitude); presumably the
    crawled files list the coordinates in the opposite order - confirm.
    """
    #Merge address
    filePath = folderPath + fileName

    #Open file content to strip it down to the data values
    eliminate = openFile(filePath)

    #Coordinates come back from the file name as text; convert before rounding
    #(round() on a str raises TypeError)
    latitude, longitude = exploitLatLong(fileName)
    latShort = coordShortener(float(latitude))
    longShort = coordShortener(float(longitude))

    #Initial Condition (for replacing header(s))
    header = ' \n     Lat       Lon        yyyy-mm-dd hh:mm:ss (UTC)     z(m)\n \n'
    dSpace1 = '     '
    dSpace2 = '   '
    dSpace3 = '     '
    dSpace4_pos = '     '
    dSpace4_neg = '    '
    dateInit = hourDateGenerator(startYear, startMonth, startDay, numberOfDays)

    #Remove Header
    eliminate = replaceWithBlank(eliminate, header)

    #Remove the coordinate/timestamp prefix of every line. The coordinate part
    #is identical for all lines, so it is built once outside the loop.
    linePrefix = dSpace1 + str(longShort) + dSpace2 + str(latShort) + dSpace3
    for stamp in dateInit:
        stampedPrefix = linePrefix + str(stamp)
        #Longer (positive value) spacing first: the shorter pattern is a prefix of it
        eliminate = replaceWithBlank(eliminate, stampedPrefix + dSpace4_pos)
        eliminate = replaceWithBlank(eliminate, stampedPrefix + dSpace4_neg)

    #Convert Data to Array of Text; drop the last item (blank data) to
    #prevent a float conversion error
    textValues = eliminate.split('\n')[:-1]

    #Convert data from Text to Float; unparsable lines are reported and skipped
    dataInFloat = []
    for i, text in enumerate(textValues):
        try:
            dataInFloat.append(float(text))
        except ValueError:
            print('error', ' on line', i)

    #Constructing Metadata
    metadata = {
        latitude + ',' + longitude: {
            'filename': fileName,
            'md5': md5(filePath),
            'lastModified': lastModified(filePath),
            'fileSize': fileSize(filePath),
            'latitude': latitude,
            'longitude': longitude,
            'lat_crawl': longitude,
            'long_crawl': latitude,
            'lat_crawl_short': longShort,
            'long_crawl_short': latShort,
            'start_date': dateOffsetHours(startYear, startMonth, startDay, 0),
            #dateOffsetHours expects hours, so the day count is converted
            'end_date': dateOffsetHours(startYear, startMonth, startDay, numberOfDays * 24),
            'date_range': numberOfDays,
            'tidesData': dataInFloat
        }
    }

    return metadata


#Comparing and Append Data if Not Duplicated (Coordinates)
def compareAppendCoor(dataArray, insert):
    """Return dataArray itself when it already holds insert, otherwise a new list with insert appended."""
    if insert in dataArray:
        return dataArray
    return dataArray + [insert]

#Make new Database

#Write to Table
def writeTb(dbPath, database, password, data):
    """Overwrite the table file referenced by the database JSON file with data.

    Parameters:
        dbPath: prefix concatenated directly in front of the database file
            name and of the table location.
        database: file name of the JSON database description.
        password: key of the entry inside the database JSON; the entry's
            'tablePath' and 'table' values locate the table file.
        data: text written to the table file (previous content is replaced).

    Raises:
        KeyError: if password or the expected entry fields are missing.
        OSError: if either file cannot be opened.
    """
    import json
    #Constructing Path
    dbRead = dbPath + database
    with open(dbRead, 'r') as readDb:
        db = json.load(readDb)

    #Write to table; the context manager guarantees the file is flushed and
    #closed (the previous 'f.close' without parentheses never closed it)
    entry = db[password]
    tbPath = dbPath + entry['tablePath'] + entry['table']
    with open(tbPath, 'w') as tableFile:
        tableFile.write(data)

#Read Table (located through the database file)
def readTb(dbPath, database, password):
    """Load and return the JSON content of the table referenced by the database file.

    Parameters:
        dbPath: prefix concatenated directly in front of the database file
            name and of the table location.
        database: file name of the JSON database description.
        password: key of the entry inside the database JSON; the entry's
            'tablePath' and 'table' values locate the table file.

    Returns:
        The object decoded by json.load from the table file.

    Raises:
        KeyError: if password or the expected entry fields are missing.
        OSError: if either file cannot be opened.
    """
    import json
    #Constructing Path
    dbRead = dbPath + database
    with open(dbRead, 'r') as readDb:
        db = json.load(readDb)

    #Open table; the previous body opened an undefined name 'tbRead' instead
    #of the path computed here
    entry = db[password]
    tbPath = dbPath + entry['tablePath'] + entry['table']
    with open(tbPath, 'r') as tableFile:
        tb = json.load(tableFile)

    return tb
    
    


## Initial Condition

In [4]:
#Initial Condition

#Folder holding the crawled (raw) data files
rawDataPath = '../tides/test1/'

#Processed database: root folder, entry key, table folder and file names
dbPath = '../tides/'
password = 'OlrseTDW5Q0IINlQMtAWLqA9kugyWB'
tablePath = 'YiJzexGqyHKyfYZpvL5b6vLp2Q8Od7/'
database = 'RiqOYsHMvGfAhuL7NQjttQFR2dXV2R.json'
table = 'yJ1MKQlIAeG5osozY1mhIJzGyL0eMI.json'

#Coefficient: reference file size, in bytes
referenceFileSize = 25000

#Replaced header


