In [315]:
import math
import pandas as pd
import pickle
import os
import copy
import re
import random

In [None]:
df = pd.read_csv('data.csv')

In [2]:
def getRMSE(predictions, measures):
    if(len(predictions)!=len(measures)):
        raise ValueException('The array size of predictions and measures should be the same')
    n = len(predictions)
    squareErr = 0
    for idx in range(n):
        squareErr += (predictions[idx]-measures[idx])**2
    print(squareErr)
    squareErr = math.sqrt(squareErr/float(n))
    return squareErr

In [3]:
def getMAE(predictions, measures):
    if(len(predictions)!=len(measures)):
        raise ValueException('The array size of predictions and measures should be the same')
    n = len(predictions)
    MAE = 0
    for idx in range(n):
        MAE += abs(predictions[idx] - measures[idx])
    MAE /= n
    return MAE
    

In [4]:
def convertTimeforExperiment(time):
    return time.split(' ')[0]

In [5]:
def getPathDepth(directory='.'):
    MAX_DEPTH = 0
    for root, dirs, files in os.walk(directory):
        MAX_DEPTH = max(MAX_DEPTH, root.count(os.sep))
    return MAX_DEPTH

In [565]:
class FileNode:
    def __init__(self, folderName, fileName, nodeData, state=1, nextNode=None, restore=False):
        self.__state = state
        self.__fileName = fileName
        self.__filePointer = folderName + '\\' + fileName
        if not restore:
            self.__dumpData(folderName, nodeData)
            
    def attemptReplace(self):
        if self.__state == 1:
            self.__state = 0
            return False
        elif self.__state == 0:
            self.__state = -1
            return True
        else:
            raise ValueError('FileNode[{0}] in invalid state {1}'.format(self.__fileName,self.__state))
          
    def updateFileNode(self, newNode):
        self.__filePointer = newNode.__filePointer
        self.__fileName = newNode.__fileName
        self.__state = 1
        return self.getFileInfo()

    def getFileInfo(self):
        return self.__fileName, self.__filePointer


    def getData(self):
        self.__state = 1
        with open(self.__filePointer, 'r') as f:
            return pickle.load(f)
        
    def __dumpData(self, folderName, data):
        with open('{0}/{1}'.format(folderName, self.__fileName), 'w') as f:
            pickle.dump(data, f)

In [583]:
class FolderSystem:

    def __init__(self, folderName, cacheLimit, restore = True):
        #print('Constructing folder [{0}] with cacheLimit [{1}]'.format(folderName, cacheLimit))
        if (cacheLimit<10):
            raise ValueError('Please enter a limit larger than 10...')
        #print("Constructing foldersystem for [{0}]".format(folderName))
        self.__folderName = folderName
        self.__fileNodes = []
        self.__fileNodesDict = dict()
        self.__nodeNumLimit = cacheLimit
        self.__numofNode = 0
        self.__nodePointer = 0
        if restore:
            self.__folderBuilder(limit = cacheLimit)
        self.__saveFolderSystem()
        
    def __folderBuilder(self, limit = 100):
        self.__cleanUpExcessFile(limit)
        for root, _, files in os.walk("."): 
            if(self.__folderName == root):
                for fileName in files:
                    if "cacheFolderSystem" in fileName:
                        continue
                    self.addNode(fileName, None, restore = True)

    def __cleanUpExcessFile(self, limit):
        for root, _, files in os.walk("."): 
            if(self.__folderName == root):
                for idx, fileName in enumerate(files):
                    if "cacheFolderSystem" in fileName:
                        limit += 1
                        continue
                    if idx >= limit:
                        os.remove(self.__folderName + '\\' + fileName)
    
    def addNode(self, fileName, nodeData, restore = False):
        if fileName in self.__fileNodesDict:
            return ('keep', self.__fileNodesDict[fileName])
        elif self.__numofNode < self.__nodeNumLimit:
            newNode = FileNode(self.__folderName, fileName, nodeData, restore = restore)
            self.__fileNodes.append(newNode)
            self.__fileNodesDict[newNode.getFileInfo()[0]] = newNode
            self.__numofNode += 1
            self.__saveFolderSystem()
            return ('add', copy.deepcopy(newNode))
        elif self.__numofNode >= self.__nodeNumLimit:
            return self.__replaceNode(fileName, nodeData)
        else:
            raise Error('Unexpected Error - __numofNode >= __nodeNumLimit')
            
    def __replaceNode(self, fileName, nodeData):
        while(True):
            currentNode = self.__fileNodes[self.__nodePointer]
            if(currentNode.attemptReplace()):
                replacedNode = self.__fileNodesDict.pop(currentNode.getFileInfo()[0])
                newNode = FileNode(self.__folderName, fileName, nodeData)
                self.__fileNodesDict[newNode.getFileInfo()[0]] = newNode
                self.__fileNodes[self.__nodePointer] = newNode
                os.remove(self.__folderName + '\\' + replacedNode.getFileInfo()[0])
                self.__nodePointer = (self.__nodePointer+1)%self.__nodeNumLimit
                self.__saveFolderSystem()
                return ('replace', copy.deepcopy(replacedNode))
            else:
                self.__nodePointer = (self.__nodePointer+1)%self.__nodeNumLimit
        raise Error('Unknown Error in __replaceNode...')
    
    def getFileNodesByName(self, filename):
        return self.__fileNodesDict[filename]
    
    def __saveFolderSystem(self):
        with open('{0}/cacheFolderSystem-{1}.pkl'.format(self.__folderName, re.sub(r'\W+', '', self.__folderName)), 'w') as f:
            pickle.dump(self, f)
    
    def updateCacheLimit(self, newCacheLimit):
        if (newCacheLimit < 10):
            raise ValueError("The new cache limit should be larger than or equal to {0}".format(10))
        elif(newCacheLimit >= self.__numofNode):
            self.__nodeNumLimit = newCacheLimit
        elif(newCacheLimit<self.__numofNode):
            self.__cleanUpExcessFile(newCacheLimit)
            self.__nodeNUmLimit = newCacheLimit
        else:
            raise Exception("Unexpected error in updateCacheLimit()")
    
    def getNumofNode(self):
        return self.__numofNode
    
    def getNodeNumLimit(self):
        return self.__nodeNumLimit

In [588]:
class CacheSystem:
    @staticmethod
    def loadCacheSystem():
        if(os.path.exists('cacheSystemState.pkl')):
            print("Loaded saved cacheSystem in the directory: {0}".format(os.getcwd()+'\\'+"cacheSystemState.pkl"))
            with open('cacheSystemState.pkl', 'r') as f:
                return pickle.load(f)
        else:
            print("The cacheSystemState does not exist...")
            print("Calling the state builder to restore the state...")
            return CacheSystem()
        
    def __init__(self, cacheLimit = 5000):
        print("Initializing cachesystem...")
        if getPathDepth() > 1:
            raise ValueError('The folder structure is incorrect: only allow directory with a depth of 1...')
        self.__folderSystemDict = dict()
        self.__cachedFile = dict()
        self.__folderCacheLimit = cacheLimit
        self.__restoreFolderSystem(cacheLimit)
        self.__saveCacheSystem()
        self.printStatus()
        
    def __restoreFolderSystem(self, cacheLimit):
        print("Restoring folder system...")
        for folderName, _, files in os.walk("."): 
            if folderName == ".":
                continue
            self.__folderSystemDict[folderName] = FolderSystem(folderName, cacheLimit, restore = True)
            self.__cachedFile[folderName] = dict()
        print("Generated folder system...")
        for folderName, _, files in os.walk("."): 
            if folderName == ".":
                continue
            for fileName in files:
                if "cacheFolderSystem" in fileName:
                    continue
                self.__cachedFile[folderName][fileName] = self.__folderSystemDict[folderName].getFileNodesByName(fileName)       


    def addFile(self,folderName, fileName, nodeData):
        currentFolderPointer = self.__createFolder(folderName)
        addNodeResult = currentFolderPointer.addNode(fileName, nodeData)
        if(addNodeResult[0]=='replace'):
            print('File [{0}] in directory [{1}] is being replace...'.format(addNodeResult[1].getFileInfo()[0], folderName))
            del self.__cachedFile[folderName][addNodeResult[1].getFileInfo()[0]]
            self.__cachedFile[folderName][fileName] = addNodeResult[1]
        elif(addNodeResult[0]=='keep'):
            print('File [{0}] in directory [{1}] is already exist...'.format(addNodeResult[1].getFileInfo()[0], folderName))
            pass
        elif(addNodeResult[0]=='add'):
            print('File [{0}] in directory [{1}] is added...'.format(addNodeResult[1].getFileInfo()[0], folderName))
            self.__cachedFile[folderName][fileName] = addNodeResult[1]
        self.__saveCacheSystem()
  
    def __createFolder(self, folderName):
        if(not os.path.exists(folderName)):
            os.makedirs(folderName)
            newFolderSystem = FolderSystem(folderName, self.__folderCacheLimit)
            self.__folderSystemDict[folderName] = newFolderSystem
            self.__cachedFile[folderName] = dict()
            print("The folder did not exist. A new folder [{0}] is created...".format(folderName))
        currentFolderPointer = self.__folderSystemDict[folderName]
        return currentFolderPointer
        
        
    def getFile(self, folderName, fileName):
        if (folderName in self.__cachedFile) and (fileName in self.__cachedFile[folderName]):
            return self.__cachedFile[folderName][fileName].getData()
        else:
            raise ValueError("File [{0}] in directory [{1}] not found...".format(fileName, folderName))
    
    def setFolderCacheLimit(self, folderName, newCacheLimit):
        if (folderName in self.__cachedFile):
            self.__folderSystemDict[folderName].updateCacheLimit(newCacheLimit)
        else:
            raise ValueError("Folder not found...".format(folderName))
        self.__saveCacheSystem()
    
    def __saveCacheSystem(self):
        with open('cacheSystemState.pkl', 'w') as f:
            pickle.dump(self, f)
    
    def printStatus(self):
        print('-------------------Cache System Status---------------------')
        for folderName, folder in self.__folderSystemDict.items():
            print("FolderName: {0} | FolderLimit: {1} | Number of node in folder: {2}".format(folderName, folder.getNodeNumLimit(), folder.getNumofNode()))
        print('-----------------------------------------------------------')


In [589]:
cache = CacheSystem()

Initializing cachesystem...
Restoring folder system...
Generated folder system...
-------------------Cache System Status---------------------
FolderName: .\data | FolderLimit: 5000 | Number of node in folder: 10
FolderName: .\Trading_table1 | FolderLimit: 5000 | Number of node in folder: 6
FolderName: .\Trading_table | FolderLimit: 5000 | Number of node in folder: 10
FolderName: .\.ipynb_checkpoints | FolderLimit: 5000 | Number of node in folder: 1
FolderName: .\data2 | FolderLimit: 5000 | Number of node in folder: 10
-----------------------------------------------------------


In [574]:
cache.setFolderCacheLimit('.\\data2', 10)

In [563]:
cache = CacheSystem.loadCacheSystem()

Loaded saved cacheSystem in the directory: C:\Users\Henry\Desktop\Credit Suisse\testfile\cacheSystemState.pkl


In [575]:
cache.printStatus()

------------------------------
FolderName: .\data | FolderLimit: 5000 | Number of node in folder: 10
FolderName: .\Trading_table | FolderLimit: 5000 | Number of node in folder: 10
FolderName: .\.ipynb_checkpoints | FolderLimit: 5000 | Number of node in folder: 1
FolderName: .\data2 | FolderLimit: 5000 | Number of node in folder: 50
------------------------------


In [601]:
'.\henry'.split('\\')

['.', 'henry']

In [599]:
for root, folders, files in os.walk("."): 
    print('root: [{0}] | folders: [{1}] | files: [{2}]'.format(root, folders, files))


root: [.] | folders: [['.ipynb_checkpoints', 'data', 'data2', 'Trading_table', 'Trading_table1']] | files: [['cacheSystemState.pkl', 'data.csv', 'Untitled.ipynb']]
root: [.\.ipynb_checkpoints] | folders: [[]] | files: [['cacheFolderSystem-ipynb_checkpoints.pkl', 'cacheFolderSystem.pkl', 'Untitled-checkpoint.ipynb']]
root: [.\data] | folders: [[]] | files: [['2014-01-01.pkl', '2014-01-02.pkl', '2014-01-03.pkl', '2014-01-04.pkl', '2014-01-05.pkl', '2014-01-06.pkl', '2014-01-09.pkl', '2014-01-10.pkl', '2014-01-11.pkl', '2014-01-12.pkl', 'cacheFolderSystem-data.pkl']]
root: [.\data2] | folders: [['data3']] | files: [['2014-01-01 - Copy (2) - Copy.pkl', '2014-01-01 - Copy (2).pkl', '2014-01-01 - Copy (3) - Copy.pkl', '2014-01-01 - Copy (3).pkl', '2014-01-01 - Copy (4) - Copy.pkl', '2014-01-01 - Copy (4).pkl', '2014-01-01 - Copy (5) - Copy.pkl', '2014-01-01 - Copy (5).pkl', '2014-01-01 - Copy (6) - Copy.pkl', '2014-01-01 - Copy (6).pkl', 'cacheFolderSystem-data2.pkl']]
root: [.\data2\data3] 