## Dependencies

In [1]:
import random
import sys
import timeit
from collections import Counter
from collections import OrderedDict
from collections import deque, defaultdict

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize
from tqdm import tqdm_notebook as tqdm

### Global Variables

In [2]:
# Nominal cache capacity. The experiment scripts visible in this notebook pass
# explicit sizes to the simulators rather than reading this value.
cache_size = 100
# Number of top-ranked cache slots ML_policy keeps from each model prediction
# (initialised to 20% of cache_size).
eviction = int(0.2 * cache_size)  


def setGlobal(c, e):
    """Overwrite the module-level ``cache_size`` and ``eviction`` settings.

    c -- value stored in ``cache_size``
    e -- value stored in ``eviction``
    """
    global cache_size, eviction
    cache_size, eviction = c, e

### Files (Do not run unnecessarily)

In [5]:
def fileLoad(path, partition):
    """Read a space-separated trace file and return its leading block numbers.

    path      -- file to read; parsed without a header row and labelled with
                 the nine column names below
    partition -- fraction of the rows to keep, counted from the start

    Prints the number of block numbers kept and returns them as a list.
    """
    column_names = ['timestamp', 'pid', 'pname', 'blockNo',
                    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']

    records = pd.read_csv(path, sep=' ', header=None)
    records.columns = column_names

    block_numbers = records['blockNo'].tolist()
    keep = int(len(block_numbers) * partition)
    trace = block_numbers[:keep]

    print(len(trace))
    return trace

# Keep the first half (partition=0.5) of each blkparse trace; every call prints
# the number of requests kept. The experiment scripts below read these as the
# training and test traces.
train = fileLoad("DATA/cheetah.cs.fiu.edu-110108-113008.5.blkparse", 0.5)
test = fileLoad("DATA/cheetah.cs.fiu.edu-110108-113008.6.blkparse", 0.5)

11963425
11869759


## LFU

In [6]:
def LFU(blocktrace, frame):
    """Simulate a least-frequently-used cache and return its hit rate.

    blocktrace -- iterable of block identifiers in request order
    frame      -- cache capacity in blocks (must be positive)

    Returns hits / requests as a float, or 0.0 for an empty trace.
    Raises ValueError if ``frame`` is not positive.

    On a miss with a full cache the cached block with the smallest count is
    evicted; ties go to the block that entered the cache earliest. A block that
    re-enters the cache resumes with its overall request count.
    """
    if frame <= 0:
        raise ValueError("frame must be a positive cache size")

    # Count for every cached block; the keys double as the cache contents,
    # so no separate membership set is needed.
    cache_frequency = {}
    # Requests seen so far for every block, cached or not.
    frequency = defaultdict(int)

    hit, miss = 0, 0

    for block in tqdm(blocktrace, leave=False):
        frequency[block] += 1

        if block in cache_frequency:
            hit += 1
            cache_frequency[block] += 1
            continue

        miss += 1
        if len(cache_frequency) >= frame:
            # min() returns the first minimum in insertion order.
            victim = min(cache_frequency, key=cache_frequency.get)
            del cache_frequency[victim]
        cache_frequency[block] = frequency[block]

    total = hit + miss
    # Guard the empty-trace case instead of dividing by zero.
    return hit / total if total else 0.0

## LRU

In [7]:
def LRU(blocktrace, frame):
    """Simulate a least-recently-used cache and return its hit rate.

    blocktrace -- iterable of block identifiers in request order
    frame      -- cache capacity in blocks (must be positive)

    Returns hits / requests as a float, or 0.0 for an empty trace.
    Raises ValueError if ``frame`` is not positive.
    """
    if frame <= 0:
        raise ValueError("frame must be a positive cache size")

    # Keys are the cached blocks ordered from least to most recently used.
    # OrderedDict gives O(1) "move to most recent" and "drop oldest", where a
    # deque needs a linear scan to remove a block on every hit.
    recency = OrderedDict()
    hit, miss = 0, 0

    for block in tqdm(blocktrace, leave=False):
        if block in recency:
            recency.move_to_end(block)
            hit += 1
            continue

        miss += 1
        if len(recency) >= frame:
            # Evict the least recently used block (front of the ordering).
            recency.popitem(last=False)
        recency[block] = None

    total = hit + miss
    # Guard the empty-trace case instead of dividing by zero.
    return hit / total if total else 0.0

# Second Approach (Maharshi)

In [8]:
def belady_opt(blocktrace, frame):
    '''
    Simulate Belady's optimal replacement (evict the cached block whose next
    request is farthest away) and sample labelled eviction decisions.

    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    frame - int value for capacity of the cache (must be >= 1)

    OUTPUT
    ============
    (1) hitrate (float) - hits / requests, 0.0 for an empty trace
    (2) dataset (np.array of shape (n_samples, 3*frame + 1)) - one row per
        sampled eviction. For every cache slot, in slot order, the row holds
        the normalised block number, recency position and frequency; the
        last column is the index of the slot that was evicted.

    A row is recorded only for evictions that occur at request indexes
    999, 1999, 2999, ...

    Raises ValueError if frame < 1.
    '''
    if frame < 1:
        raise ValueError("frame must be a positive cache size")

    # Stand-in "next use" for blocks that are never requested again. It is
    # decremented after each use so every such block gets a distinct key.
    infinite_index = 100 * len(blocktrace)

    # block -> positions in blocktrace that have not been consumed yet
    block_index = defaultdict(deque)

    # next-use position -> cached block that will be requested there
    upcoming_index = {}

    # block -> number of times it has been requested so far
    frequency = defaultdict(int)

    # cached blocks ordered from least to most recently requested
    recency = list()

    # cache contents; a block's position is its slot index
    Cache = deque()

    # Sampled rows are collected here and stacked once at the end; calling
    # np.vstack inside the loop would recopy the whole dataset per sample.
    rows = []

    hit, miss = 0, 0

    def schedule_next_use(blk):
        # Register the next request position of `blk` (or a unique
        # "never again" placeholder) in upcoming_index.
        nonlocal infinite_index
        pending = block_index[blk]
        if pending:
            upcoming_index[pending.popleft()] = blk
        else:
            upcoming_index[infinite_index] = blk
            infinite_index -= 1

    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)

    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):

        frequency[block] += 1

        # Drop the current position if it is still queued for this block.
        pending = block_index[block]
        if pending and pending[0] == i:
            pending.popleft()

        if block in Cache:
            hit += 1

            # move the block to the most-recent end
            recency.remove(block)
            recency.append(block)

            # the next use that was scheduled for this block is happening now
            if i in upcoming_index:
                del upcoming_index[i]
                schedule_next_use(block)
            continue

        miss += 1

        if len(Cache) == frame:
            # cached block whose next request is farthest in the future
            max_index = max(upcoming_index)
            victim = upcoming_index[max_index]
            victim_slot = Cache.index(victim)

            if i % 1000 == 999:
                # Each feature vector is scaled by its own L2 norm.
                # NOTE: with frame == 1 the recency vector is [0], so this
                # division produces NaN.
                blockNo = np.array(list(Cache))
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(b) for b in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[b] for b in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1, frame*3)
                # label = slot index of the block Belady evicts
                rows.append(np.append(stack, victim_slot))

            # overwrite the victim's slot with the new block
            Cache[victim_slot] = block
            recency.remove(victim)
            del upcoming_index[max_index]
        else:
            Cache.append(block)

        recency.append(block)
        schedule_next_use(block)

    if rows:
        dataset = np.vstack(rows)
    else:
        dataset = np.array([]).reshape(0, 3*frame+1)

    # Guard the empty-trace case instead of dividing by zero.
    total = hit + miss
    hitrate = hit / total if total else 0.0

    return hitrate, dataset

### ML Functions

In [9]:
def LR(X, Y):
    """Fit a default-configured LogisticRegression on (X, Y) and return it."""
    classifier = LogisticRegression()
    classifier.fit(X, Y)
    return classifier

In [10]:
def KNN(X, Y):
    """Fit a 3-neighbour KNeighborsClassifier on (X, Y) and return it."""
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(X, Y)
    return classifier

In [11]:
def NN(X, Y):
    """Fit a default-configured MLPClassifier on (X, Y) and return it."""
    classifier = MLPClassifier()
    classifier.fit(X, Y)
    return classifier

In [12]:
def getAccuracy(model, X, Y):
    """Return whatever ``model.score(X, Y)`` reports for the given data."""
    accuracy = model.score(X, Y)
    return accuracy

## ML Policy

In [13]:
def ML_policy(blocktrace, frame, model):
    """Replay a trace through a cache whose evictions are picked by a classifier.

    blocktrace -- iterable of block identifiers in request order
    frame      -- cache capacity in blocks (must be >= 1)
    model      -- fitted classifier exposing ``predict_proba``; its class
                  labels are interpreted as cache slot indexes

    Reads the module-level ``eviction`` setting: each prediction keeps that
    many top-ranked slots (at least one), and those slots are consumed one per
    miss before the model is queried again.

    Prints and returns hits / requests (0.0 for an empty trace).
    Raises ValueError if frame < 1.
    """
    global eviction

    if frame < 1:
        raise ValueError("frame must be a positive cache size")

    # At least one candidate per prediction, otherwise nothing can be evicted.
    batch = max(int(eviction), 1)

    LFUDict = defaultdict(int)   # block -> requests seen so far
    LRUQ = []                    # cached blocks, least recently used first
    C = []                       # cache contents; list position = slot index
    evictCacheIndex = []         # pending slot indexes, best candidate first

    hit, miss = 0, 0

    for seq_number, block in enumerate(tqdm(blocktrace, desc="OPT")):
        LFUDict[block] += 1

        if block in C:
            hit += 1
            LRUQ.remove(block)
            LRUQ.append(block)
            continue

        miss += 1
        if len(C) == frame:
            if len(evictCacheIndex) == 0:
                # Per-slot features (block number, recency position,
                # frequency), each vector scaled by its own L2 norm.
                blockNo = np.asarray(C) / np.linalg.norm(C)
                recency_ = np.array([LRUQ.index(b) for b in C])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([LFUDict[b] for b in C])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1, frame*3)

                Y_pred_prob = model.predict_proba(stack)
                ranked = Y_pred_prob.argsort()[0][::-1][:batch]
                # predict_proba columns follow model.classes_, which skips
                # slots never seen as a training label, so a column index is
                # not necessarily a slot index. Translate when possible.
                classes = getattr(model, 'classes_', None)
                if classes is not None:
                    ranked = np.asarray(classes)[ranked]
                evictCacheIndex = [int(slot) for slot in ranked]

            evictPos = evictCacheIndex.pop(0)
            LRUQ.remove(C[evictPos])
            C[evictPos] = block
        else:
            C.append(block)
        LRUQ.append(block)

    # Guard the empty-trace case instead of dividing by zero.
    total = hit + miss
    hitrate = hit / total if total else 0.0
    print(hitrate)
    return hitrate

In [24]:
def script1(cache_array=(100, 10, 1000), out_path='Maharshi.txt'):
    """Append LRU and LFU hit rates on the train and test traces to a report.

    cache_array -- cache sizes to simulate, in order
    out_path    -- report file, opened in append mode

    Reads the module-level ``train`` and ``test`` traces.
    """
    # The with-block closes the report even if a simulation raises.
    with open(out_path, 'a+') as f:
        for i in cache_array:

            lru_train = LRU(train, i)
            lfu_train = LFU(train, i)
            lru_test = LRU(test, i)
            lfu_test = LFU(test, i)

            f.write('\nRun Default Algorithms')
            f.write('\n========================')
            f.write('\nCache Size = {}'.format(i))
            f.write('\nLRU Hitrate on TrainingData = {}'.format(lru_train))
            f.write('\nLFU Hitrate on TrainingData = {}'.format(lfu_train))
            f.write('\nLRU Hitrate on TestData = {}'.format(lru_test))
            f.write('\nLFU Hitrate on TestData = {}'.format(lfu_test))

In [30]:
def script2(cache_array=(100, 10, 1000), out_path='Maharshi.txt'):
    """Train LR, KNN and NN on Belady samples and append their results to a report.

    cache_array -- cache sizes to evaluate, in order
    out_path    -- report file, opened in append mode

    For each cache size: sample eviction decisions from the train and test
    traces with belady_opt, fit the three classifiers on the training samples,
    score them on the test samples, and replay the test trace through
    ML_policy with the NN and LR models.

    Reads the module-level ``train`` and ``test`` traces.
    """
    # The with-block closes the report even if a step raises.
    with open(out_path, 'a+') as f:
        f.write('\n\n')
        f.write('\nRun Script 2')
        f.write('\n===========================')

        for i in cache_array:

            trainopthitrate, trainData = belady_opt(train, i)
            testoptHitrate, testData = belady_opt(test, i)

            # Last column is the label (evicted slot); the rest are features.
            X_train = trainData[:, :-1]
            Y_train = trainData[:, -1].astype(int)

            X_test = testData[:, :-1]
            Y_test = testData[:, -1].astype(int)

            LR_ = LR(X_train, Y_train)
            LRAccuracy = getAccuracy(LR_, X_test, Y_test)

            KNN_ = KNN(X_train, Y_train)
            KNNAccuracy = getAccuracy(KNN_, X_test, Y_test)

            NN_ = NN(X_train, Y_train)
            NNAccuracy = getAccuracy(NN_, X_test, Y_test)

            NNHitrate = ML_policy(test, i, NN_)
            LRHitrate = ML_policy(test, i, LR_)

            f.write('\nCache = {}'.format(i))
            f.write('\nBelady Hitrate on TrainingData = {}'.format(trainopthitrate))
            f.write('\nBelady Hitrate on TestingData = {}'.format(testoptHitrate))
            f.write('\nLR Accuracy = {}'.format(LRAccuracy))
            f.write('\nLR Hitrate on TestingData = {}'.format(LRHitrate))
            f.write('\nNN Accuracy = {}'.format(NNAccuracy))
            # Report the NN hit rate here; the LR value used to be written twice.
            f.write('\nNN Hitrate on TestingData = {}'.format(NNHitrate))
            f.write('\nKNN Accuracy = {}'.format(KNNAccuracy))

        f.write('\n===================================\n\n')

In [31]:
def script3(approx_array=(0.05, 0.1, 0.2, 0.5, 0.7, 0.9), out_path='Maharshi.txt'):
    """Sweep the eviction batch size for a cache of 100 and append the results.

    approx_array -- fractions of the 100-block cache used as the eviction batch
    out_path     -- report file, opened in append mode

    LR and NN are fitted once on Belady samples from the training trace; for
    each fraction the module-level ``eviction`` is set to int(fraction * 100)
    and the test trace is replayed through ML_policy with both models.

    Side effect: ``eviction`` keeps the last value of the sweep on return.
    Reads the module-level ``train`` and ``test`` traces.
    """
    global eviction

    # The original called f.open(...) on an undefined name, which raises
    # NameError. The with-block also closes the report if a step raises.
    with open(out_path, 'a+') as f:
        f.write('Different Approx with same Cache size of 100')
        f.write('\n===========================================')

        # Only the training samples are needed here. The Belady pass over the
        # test trace was dropped because its result was never used.
        trainopthitrate, trainData = belady_opt(train, 100)

        X_train = trainData[:, :-1]
        Y_train = trainData[:, -1].astype(int)

        LR_ = LR(X_train, Y_train)
        NN_ = NN(X_train, Y_train)

        for i in approx_array:

            eviction = int(i*100)

            NNHitrate = ML_policy(test, 100, NN_)
            LRHitrate = ML_policy(test, 100, LR_)

            f.write('\n\nApproximate Eviction = {}'.format(i))
            f.write('\nLR Hitrate = {}'.format(LRHitrate))
            f.write('\nNN Hitrate = {}'.format(NNHitrate))

        f.write('\n===================================\n\n')
        

In [None]:
def script4(splits=(0.7, 0.5, 0.3), out_path='Maharshi.txt'):
    """Compare train/test split ratios for a cache of 100 and append the results.

    splits   -- test-set fractions passed to train_test_split
    out_path -- report file, opened in append mode

    Belady samples from the training trace are split at each ratio; LR, NN and
    KNN are fitted and scored on the split, then the test trace is replayed
    through ML_policy with the NN and LR models at cache size 100.

    Side effect: sets the module-level ``eviction`` to 20.
    Reads the module-level ``train`` and ``test`` traces.
    """
    global eviction

    eviction = int(0.2*100)

    # The original called f.open(...) on an undefined name, which raises
    # NameError. The with-block also closes the report if a step raises.
    with open(out_path, 'a+') as f:
        f.write('Different Train/Test Split with same Cache size of 100 and eviction approximation 20%')
        f.write('\n===========================================')

        trainopthitrate, trainData = belady_opt(train, 100)

        # Last column is the label (evicted slot); the rest are features.
        X = trainData[:, :-1]
        Y = trainData[:, -1].astype(int)

        for i in splits:
            print('Split = {}'.format(i))

            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=i, \
                                                        random_state=None, shuffle=True)
            print('LR...')
            LR_ = LR(X_train, Y_train)
            LRAccuracy = getAccuracy(LR_, X_test, Y_test)

            print('NN...')
            NN_ = NN(X_train, Y_train)
            NNAccuracy = getAccuracy(NN_, X_test, Y_test)

            print('KNN...')
            KNN_ = KNN(X_train, Y_train)
            KNNAccuracy = getAccuracy(KNN_, X_test, Y_test)

            print('Hitrate...')
            # The cache size is 100 throughout; the split fraction `i` used
            # to be passed here as the cache size.
            NNHitrate = ML_policy(test, 100, NN_)
            LRHitrate = ML_policy(test, 100, LR_)

            f.write('\n\nTestSize = {}'.format(i))
            f.write('\nLR Accuracy = {}'.format(LRAccuracy))
            f.write('\nKNN Accuracy = {}'.format(KNNAccuracy))
            f.write('\nNN Accuracy = {}'.format(NNAccuracy))
            f.write('\nLR Hitrate = {}'.format(LRHitrate))
            f.write('\nNN Hitrate = {}'.format(NNHitrate))

        f.write('\n======================================')

In [25]:
# Run the LRU/LFU baselines on the train and test traces for each cache size.
script1()

HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, max=11869759), HTML(value='')))



In [None]:
# Run the Belady sampling + classifier comparison for each cache size.
script2()

HBox(children=(IntProgress(value=0, description='buidling index', max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, description='sequence', max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, description='buidling index', max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, description='sequence', max=11869759), HTML(value='')))





HBox(children=(IntProgress(value=0, description='OPT', max=11869759), HTML(value='')))


0.04287820839496404


HBox(children=(IntProgress(value=0, description='OPT', max=11869759), HTML(value='')))


0.04283911745807139


HBox(children=(IntProgress(value=0, description='buidling index', max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, description='sequence', max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, description='buidling index', max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, description='sequence', max=11869759), HTML(value='')))



HBox(children=(IntProgress(value=0, description='OPT', max=11869759), HTML(value='')))


0.03993316123773027


HBox(children=(IntProgress(value=0, description='OPT', max=11869759), HTML(value='')))


0.03994992653178552


HBox(children=(IntProgress(value=0, description='buidling index', max=11963425), HTML(value='')))



HBox(children=(IntProgress(value=0, description='sequence', max=11963425), HTML(value='')))

In [None]:
# Run the eviction-batch-size sweep defined in script3.
script3()