In [8]:
from tqdm import tqdm_notebook as tqdm 
import numpy as np
from collections import deque, defaultdict
import timeit
import pandas as pd
import random
from collections import Counter
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import sys

### Global Variables

In [22]:
def resetGlobal():
    """Restore the notebook's shared module-level state to its defaults.

    Rebinds the globals ``maxpos``, ``num_params``, ``cache_size``,
    ``eviction``, the four LRU/LFU agreement counters, and the sample
    matrices ``X`` (shape ``(0, num_params)``) and ``Y`` (shape ``(0, 1)``).
    ``sampling_freq`` is declared global but is not assigned here.
    """
    global maxpos, num_params, cache_size, sampling_freq, eviction, lruCorrect, lruIncorrect
    global lfuCorrect, lfuIncorrect, X, Y

    # Agreement counters updated by lruPredict / lfuPredict.
    lruCorrect = lruIncorrect = 0
    lfuCorrect = lfuIncorrect = 0

    # Default cache geometry: 100 slots, 20% of them labelled per sample.
    cache_size = 100
    eviction = int(0.2 * cache_size)

    # Large starting key that belady_opt counts down from.
    maxpos = 1000000000000

    # Number of feature columns per cached block.
    num_params = 3

    # Empty containers that populateData appends to.
    X = np.zeros((0, num_params), dtype=np.int64)
    Y = np.zeros((0, 1), dtype=np.int64)


### Files (Do not run unnecessarily)

In [13]:
def fileLoad(path, partition):
    """Read a space-separated trace file and return a leading slice of its block numbers.

    Parameters
    ----------
    path : str
        Location of a header-less, space-delimited file with nine columns.
    partition : float
        Fraction of the rows to keep, counted from the start of the file.

    Returns
    -------
    list
        The first ``int(rows * partition)`` values of the ``blockNo`` column.
        The length of this list is also printed.
    """
    column_names = ['timestamp', 'pid', 'pname', 'blockNo',
                    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
    frame = pd.read_csv(path, sep=' ', header=None)
    frame.columns = column_names

    block_numbers = frame['blockNo'].tolist()
    keep = int(len(block_numbers) * partition)
    trace = block_numbers[:keep]

    print(len(trace))
    return trace

# Load the leading 10% (partition=0.1) of the block numbers from each trace file.
# fileLoad prints the number of entries kept for each file.
train = fileLoad("DATA/cheetah.cs.fiu.edu-110108-113008.5.blkparse", 0.1)
test = fileLoad("DATA/cheetah.cs.fiu.edu-110108-113008.6.blkparse", 0.1)

2392685
2373951


## Helper Functions

In [32]:
def lruPredict(C,LRUQ,Y_OPT):
    """Label the blocks LRU would evict and score the labels against ``Y_OPT``.

    Parameters
    ----------
    C : sequence
        Cached blocks; the returned labels follow this order.
    LRUQ : sequence
        Recency queue; index 0 is treated as the stalest entry.
    Y_OPT : sequence of int
        Reference 0/1 labels, one per entry of ``C``.

    Returns
    -------
    list of int
        1 for each block of ``C`` among the ``eviction`` stalest entries of
        ``LRUQ``, otherwise 0.

    Side effects
    ------------
    Adds the number of matching / non-matching labels to the globals
    ``lruCorrect`` / ``lruIncorrect``. Reads the global ``eviction``.
    """
    global lruCorrect, lruIncorrect

    # Score each queued block by staleness: the front of the queue scores highest.
    staleness = Counter({block: len(LRUQ) - pos for pos, block in enumerate(LRUQ)})
    evict_candidates = {block for block, _ in staleness.most_common(eviction)}

    Y_current = [1 if block in evict_candidates else 0 for block in C]

    for i, predicted in enumerate(Y_current):
        # Compare by value: an identity check (`is`) only works for interned
        # Python ints and is always False for NumPy integer labels.
        if predicted == Y_OPT[i]:
            lruCorrect += 1
        else:
            lruIncorrect += 1
    return Y_current

# Returns the sequence of blocks in priority order.

def Y_getBlockSeq(Y_pred_prob):
    """Return the row indices with the smallest first-column values.

    Takes the first entry of every row of ``Y_pred_prob``, sorts the row
    indices by that value in ascending order, and keeps the first
    ``eviction`` of them (``eviction`` is a module-level global).
    """
    first_column = np.array([row[0] for row in Y_pred_prob])
    ascending = np.argsort(first_column)
    return ascending[:eviction]


def Y_getMinPredict(Y_pred_prob):
    """Build a 0/1 vector marking the ``eviction`` rows with the smallest first entry.

    The first entry of each row of ``Y_pred_prob`` is partitioned with
    ``np.argpartition`` around position ``eviction`` (a module-level global);
    the indices that land in front of that position are set to 1.

    Returns
    -------
    numpy.ndarray of int
        One entry per row of ``Y_pred_prob``; exactly ``eviction`` ones
        (checked with an assertion).
    """
    first_column = np.array([row[0] for row in Y_pred_prob])
    partitioned = np.argpartition(first_column, eviction)

    Y_pred = np.zeros(len(Y_pred_prob), dtype=int)
    Y_pred[partitioned[:eviction]] = 1
    assert(Counter(Y_pred)[1] == eviction)
    return Y_pred


def lfuPredict(C,LFUDict,Y_OPT):
    """Label the blocks LFU would evict and score the labels against ``Y_OPT``.

    Parameters
    ----------
    C : sequence
        Cached blocks; the returned labels follow this order.
    LFUDict : mapping
        Access count for every block in ``C``.
    Y_OPT : sequence of int
        Reference 0/1 labels, one per entry of ``C``.

    Returns
    -------
    list of int
        1 for each of the ``eviction`` blocks of ``C`` with the lowest access
        count, otherwise 0. Ties are broken by position in ``C``.

    Side effects
    ------------
    Adds the number of matching / non-matching labels to the globals
    ``lfuCorrect`` / ``lfuIncorrect``. Reads the global ``eviction``.
    """
    global lfuCorrect, lfuIncorrect

    # LFU evicts the LEAST frequently used blocks, so rank ascending by count.
    # (Counter.most_common would select the most frequently used ones.)
    by_frequency = sorted(C, key=lambda block: LFUDict[block])
    evict_candidates = set(by_frequency[:eviction])

    Y_current = [1 if block in evict_candidates else 0 for block in C]

    for i, predicted in enumerate(Y_current):
        # Compare by value; `is` fails for NumPy integer labels.
        if predicted == Y_OPT[i]:
            lfuCorrect += 1
        else:
            lfuIncorrect += 1
    return Y_current

# Returns 0/1 labels marking the "eviction" blocks of the given cache
# whose next accesses are furthest away.

def getY(C,D):
    """Label the ``eviction`` cached blocks whose next access is furthest away.

    Parameters
    ----------
    C : sequence
        Cached blocks; the returned labels follow this order.
    D : mapping
        Maps a next-access position (key) to the cached block (value) that
        will be accessed there. Must have one entry per block in ``C``.

    Returns
    -------
    list of int
        1 for each block of ``C`` that owns one of the ``eviction`` largest
        keys of ``D``, otherwise 0. ``eviction`` is a module-level global.

    Raises
    ------
    AssertionError
        If ``C`` and ``D`` differ in length, if fewer than ``eviction``
        entries exist, or if the selected blocks are not all in ``C``.
    """
    assert(len(C) == len(D))

    # Rank by the KEYS of D (positions). Counter(D).most_common would rank by
    # the values instead, i.e. by block number rather than by distance.
    furthest_positions = sorted(D, reverse=True)[:eviction]
    assert(len(furthest_positions) == eviction)

    # A set gives O(1) membership tests while labelling.
    evict_blocks = {D[pos] for pos in furthest_positions}
    Y_current = [1 if block in evict_blocks else 0 for block in C]

    assert(Y_current.count(1) == eviction)
    assert(evict_blocks.issubset(set(C)))
    return Y_current

def getLFURow(LFUDict, C):
    """Return the access counts of the blocks in ``C``, scaled to unit L2 norm.

    Each block's count is looked up in ``LFUDict`` and the resulting vector is
    divided by its Euclidean norm (``np.linalg.norm``).
    """
    counts = np.array([LFUDict[block] for block in C])
    return counts / np.linalg.norm(counts)
    
def getLRURow(LRUQ, C):
    """Return each cached block's index in ``LRUQ``, scaled to unit L2 norm.

    Blocks of ``C`` that do not appear in ``LRUQ`` get position 0. The vector
    of positions is divided by its Euclidean norm (``np.linalg.norm``).
    """
    position_of = {}
    for position, queued_block in enumerate(LRUQ):
        position_of[queued_block] = position

    positions = [position_of.get(block, 0) for block in C]
    return positions / np.linalg.norm(positions)

def normalize(feature, blocks):
    """Look up ``feature[b]`` for every ``b`` in ``blocks`` and scale to unit L2 norm.

    NOTE: this definition rebinds the module-level name ``normalize``, which
    the notebook's import cell also binds to ``sklearn.preprocessing.normalize``.
    """
    values = [feature[block] for block in blocks]
    return values / np.linalg.norm(values)

def getX(LRUQ, LFUDict, C):
    """Assemble the feature matrix for the cached blocks in ``C``.

    Returns an array with one row per block of ``C`` and three columns:
    the normalised frequency row from ``getLFURow``, the normalised recency
    row from ``getLRURow``, and the block numbers divided by their L2 norm.
    """
    # Timestamp / PID feature columns are currently disabled.
    frequency_col = getLFURow(LFUDict, C)
    recency_col = getLRURow(LRUQ, C)
    block_col = C / np.linalg.norm(C)

    columns = (frequency_col, recency_col, block_col)
    return np.column_stack(columns)
    
    
def populateData(LFUDict, LRUQ, C, D):
    """Append one sample (features and labels) for the current cache to ``X`` / ``Y``.

    Labels come from ``getY(C, D)`` and features from ``getX(LRUQ, LFUDict, C)``.
    The labels are appended to the global ``Y`` with ``np.append`` and the
    feature rows are concatenated onto the global ``X``.

    Returns
    -------
    list of int
        The labels computed for this cache state.
    """
    global X,Y
    cache_blocks = list(C)
    labels = getY(cache_blocks, D)
    features = getX(LRUQ, LFUDict, cache_blocks)

    Y = np.append(Y, labels)
    X = np.concatenate((X, features))
    # Sanity check: exactly `eviction` blocks are labelled for eviction.
    assert(labels.count(1) == eviction)
    return labels

## LFU

In [33]:
def LFU(blocktrace, frame):
    """Simulate a least-frequently-used cache and return its hit rate.

    Parameters
    ----------
    blocktrace : iterable
        Sequence of accessed blocks.
    frame : int
        Number of cache slots.

    Returns
    -------
    float
        ``hits / (hits + misses)`` over the whole trace.

    Notes
    -----
    A block that re-enters the cache after an eviction starts with its
    lifetime access count, not with 1. Ties for the lowest count go to the
    block that entered the count table earliest.
    """
    lifetime_counts = defaultdict(int)  # accesses seen so far, cached or not
    resident_counts = {}                # cached block -> count used for eviction

    hits = misses = 0

    for block in tqdm(blocktrace, leave=False):
        lifetime_counts[block] += 1

        if block in resident_counts:
            resident_counts[block] += 1
            hits += 1
            continue

        if len(resident_counts) < frame:
            # Free slot available: admit with a count of one.
            resident_counts[block] = 1
            misses += 1
            continue

        # Cache full: drop the resident block with the smallest count.
        victim = min(resident_counts.items(), key=lambda entry: entry[1])[0]
        del resident_counts[victim]
        resident_counts[block] = lifetime_counts[block]
        misses += 1

    return hits / (hits + misses)

## LRU

In [34]:
def LRU(blocktrace, frame):
    """Simulate a least-recently-used cache and return its hit rate.

    Parameters
    ----------
    blocktrace : iterable
        Sequence of accessed blocks.
    frame : int
        Number of cache slots; expected to be at least 1 (evicting from an
        empty cache raises).

    Returns
    -------
    float
        ``hits / (hits + misses)`` over the whole trace.
    """
    # Local import so this cell does not depend on re-running the import cell.
    from collections import OrderedDict

    # Keys are the cached blocks, ordered from least to most recently used.
    # move_to_end / popitem are O(1), unlike deque.remove, which scans the queue.
    recency = OrderedDict()
    hit, miss = 0, 0

    for block in tqdm(blocktrace, leave=False):
        if block in recency:
            recency.move_to_end(block)
            hit += 1
            continue

        if len(recency) >= frame:
            # Cache full: drop the least recently used block.
            recency.popitem(last=False)
        recency[block] = None
        miss += 1

    hitrate = hit / (hit + miss)
    return hitrate

## Belady Opt 

In [43]:
def belady_opt(blocktrace, frame):
    """Simulate Belady's optimal replacement and collect training samples.

    Parameters
    ----------
    blocktrace : sequence
        Sequence of accessed blocks; it is iterated twice (index build, then
        simulation).
    frame : int
        Number of cache slots.

    Returns
    -------
    float
        ``hits / (hits + misses)`` over the whole trace.

    Side effects
    ------------
    * Decrements the global ``maxpos`` once for every access after which the
      block is never used again.
    * On a miss with a full cache, when
      ``seq_number % cache_size + 1 == cache_size`` (global ``cache_size``),
      calls ``populateData`` and then ``lruPredict`` / ``lfuPredict`` with the
      labels it returned.
    """
    global maxpos

    OPT = defaultdict(deque)      # block -> positions at which it is accessed
    D = defaultdict(int)          # next-access position -> cached block
    LFUDict = defaultdict(int)    # block -> accesses seen so far
    LRUQ = []                     # cached blocks, least recently used first

    for i, block in enumerate(tqdm(blocktrace, desc="OPT: building index")):
        OPT[block].append(i)

    hit, miss = 0, 0
    C = []                        # cache contents; slot order is preserved

    for seq_number, block in enumerate(tqdm(blocktrace, desc="OPT")):
        LFUDict[block] += 1

        # Drop the current position from the block's pending accesses. Only
        # a block's first access matches here: later positions have already
        # been consumed when they were recorded in D.
        upcoming = OPT[block]
        if upcoming and upcoming[0] == seq_number:
            upcoming.popleft()

        if block in C:
            hit += 1
            LRUQ.remove(block)
            assert(seq_number in D)
            del D[seq_number]
        else:
            miss += 1
            if len(C) == frame:
                assert(len(D) == frame)
                # The largest key belongs to the block used furthest away.
                evictpos = max(D)

                if (seq_number % cache_size + 1 == cache_size):
                    Y_OPT = populateData(LFUDict, LRUQ, C, D)
                    lruPredict(C, LRUQ, Y_OPT)
                    lfuPredict(C, LFUDict, Y_OPT)

                victim = D[evictpos]
                C[C.index(victim)] = block
                LRUQ.remove(victim)
                del D[evictpos]
            else:
                C.append(block)

        # Record when the block will be needed next. Blocks that are never
        # accessed again get a unique, very large key taken from maxpos.
        if upcoming:
            D[upcoming.popleft()] = block
        else:
            D[maxpos] = block
            maxpos -= 1
        LRUQ.append(block)

    hitrate = hit / (hit + miss)
    return hitrate

### ML Functions

In [36]:
def LR(X, Y):
    """Fit a default-configured ``LogisticRegression`` on ``(X, Y)`` and return it."""
    classifier = LogisticRegression()
    classifier.fit(X, Y)
    return classifier

In [37]:
def KNN(X, Y):
    """Fit a ``KNeighborsClassifier`` with three neighbours on ``(X, Y)`` and return it."""
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(X, Y)
    return classifier

In [38]:
def NN(X, Y):
    """Fit a default-configured ``MLPClassifier`` on ``(X, Y)`` and return it."""
    classifier = MLPClassifier()
    classifier.fit(X, Y)
    return classifier

In [39]:
def getAccuracy(model, X, Y):
    """Return whatever ``model.score(X, Y)`` reports for the given data."""
    accuracy = model.score(X, Y)
    return accuracy

## ML Policy

In [40]:
def hitRate(blocktrace, frame, model):
    """Simulate a cache whose evictions are chosen by a trained classifier.

    Parameters
    ----------
    blocktrace : iterable
        Sequence of accessed blocks.
    frame : int
        Number of cache slots.
    model : object
        Must provide ``predict_proba(X)``; it is called with the matrix
        returned by ``getX`` whenever the list of eviction candidates is empty.

    Returns
    -------
    float
        ``hits / (hits + misses)`` over the whole trace (also printed).

    Notes
    -----
    Eviction candidates are obtained in batches from ``Y_getBlockSeq`` and
    consumed one per miss; the model is only consulted again once the batch
    is exhausted.
    """
    LFUDict = defaultdict(int)   # block -> accesses seen so far
    LRUQ = []                    # cached blocks, least recently used first

    hit, miss = 0, 0

    C = []                       # cache contents; slot order is preserved
    evictCacheIndex = np.array([])   # pending slots to evict, in priority order

    # The progress-bar label "OPT" is kept exactly as in the original cell.
    for block in tqdm(blocktrace, desc="OPT"):
        LFUDict[block] += 1

        if block in C:
            hit += 1
            LRUQ.remove(block)
            LRUQ.append(block)
            continue

        miss += 1
        if len(C) == frame:
            if len(evictCacheIndex) == 0:
                # No candidates left: ask the model for a fresh batch.
                X_test = getX(LRUQ, LFUDict, C)
                Y_pred_prob = model.predict_proba(X_test)
                evictCacheIndex = Y_getBlockSeq(Y_pred_prob)

            # Replace the highest-priority candidate in place. Handling the
            # full-cache case here removes the need for an `is -1` sentinel
            # check, which relied on integer identity.
            evictPos = evictCacheIndex[0]
            LRUQ.remove(C[evictPos])
            C[evictPos] = block
            evictCacheIndex = np.delete(evictCacheIndex, 0)
        else:
            C.append(block)
        LRUQ.append(block)

    hitrate = hit / (hit + miss)
    print(hitrate)
    return hitrate

In [20]:
def script1():
    """Run the LRU and LFU baselines for several cache sizes and log the hit rates.

    Reads the module-level ``train`` and ``test`` traces. For each cache size
    in ``[100, 10, 1000]`` it prints the size, runs ``LRU`` and ``LFU`` on both
    traces, and appends the four hit rates to ``Shehbaz.txt``.
    """
    cache_array = [100, 10, 1000]

    # The context manager closes the report file even if a simulation raises.
    with open('Shehbaz.txt', 'a+') as f:
        f.write('\nRun Default Algorithms')
        f.write('\n========================')
        for i in cache_array:
            print('CacheSize = {}'.format(i))
            lru_train = LRU(train, i)
            lfu_train = LFU(train, i)
            lru_test = LRU(test, i)
            lfu_test = LFU(test, i)

            f.write('\nCache Size = {}'.format(i))
            f.write('\nLRU Hitrate on TrainingData = {}'.format(lru_train))
            f.write('\nLFU Hitrate on TrainingData = {}'.format(lfu_train))
            f.write('\nLRU Hitrate on TestData = {}'.format(lru_test))
            f.write('\nLFU Hitrate on TestData = {}'.format(lfu_test))

        f.write('\n========================')

In [46]:
def _runOptForSize(trace, size):
    """Reset shared state, run ``belady_opt`` on ``trace`` with ``size`` slots.

    Sets the globals ``cache_size`` and ``eviction`` (20% of ``size``) after
    ``resetGlobal()``, and returns ``(hitrate, X, Y)`` where ``X`` and ``Y``
    are the module-level sample arrays as they stand after the run.
    """
    global cache_size, eviction
    resetGlobal()
    cache_size = size
    eviction = int(0.2 * cache_size)
    hitrate = belady_opt(trace, cache_size)
    return hitrate, X, Y


def _writeOptReport(f, train_hitrate, test_hitrate):
    """Append the current cache size, eviction count and both OPT hit rates to ``f``."""
    f.write('\nCache Size = {}'.format(cache_size))
    f.write('\nEviction Approx  {}'.format(eviction))
    f.write('\ntrain OPT hitrate = {}'.format(train_hitrate))
    f.write('\ntest OPT hitrate = {}'.format(test_hitrate))


def script2():
    """Run Belady OPT on the train and test traces for cache sizes 10, 100 and 1000.

    For every size the hit rates and the collected samples are stored in
    module-level globals (``trainOPT<size>``, ``testOPT<size>``,
    ``X_train<size>``, ``Y_train<size>``, ``X_test<size>``, ``Y_test<size>``)
    and a short report is appended to ``Shehbaz.txt``. Reads the module-level
    ``train`` and ``test`` traces.
    """
    global trainOPT10, trainOPT100, trainOPT1000, testOPT1000, testOPT100, testOPT10
    global X_train10, Y_train10, X_test10, Y_test10, X_train100, Y_train100, X_test100, Y_test100
    global X_train1000, Y_train1000, X_test1000, Y_test1000

    # The context manager closes the report file even if a run raises.
    with open('Shehbaz.txt', 'a+') as f:
        f.write('\n\n========================')
        f.write('\n Different Cache Size')
        f.write('\n\n========================')

        print('10')
        trainOPT10, X_train10, Y_train10 = _runOptForSize(train, 10)
        testOPT10, X_test10, Y_test10 = _runOptForSize(test, 10)
        _writeOptReport(f, trainOPT10, testOPT10)

        print('100')
        trainOPT100, X_train100, Y_train100 = _runOptForSize(train, 100)
        testOPT100, X_test100, Y_test100 = _runOptForSize(test, 100)
        _writeOptReport(f, trainOPT100, testOPT100)

        print('1000')
        trainOPT1000, X_train1000, Y_train1000 = _runOptForSize(train, 1000)
        testOPT1000, X_test1000, Y_test1000 = _runOptForSize(test, 1000)
        _writeOptReport(f, trainOPT1000, testOPT1000)

In [21]:
# Run the LRU/LFU baselines for every cache size; results are appended to Shehbaz.txt.
script1()

CacheSize = 100


HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))

CacheSize = 10


HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))

CacheSize = 1000


HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2392685), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))



HBox(children=(IntProgress(value=0, max=2373951), HTML(value='')))



In [None]:
# Run Belady OPT for cache sizes 10, 100 and 1000; results are appended to Shehbaz.txt.
script2()

10


HBox(children=(IntProgress(value=0, description='OPT: building index', max=2392685), HTML(value='')))




HBox(children=(IntProgress(value=0, description='OPT', max=2392685), HTML(value='')))