In [53]:
from collections import defaultdict, deque, Counter
import numpy as np
from tqdm import tqdm_notebook as tqdm 
import pandas as pd

#from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [44]:
infinite_index = 1000000000
sample_interval = 1000
num_params = 3


In [31]:
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"
df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
blocktrace = df['blockNo'].tolist()
len(blocktrace)

1322890

In [51]:
def belady_opt(blocktrace, frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate
    '''
    global sample_interval # interval of choice for sampling
    global infinite_index # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = set()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index+=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                # evict the farthest block in future request from cache
                if len(upcoming_index) != 0:
                    
                    # find the farthest i.e. max_index in upcoming_index
                    max_index = max(upcoming_index)
                    
                    # remove the block with max_index from cache
                    Cache.remove(upcoming_index[max_index])
                    
                    # remove the block with max_index from recency dict
                    recency.remove(upcoming_index[max_index])
                    
                    # remove max_index element from upcoming_index
                    del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index += 1
                
            
            # add block into Cache
            Cache.add(block)
            
            # add block into recency
            recency.append(block)
            
            #### regression extra part
            if ( i % sample_interval == 0):
                Y_OPT = populateData(frequency, recency, Cache, block_index)
                lruPredict(Cache,recency,Y_OPT)
                lfuPredict(Cache,frequency,Y_OPT)
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [54]:
belady_opt(blocktrace, 500)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



NameError: name 'eviction' is not defined

In [45]:
lruCorrect = 0
lruIncorrect = 0

def lruPredict(C,LRUQ,Y_OPT):
    global lruCorrect, lruIncorrect
    Y_current = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(LRUQ) - i
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lruCorrect+=1
        else:
            lruIncorrect+=1
    return Y_current

'''
    given C, use LFUDict to find eviction number of blocks from the Cache
    compare it with Y_OPT and store number of places the two differ
    The number of correct and incorrect predictions with respect to OPT.
'''

lfuCorrect = 0
lfuIncorrect = 0

def lfuPredict(C,LFUDict,Y_OPT):
    global lfuCorrect, lfuIncorrect
    Y_current = []
    KV = defaultdict()
    for e in C:
        KV[e] = LFUDict[e]
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lfuCorrect+=1
        else:
            lfuIncorrect+=1
    return Y_current

# return "eviction" blocks that are being accessed furthest
# from the cache that was sent to us.

def getY(C,OPT):
    global maxpos
    KV = defaultdict(int)
    Y_current = []
    for e in C:
        if len(OPT[e]) is not 0:
            KV[e] = OPT[e][0]
        else:
            KV[e] = maxpos
            maxpos+=1
    # extract "eviction" blocks from KV_sorted hashmap
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    return Y_current

def getLFURow(LFUDict, C):
    x_lfurow = []
    for e in C:
        x_lfurow.append(LFUDict[e])
    norm = x_lfurow / np.linalg.norm(x_lfurow)
    return norm
    
def getLRURow(LRUQ, C):
    x_lrurow = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(C) - i
    for e in C:
        x_lrurow.append(KV[e])
    norm = x_lrurow / np.linalg.norm(x_lrurow)
    return norm

def getX(LRUQ, LFUDict, C):
    X_lfurow = getLFURow(LFUDict, C)
    X_lrurow = getLRURow(LRUQ, C)
    X_bno    = C / np.linalg.norm(C)
    return (np.column_stack((X_lfurow, X_lrurow, X_bno)))

# appends OPT sample to X, Y arrays

X = np.array([], dtype=np.int64).reshape(0,num_params)
Y = np.array([], dtype=np.int64).reshape(0,1)

# C - cache, LFUDict - dictionary containing block-> access frequency
# LRUQ - order of element access in Cache.

def populateData(LFUDict, LRUQ, C, OPT):
    global X,Y
    C = list(C)
    Y_current = getY(C, OPT)
    X_current = getX(LRUQ, LFUDict, C)

    Y = np.append(Y, Y_current)
    X = np.concatenate((X,X_current))
    return Y_current


In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y ,test_size=0.3, random_state=0)

#Fitting Logistic Regression Model
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

confusion_matrix = confusion_matrix(Y_test,Y_pred)
print (confusion_matrix)

print ("LFU Correct / Incorrect Ratio")
total = lfuCorrect + lfuIncorrect
print ( (total - (lfuIncorrect/2) ) / total )

print ("LRU Correct / Incorrect Ratio")
total = lruCorrect + lruIncorrect
print ( (total - (lruIncorrect/2) ) / total )

ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required.