In [1]:
from collections import defaultdict, deque, Counter
import numpy as np
from tqdm import tqdm_notebook as tqdm 
import pandas as pd

#from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"
df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
blocktrace = df['blockNo'].tolist()
len(blocktrace)

1322890

In [3]:
def belady_opt(blocktrace, frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    infinite_index = 10000 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    dataset = np.array([]).reshape(0,3*frame+1)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                
                # evict the farthest block in future request from cache
                if len(upcoming_index) != 0:
                    
                    # find the farthest i.e. max_index in upcoming_index
                    max_index = max(upcoming_index)
                    
                    if (i % 1000 +1 == 1000):
                        blockNo = np.array([i for i in Cache])
                        blockNo = blockNo / np.linalg.norm(blockNo)
                        recency_ = np.array([recency.index(i) for i in Cache])
                        recency_ = recency_ / np.linalg.norm(recency_)
                        frequency_ = np.array([frequency[i] for i in Cache])
                        frequency_ = frequency_ / np.linalg.norm(frequency_)
                        stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                        
                        stack = np.append(stack, Cache.index(upcoming_index[max_index]))
                        dataset = np.vstack((dataset, stack))
                    # remove the block with max_index from cache
                    Cache.remove(upcoming_index[max_index])
                    
                    # remove the block with max_index from recency dict
                    recency.remove(upcoming_index[max_index])
                    
                    # remove max_index element from upcoming_index
                    del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
                
                
            
            # add block into Cache
            Cache.append(block)
            
            # add block into recency
            recency.append(block)
            
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [4]:
hitrate, dataset = belady_opt(blocktrace, 1000)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



In [16]:
hitrate

0.0713407766329778

# LRU for Benchmark

In [5]:
def LRU(blocktrace, frame):
    
    cache = set()
    recency = deque()
    hit, miss = 0, 0
    
    for block in tqdm(blocktrace, leave=False):
        
        if block in cache:
            recency.remove(block)
            recency.append(block)
            hit += 1
            
        elif len(cache) < frame:
            cache.add(block)
            recency.append(block)
            miss += 1
            
        else:
            cache.remove(recency[0])
            recency.popleft()
            cache.add(block)
            recency.append(block)
            miss += 1
    
    hitrate = hit / (hit + miss)
    return hitrate

In [6]:
LRU(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))



0.0401

# Neural Network I

In [7]:
dataset

array([[5.68679136e-02, 0.00000000e+00, 2.58371195e-02, ...,
        5.47585541e-02, 2.58371195e-02, 1.10000000e+01],
       [5.56609085e-02, 0.00000000e+00, 3.14347307e-02, ...,
        5.47585541e-02, 3.14347307e-02, 1.23000000e+02],
       [5.22193872e-02, 5.04831115e-02, 9.35219530e-02, ...,
        5.47585541e-02, 3.11739843e-02, 2.81000000e+02],
       ...,
       [5.76438802e-02, 0.00000000e+00, 3.16104860e-02, ...,
        5.47585541e-02, 3.16104860e-02, 0.00000000e+00],
       [3.16207593e-02, 0.00000000e+00, 3.64071235e-02, ...,
        5.47585541e-02, 2.42714157e-02, 0.00000000e+00],
       [3.16187781e-02, 0.00000000e+00, 2.48778542e-02, ...,
        5.47585541e-02, 1.24389271e-02, 0.00000000e+00]])

In [8]:
dataset.shape

(1240, 3001)

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:-1], dataset[:,-1].astype(int), test_size=0.3, random_state=None, shuffle=True)

#Fitting Logistic Regression Model
NN=MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=42)
NN.fit(X_train, Y_train)
#NN.fit(dataset[:,:-1], dataset[:,-1].astype(int))

Y_pred = NN.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(NN.score(X_test, Y_test)))

print(confusion_matrix(Y_test,Y_pred))

Accuracy of logistic regression classifier on test set: 0.82
[[  1   0   0 ...   0   0   7]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   1]
 ...
 [  0   0   0 ...  11   0   8]
 [  0   0   0 ...   0   5   2]
 [  1   0   0 ...   5   2 446]]


In [21]:
def ourcacheNN(blocktrace,frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            #if i in upcoming_index:
                
                # delete old index
             #   del upcoming_index[i]
        
              #  if len(block_index[block]) is not 0:
                    # add new upcoming index
               #     upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                #    block_index[block].popleft()
#                 else:
#                     # add a large integer as index
#                     upcoming_index[infinite_index] = block
#                     # increament large integer
#                     infinite_index+=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                blockNo = np.array([i for i in Cache])
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(i) for i in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[i] for i in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                X_current = NN.predict(stack)
#                 return X_current
                Cache.remove(Cache[X_current[0]])
                
                # evict the farthest block in future request from cache
#                 if len(upcoming_index) != 0:
                    
#                     # find the farthest i.e. max_index in upcoming_index
#                     max_index = max(upcoming_index)
                    
#                     # remove the block with max_index from cache
#                     Cache.remove(upcoming_index[max_index])
                    
#                     # remove the block with max_index from recency dict
#                     recency.remove(upcoming_index[max_index])
                    
#                     # remove max_index element from upcoming_index
#                     del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
#             if len(block_index[block]) != 0:
                
#                 # add upcoming index of block
#                 upcoming_index[block_index[block][0]] = block
               
#                 # remove the index from block_index 
#                 block_index[block].popleft()
            
#             else:
                
#                 # add a large integer as index
#                 upcoming_index[infinite_index] = block
                
#                 # increament high number
#                 infinite_index += 1
                
            
            # add block into Cache
            Cache.append(block)
            
            # add block into recency
            recency.append(block)
            
            #### regression extra part
#             if (i % sample_interval +1 == sample_interval):
#                 Y_OPT = populateData(frequency, recency, Cache, block_index)
#                 lruPredict(Cache,recency,Y_OPT)
#                 lfuPredict(Cache,frequency,Y_OPT)
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [22]:
ourcacheNN(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, description='sequence', max=20000, style=ProgressStyle(description_width='…



0.0334

# Neural Network II

In [7]:
dataset

array([[5.68679136e-02, 0.00000000e+00, 2.58371195e-02, ...,
        5.47585541e-02, 2.58371195e-02, 1.10000000e+01],
       [5.56609085e-02, 0.00000000e+00, 3.14347307e-02, ...,
        5.47585541e-02, 3.14347307e-02, 1.23000000e+02],
       [5.22193872e-02, 5.04831115e-02, 9.35219530e-02, ...,
        5.47585541e-02, 3.11739843e-02, 2.81000000e+02],
       ...,
       [5.76438802e-02, 0.00000000e+00, 3.16104860e-02, ...,
        5.47585541e-02, 3.16104860e-02, 0.00000000e+00],
       [3.16207593e-02, 0.00000000e+00, 3.64071235e-02, ...,
        5.47585541e-02, 2.42714157e-02, 0.00000000e+00],
       [3.16187781e-02, 0.00000000e+00, 2.48778542e-02, ...,
        5.47585541e-02, 1.24389271e-02, 0.00000000e+00]])

In [8]:
dataset.shape

(1240, 3001)

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:-1], dataset[:,-1].astype(int), test_size=0.3, random_state=None, shuffle=True)

#Fitting Logistic Regression Model
NN=MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=42)
NN.fit(X_train, Y_train)

Y_pred = NN.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(NN.score(X_test, Y_test)))

print(confusion_matrix(Y_test,Y_pred))

Accuracy of logistic regression classifier on test set: 0.84
[[  3   0   0 ...   0   0   4]
 [  0   0   0 ...   0   0   1]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   9   0   0]
 [  0   0   0 ...   0   3   1]
 [  2   0   0 ...   2   2 269]]


In [14]:
def ourcacheNN(blocktrace,frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            #if i in upcoming_index:
                
                # delete old index
             #   del upcoming_index[i]
        
              #  if len(block_index[block]) is not 0:
                    # add new upcoming index
               #     upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                #    block_index[block].popleft()
#                 else:
#                     # add a large integer as index
#                     upcoming_index[infinite_index] = block
#                     # increament large integer
#                     infinite_index+=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                blockNo = np.array([i for i in Cache])
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(i) for i in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[i] for i in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                X_current = NN.predict(stack)
#                 return X_current
                Cache.remove(Cache[X_current[0]])
                
                # evict the farthest block in future request from cache
#                 if len(upcoming_index) != 0:
                    
#                     # find the farthest i.e. max_index in upcoming_index
#                     max_index = max(upcoming_index)
                    
#                     # remove the block with max_index from cache
#                     Cache.remove(upcoming_index[max_index])
                    
#                     # remove the block with max_index from recency dict
#                     recency.remove(upcoming_index[max_index])
                    
#                     # remove max_index element from upcoming_index
#                     del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
#             if len(block_index[block]) != 0:
                
#                 # add upcoming index of block
#                 upcoming_index[block_index[block][0]] = block
               
#                 # remove the index from block_index 
#                 block_index[block].popleft()
            
#             else:
                
#                 # add a large integer as index
#                 upcoming_index[infinite_index] = block
                
#                 # increament high number
#                 infinite_index += 1
                
            
            # add block into Cache
            Cache.append(block)
            
            # add block into recency
            recency.append(block)
            
            #### regression extra part
#             if (i % sample_interval +1 == sample_interval):
#                 Y_OPT = populateData(frequency, recency, Cache, block_index)
#                 lruPredict(Cache,recency,Y_OPT)
#                 lfuPredict(Cache,frequency,Y_OPT)
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [15]:
ourcacheNN(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, description='sequence', max=20000, style=ProgressStyle(description_width='…



0.03845

# Grid Search CV

In [112]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

In [113]:
# Create logistic regression
logistic = linear_model.LogisticRegression()

In [114]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [122]:
C

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [116]:
# Create grid search using 5-fold cross validation
log_tun = GridSearchCV(logistic, hyperparameters, cv=5)
log_tun

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [117]:
# Fit grid search
log_best = log_tun.fit(X_train, Y_train)



In [118]:
def our_cache_tuned(blocktrace,frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            #if i in upcoming_index:
                
                # delete old index
             #   del upcoming_index[i]
        
              #  if len(block_index[block]) is not 0:
                    # add new upcoming index
               #     upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                #    block_index[block].popleft()
#                 else:
#                     # add a large integer as index
#                     upcoming_index[infinite_index] = block
#                     # increament large integer
#                     infinite_index+=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                blockNo = np.array([i for i in Cache])
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(i) for i in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[i] for i in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                X_current = log_best.predict(stack)
#                 return X_current
                Cache.remove(Cache[X_current[0]])
                
                # evict the farthest block in future request from cache
#                 if len(upcoming_index) != 0:
                    
#                     # find the farthest i.e. max_index in upcoming_index
#                     max_index = max(upcoming_index)
                    
#                     # remove the block with max_index from cache
#                     Cache.remove(upcoming_index[max_index])
                    
#                     # remove the block with max_index from recency dict
#                     recency.remove(upcoming_index[max_index])
                    
#                     # remove max_index element from upcoming_index
#                     del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
#             if len(block_index[block]) != 0:
                
#                 # add upcoming index of block
#                 upcoming_index[block_index[block][0]] = block
               
#                 # remove the index from block_index 
#                 block_index[block].popleft()
            
#             else:
                
#                 # add a large integer as index
#                 upcoming_index[infinite_index] = block
                
#                 # increament high number
#                 infinite_index += 1
                
            
            # add block into Cache
            Cache.append(block)
            
            # add block into recency
            recency.append(block)
            
            #### regression extra part
#             if (i % sample_interval +1 == sample_interval):
#                 Y_OPT = populateData(frequency, recency, Cache, block_index)
#                 lruPredict(Cache,recency,Y_OPT)
#                 lfuPredict(Cache,frequency,Y_OPT)
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [119]:
our_cache_tuned(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, description='sequence', max=20000, style=ProgressStyle(description_width='…

0.03525

In [121]:
# View best hyperparameters
print('Best Penalty:', log_best.best_estimator_.get_params()['penalty'])
print('Best C:', log_best.best_estimator_.get_params()['C'])

Best Penalty: l1
Best C: 166.81005372000593
