In [12]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


#### Dependencies

In [237]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm 
from collections import defaultdict, deque, Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier

#### Blocktrace Data

In [238]:
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"

df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']

blocktrace = df['blockNo'].tolist()

len(blocktrace)

1322890

## Belady's Optimal Algorithm (OPT) - 1

Belady Algorithm given below, returns the hitrate and dataset. Dataset is (Cachesize*3) + 1 column long. Every 3 columns represents the blockNumber, Recency, Frequency. The Last column of the dataset is the INDEX of cache from which the block got evicted when new block is requested. The last column is therefore, a target colunm for Machie Learning, which represent single class at a time.

In [239]:
def belady_opt_1(blocktrace, frame):
    '''
    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    cachesize - int value for capacity of the cache
    
    OUTPUT
    ============
    (1) hitrate (int)
    (2) cache configuration and eviction block at time of miss (np.array)  
    '''
    
    infinite_index = 100 * len(blocktrace) 
    # should be a large integer than block number
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    dataset = np.array([]).reshape(0,3*frame+1)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                  
                # find the farthest i.e. max_index in upcoming_index
                max_index = max(upcoming_index)

                if (i % 1000 +1 == 1000):
                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                    stack = np.append(stack, Cache.index(upcoming_index[max_index]))
                    dataset = np.vstack((dataset, stack))
                # remove the block with max_index from cache
                Cache[Cache.index(upcoming_index[max_index])] = block

                # remove the block with max_index from recency dict
                recency.remove(upcoming_index[max_index])

                # remove max_index element from upcoming_index
                del upcoming_index[max_index]
                    
            
            else:
                 
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)
                
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
 
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [248]:
hitrate, dataset= belady_opt_1(blocktrace, 100)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

In [249]:
hitrate

0.0562216057268556

In [250]:
dataset

array([[1.55680380e-01, 0.00000000e+00, 5.19875245e-02, ...,
        8.72572445e-02, 1.03975049e-01, 5.90000000e+01],
       [2.88891688e-01, 0.00000000e+00, 1.00000000e-01, ...,
        6.98057956e-02, 1.00000000e-01, 8.30000000e+01],
       [1.86146133e-01, 0.00000000e+00, 9.57826285e-02, ...,
        6.45703609e-02, 9.57826285e-02, 1.90000000e+01],
       ...,
       [9.99998340e-02, 5.93349262e-02, 1.00000000e-01, ...,
        2.44320284e-02, 1.00000000e-01, 8.80000000e+01],
       [9.99998341e-02, 5.93349262e-02, 1.00000000e-01, ...,
        2.44320284e-02, 1.00000000e-01, 8.80000000e+01],
       [9.99312470e-02, 5.93349262e-02, 1.36363636e-01, ...,
        2.44320284e-02, 1.36363636e-01, 8.80000000e+01]])

### Train-Test Split

In [251]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:-1], dataset[:,-1].astype(int), test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Logistic Regression

In [252]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

print(logreg.score(X_test, Y_test))

print(confusion_matrix(Y_test,logreg.predict(X_test)))

0.42480211081794195
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Neural Network

In [75]:
NN = MLPClassifier()
NN.fit(X_train, Y_train)

print(NN.score(X_test, Y_test))

print(confusion_matrix(Y_test,NN.predict(X_test)))

0.46774193548387094
[[3 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [76]:
NN

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

### Calculate Hitrate

In [246]:
def ML_policy_1(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                blockNo = np.array([i for i in Cache])
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(i) for i in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[i] for i in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                X_current = model.predict(stack)
                
                Cache[X_current[0]] = block
                
                
            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [254]:
ML_policy_1(blocktrace,100, logreg)

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

KeyboardInterrupt: 

## Belady's Optimal Algorithm (OPT) - 2

Belady Algorithm given below, returns the hitrate and dataset. Dataset is [(Cachesize*3) + CacheSize] column long. Every 3 columns represents the blockNumber, Recency, Frequency. The Last column of the dataset is the a numpy 1d array of length same as Cache length. It repreent a binary array where the x% of element get 1 that are likely to be evicted and remaining get 0 that are not being evicted.

In [15]:
def belady_opt_2(blocktrace, frame, perc):
    '''
    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    cachesize - int value for capacity of the cache
    perc - percentage of blocks that get 1(i.e. likely to be evicted) in numpy array of label
    
    OUTPUT
    ============
    (1) hitrate (int)
    (2) cache configuration and eviction block at time of miss (np.array)  
    '''
    
    infinite_index = 100 * len(blocktrace) 
    # should be a large integer than block number
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    dataset = np.array([]).reshape(0,(3*frame)+frame)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                  
                # find the farthest i.e. max_index in upcoming_index
                max_index = max(upcoming_index)
                
                if (i % 1000 +1 == 1000):
                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                    labelData = np.zeros(int(frame))
                    temp_upcomingIndex = Counter({j:i for i,j in upcoming_index.items()})
                    mostCommonElements = [i[0] for i in temp_upcomingIndex.most_common(int(frame*perc))]
                    mostCommonElementsIndex = [Cache.index(j) for j in mostCommonElements]
                    labelData[[mostCommonElementsIndex]] = 1
                    stack = np.append(stack, labelData)
                    dataset = np.vstack((dataset, stack))
                # remove the block with max_index from cache
                Cache[Cache.index(upcoming_index[max_index])] = block

                # remove the block with max_index from recency dict
                recency.remove(upcoming_index[max_index])

                # remove max_index element from upcoming_index
                del upcoming_index[max_index]
                    
            
            else:
                 
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)
                
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
 
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [16]:
hitrate, dataset = belady_opt_2(blocktrace, 1000, 0.1)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



In [17]:
dataset

array([[0.05686791, 0.        , 0.02583712, ..., 0.        , 0.        ,
        0.        ],
       [0.05566091, 0.        , 0.03143473, ..., 0.        , 0.        ,
        0.        ],
       [0.05221939, 0.05048311, 0.09352195, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.05764424, 0.01227819, 0.03161049, ..., 1.        , 0.        ,
        0.        ],
       [0.03162163, 0.01227819, 0.03640712, ..., 1.        , 0.        ,
        0.        ],
       [0.03161963, 0.01227819, 0.02487785, ..., 1.        , 0.        ,
        0.        ]])

### Train-Test Split

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:1000*3], dataset[:,1000*3:].astype(int), test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Neural Network

In [22]:
NN_ = MLPClassifier(alpha=0.001,max_iter=300,hidden_layer_sizes=(50,))
NN_.fit(X_train, Y_train)

print(NN_.score(X_test, Y_test))

#print(confusion_matrix(Y_test,NN.predict(X_test)))



0.06182795698924731


In [197]:
NN_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [206]:
NN_.predict_proba([X_test[1]])

array([[9.89261252e-01, 2.47479750e-03, 2.65404309e-03, 2.39137631e-03,
        5.36183456e-03, 6.03085639e-03, 5.62941147e-03, 6.10392872e-04,
        4.25873524e-04, 5.25568859e-04, 1.62251429e-03, 3.10489225e-06,
        2.11008625e-06, 2.68305251e-06, 2.87497405e-06, 1.67032355e-05,
        1.08016777e-05, 8.34543988e-01, 2.30255664e-03, 5.83022269e-03,
        6.30008590e-03, 1.07011826e-02, 4.53601711e-07, 1.11661557e-04,
        3.23861601e-05, 2.22375838e-04, 3.77641940e-04, 1.69900684e-06,
        9.50953501e-04, 4.61984978e-03, 2.25004177e-02, 5.49036329e-03,
        9.85794305e-01, 6.45597137e-01, 8.94743344e-01, 2.11632240e-03,
        1.64397152e-03, 5.83642070e-03, 1.77193783e-05, 1.22831707e-05,
        1.16757219e-05, 8.97231259e-06, 7.60048219e-04, 1.29327044e-05,
        1.54938503e-05, 9.94018062e-01, 5.75445003e-03, 9.03297947e-03,
        5.33755343e-05, 2.21738152e-05, 5.79952682e-04, 9.97289288e-01,
        9.94576303e-01, 9.96311545e-01, 5.57876705e-04, 3.535704

In [207]:
NN_.predict([X_test[1]])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Calculate Hitrate

In [192]:
def ML_policy_2(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    bufferCache = defaultdict(int)
    # initialize bufferCache
    
    hit, miss = 0, 0
    
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            if block in bufferCache:
                bufferCache.delete()
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                if len(bufferCache) == 0:

                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)

                    X_current = model.predict_proba(stack)
                    max_block=X_current.argsort()[-100:]
                    bufferCache = np.append(bufferCache, list(i for i in max_block[0]))
                    

                Cache[bufferCache[-1]] = block
                bufferCache.pop()

                
            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [193]:
s = ML_policy_2(blocktrace, 1000, NN_)

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

AttributeError: 'numpy.ndarray' object has no attribute 'pop'

In [237]:
s

283207024

In [9]:
n = np.array([])

In [11]:
len(n)

0

In [234]:
l[-2]

5

## Belady's Optimal Algorithm (OPT) - 3

Belady Algorithm given below, returns the hitrate and dataset. Dataset is [3] column long. A set of 1000 continuous rows represent one Cache configuration. The label column is binary, if the block get evicted then 1 otherwise 0. We want to have x% of eviction candidate rather than only 1 block which actually get evicted. In conclusion, we will have dataset of X rows where every y continuos row rrepresent one cache configuration. Label of those y rows would be 1 and 0.

In [203]:
def belady_opt_3(blocktrace, frame, perc):
    '''
    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    cachesize - int value for capacity of the cache
    perc - percentage of blocks that get 1(i.e. likely to be evicted) in numpy array of label
    
    OUTPUT
    ============
    (1) hitrate (int)
    (2) cache configuration and eviction block at time of miss (np.array)  
    '''
    
    infinite_index = 100 * len(blocktrace) 
    # should be a large integer than block number
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = list()
    # Cache with block
    
    dataset = np.array([]).reshape(0,4)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                  
                assert(len(upcoming_index) == frame)
                # find the farthest i.e. max_index in upcoming_index
                max_index = max(upcoming_index)
                
                if (i % 1000 +1 == 1000):
                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((frequency_, recency_, blockNo))
                    labelData = np.zeros(int(frame))
                    temp_upcomingIndex = Counter({j:i for i,j in upcoming_index.items()})
                    mostCommonElements = [i[0] for i in temp_upcomingIndex.most_common(int(frame*perc))]
                    mostCommonElementsIndex = [Cache.index(j) for j in mostCommonElements]
                    labelData[[mostCommonElementsIndex]] = 1
                    stack = np.column_stack((stack, labelData))
                    dataset = np.vstack((dataset, stack))
                # remove the block with max_index from cache
                Cache[Cache.index(upcoming_index[max_index])] = block

                # remove the block with max_index from recency dict
                recency.remove(upcoming_index[max_index])

                # remove max_index element from upcoming_index
                del upcoming_index[max_index]
                    
            
            else:
                 
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)
                
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
 
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [219]:
hitrate, dataset = belady_opt_3(blocktrace, 1000, 0.1)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

In [220]:
dataset

array([[2.58371195e-02, 0.00000000e+00, 5.68679136e-02, 0.00000000e+00],
       [2.58371195e-02, 5.48133675e-05, 5.91491047e-02, 0.00000000e+00],
       [2.58371195e-02, 1.09626735e-04, 5.91491056e-02, 0.00000000e+00],
       ...,
       [2.48778542e-02, 2.90510848e-03, 3.16189576e-02, 1.00000000e+00],
       [1.24389271e-02, 5.22919526e-02, 3.16629612e-02, 0.00000000e+00],
       [1.24389271e-02, 5.38267269e-02, 3.16630052e-02, 0.00000000e+00]])

### Train-Test Split

In [221]:
dataset = dataset[0:len(dataset)-(len(dataset)%(100 * 10))]
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:-1], dataset[:,-1].astype(int), test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Logistic Regression

In [222]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

print(logreg.score(X_test, Y_test))

print(confusion_matrix(Y_test,logreg.predict(X_test)))

0.9003951612903226
[[334483    197]
 [ 36856    464]]


### Calculate Hitrate

In [225]:
def ML_policy_3(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    #infinite_index = 1000000000000
    
    #block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    #upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = list()
    # Cache with block
    
    bufferCache = np.array([])
    # initialize bufferCache
    
    hit, miss = 0, 0
    
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # if block exist in current cache
        if block in Cache:
            
            #if block in bufferCache:
                #bufferCache = np.delete(bufferCache, np.where(bufferCache == block))
                
            # increment hit
            hit += 1
            
            #if block in bufferCache:
            #    bufferCache.delete()
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                if len(bufferCache) == 0:

                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_))
                    X_current = model.predict_proba(stack)
                    bufferCache = X_current[:,0].argsort()[:10]
                    #bufferCache = np.array([Cache[i] for i in bufferIndex])
#                     return bufferCache, X_current
                evictPos = bufferCache[0]
                evictBlock = Cache[evictPos]
                Cache[evictPos] = block
                recency.remove(evictBlock)
                
                bufferCache = np.delete(bufferCache, 0)

                
            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [226]:
m = ML_policy_3(blocktrace, 1000, logreg)

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

In [229]:
def ML_policy_3_1(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    #infinite_index = 1000000000000
    
    #block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    #upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = list()
    # Cache with block
    
    bufferCache = np.array([])
    # initialize bufferCache
    
    hit, miss = 0, 0
    
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # if block exist in current cache
        if block in Cache:
            
            #if block in bufferCache:
                #bufferCache = np.delete(bufferCache, np.where(bufferCache == block))
                
            # increment hit
            hit += 1
            
            #if block in bufferCache:
            #    bufferCache.delete()
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                if len(bufferCache) == 0:
                    
                    recency_ = []
                    frequency_ = []
                    for i in Cache: 
                        recency_.append(recency.index(i))
                        frequency_.append(frequency[i])
                    
                    blockNo = Cache / np.linalg.norm(Cache)
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_))
                    X_current = model.predict_proba(stack)
                    bufferCache = X_current[:,0].argsort()[:10]
                    #bufferCache = np.array([Cache[i] for i in bufferIndex])
#                     return bufferCache, X_current
                evictPos = bufferCache[0]
                evictBlock = Cache[evictPos]
                Cache[evictPos] = block
                recency.remove(evictBlock)
                
                bufferCache = np.delete(bufferCache, 0)

                
            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [230]:
l = ML_policy_3_1(blocktrace, 1000, logreg)

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

KeyboardInterrupt: 

In [214]:
m[1]

array([0.86924288, 0.13075712])

In [215]:
m[11]

array([0.87427611, 0.12572389])

In [236]:
filename = "DATA/cheetah.cs.fiu.edu-110108-113008.3.blkparse"

df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
           'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']

blocktrace = df['blockNo'].tolist()

len(blocktrace)

ValueError: Length mismatch: Expected axis has 1 elements, new values have 9 elements