In [63]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


#### Dependencies

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm 
from collections import defaultdict, deque, Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier

#### Blocktrace Data

In [2]:
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"

df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']

blocktrace = df['blockNo'].tolist()

len(blocktrace)

1322890

## Belady's Optimal Algorithm (OPT) - 1

Belady Algorithm given below, returns the hitrate and dataset. Dataset is (Cachesize*3) + 1 column long. Every 3 columns represents the blockNumber, Recency, Frequency. The Last column of the dataset is the INDEX of cache from which the block got evicted when new block is requested. The last column is therefore, a target colunm for Machie Learning, which represent single class at a time.

In [79]:
def belady_opt_1(blocktrace, frame):
    '''
    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    cachesize - int value for capacity of the cache
    
    OUTPUT
    ============
    (1) hitrate (int)
    (2) cache configuration and eviction block at time of miss (np.array)  
    '''
    
    infinite_index = 100 * len(blocktrace) 
    # should be a large integer than block number
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    dataset = np.array([]).reshape(0,3*frame+1)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                  
                # find the farthest i.e. max_index in upcoming_index
                max_index = max(upcoming_index)

                if (i % 1000 +1 == 1000):
                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                    stack = np.append(stack, Cache.index(upcoming_index[max_index]))
                    dataset = np.vstack((dataset, stack))
                # remove the block with max_index from cache
                Cache[Cache.index(upcoming_index[max_index])] = block

                # remove the block with max_index from recency dict
                recency.remove(upcoming_index[max_index])

                # remove max_index element from upcoming_index
                del upcoming_index[max_index]
                    
            
            else:
                 
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)
                
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
 
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [42]:
hitrate, dataset= belady_opt_1(blocktrace, 1000)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



In [43]:
hitrate

0.0713407766329778

In [35]:
dataset

array([[5.68679136e-02, 0.00000000e+00, 2.58371195e-02, ...,
        3.19013799e-02, 2.58371195e-02, 4.28000000e+02],
       [5.56609085e-02, 0.00000000e+00, 3.14347307e-02, ...,
        2.66941100e-02, 3.14347307e-02, 5.48000000e+02],
       [5.22193872e-02, 5.04831115e-02, 9.35219530e-02, ...,
        1.88557984e-02, 3.11739843e-02, 8.90000000e+02],
       ...,
       [5.76442406e-02, 1.22781943e-02, 3.16104860e-02, ...,
        5.38267269e-02, 3.16104860e-02, 7.89000000e+02],
       [3.16216295e-02, 1.22781943e-02, 3.64071235e-02, ...,
        5.38267269e-02, 2.42714157e-02, 7.89000000e+02],
       [3.16196312e-02, 1.22781943e-02, 2.48778542e-02, ...,
        5.38267269e-02, 1.24389271e-02, 7.89000000e+02]])

In [58]:
logreg.predict([dataset[1][:-1]])

array([789])

### Train-Test Split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:-1], dataset[:,-1].astype(int), test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Logistic Regression

In [70]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

print(logreg.score(X_test, Y_test))

print(confusion_matrix(Y_test,logreg.predict(X_test)))

0.29838709677419356
[[3 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Neural Network

In [75]:
NN = MLPClassifier()
NN.fit(X_train, Y_train)

print(NN.score(X_test, Y_test))

print(confusion_matrix(Y_test,NN.predict(X_test)))

0.46774193548387094
[[3 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [76]:
NN

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

### Calculate Hitrate

In [77]:
def ML_policy_1(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                blockNo = np.array([i for i in Cache])
                blockNo = blockNo / np.linalg.norm(blockNo)
                recency_ = np.array([recency.index(i) for i in Cache])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([frequency[i] for i in Cache])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                X_current = model.predict(stack)
                
                Cache[X_current[0]] = block
                
                
            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

## Belady's Optimal Algorithm (OPT) - 2

Belady Algorithm given below, returns the hitrate and dataset. Dataset is [(Cachesize*3) + CacheSize] column long. Every 3 columns represents the blockNumber, Recency, Frequency. The Last column of the dataset is the a numpy 1d array of length same as Cache length. It repreent a binary array where the x% of element get 1 that are likely to be evicted and remaining get 0 that are not being evicted.

In [5]:
def belady_opt_2(blocktrace, frame, perc):
    '''
    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    cachesize - int value for capacity of the cache
    perc - percentage of blocks that get 1(i.e. likely to be evicted) in numpy array of label
    
    OUTPUT
    ============
    (1) hitrate (int)
    (2) cache configuration and eviction block at time of miss (np.array)  
    '''
    
    infinite_index = 100 * len(blocktrace) 
    # should be a large integer than block number
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    dataset = np.array([]).reshape(0,(3*frame)+frame)
    #columns represents the number of block in cache and 
    #3 is the number of features such as frequency, recency and block number
    #+1 is for label 0-1
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                  
                # find the farthest i.e. max_index in upcoming_index
                max_index = max(upcoming_index)
                
                if (i % 1000 +1 == 1000):
                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                    labelData = np.zeros(int(frame))
                    temp_upcomingIndex = Counter({j:i for i,j in upcoming_index.items()})
                    mostCommonElements = [i[0] for i in temp_upcomingIndex.most_common(int(frame*perc))]
                    mostCommonElementsIndex = [Cache.index(j) for j in mostCommonElements]
                    labelData[[mostCommonElementsIndex]] = 1
                    stack = np.append(stack, labelData)
                    dataset = np.vstack((dataset, stack))
                # remove the block with max_index from cache
                Cache[Cache.index(upcoming_index[max_index])] = block

                # remove the block with max_index from recency dict
                recency.remove(upcoming_index[max_index])

                # remove max_index element from upcoming_index
                del upcoming_index[max_index]
                    
            
            else:
                 
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)
                
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
 
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate, dataset

In [6]:
hitrate, dataset = belady_opt_2(blocktrace, 1000, 0.1)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



In [194]:
dataset

array([[0.05686791, 0.        , 0.02583712, ..., 0.        , 0.        ,
        0.        ],
       [0.05566091, 0.        , 0.03143473, ..., 0.        , 0.        ,
        0.        ],
       [0.05221939, 0.05048311, 0.09352195, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.05764424, 0.01227819, 0.03161049, ..., 1.        , 0.        ,
        0.        ],
       [0.03162163, 0.01227819, 0.03640712, ..., 1.        , 0.        ,
        0.        ],
       [0.03161963, 0.01227819, 0.02487785, ..., 1.        , 0.        ,
        0.        ]])

### Train-Test Split

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset[:,:1000*3], dataset[:,1000*3:].astype(int), test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Neural Network

In [8]:
NN_ = MLPClassifier(alpha=0.001,max_iter=200,hidden_layer_sizes=(100,))
NN_.fit(X_train, Y_train)

print(NN_.score(X_test, Y_test))

#print(confusion_matrix(Y_test,NN.predict(X_test)))



0.08333333333333333


In [9]:
NN_

MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [206]:
NN_.predict_proba([X_test[1]])

array([[9.89261252e-01, 2.47479750e-03, 2.65404309e-03, 2.39137631e-03,
        5.36183456e-03, 6.03085639e-03, 5.62941147e-03, 6.10392872e-04,
        4.25873524e-04, 5.25568859e-04, 1.62251429e-03, 3.10489225e-06,
        2.11008625e-06, 2.68305251e-06, 2.87497405e-06, 1.67032355e-05,
        1.08016777e-05, 8.34543988e-01, 2.30255664e-03, 5.83022269e-03,
        6.30008590e-03, 1.07011826e-02, 4.53601711e-07, 1.11661557e-04,
        3.23861601e-05, 2.22375838e-04, 3.77641940e-04, 1.69900684e-06,
        9.50953501e-04, 4.61984978e-03, 2.25004177e-02, 5.49036329e-03,
        9.85794305e-01, 6.45597137e-01, 8.94743344e-01, 2.11632240e-03,
        1.64397152e-03, 5.83642070e-03, 1.77193783e-05, 1.22831707e-05,
        1.16757219e-05, 8.97231259e-06, 7.60048219e-04, 1.29327044e-05,
        1.54938503e-05, 9.94018062e-01, 5.75445003e-03, 9.03297947e-03,
        5.33755343e-05, 2.21738152e-05, 5.79952682e-04, 9.97289288e-01,
        9.94576303e-01, 9.96311545e-01, 5.57876705e-04, 3.535704

In [207]:
NN_.predict([X_test[1]])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Calculate Hitrate

In [29]:
def ML_policy_2(blocktrace,frame, model):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    model = trained ML model
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    infinite_index = 100 * len(blocktrace) # should be a large integer
    
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    bufferCache = Counter()
    # initialize bufferCache
    
    hit, miss = 0, 0
    
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            if Cache.index(block) in bufferCache:
                bufferCache.remove(Cache.index(block))
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                if len(bufferCache) == 0:

                    blockNo = np.array([i for i in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([recency.index(i) for i in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[i] for i in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)

                    X_current = model.predict_proba(stack)
                    max_block=X_current.argsort()[-100:]
                    for i in max_block[0]:
                        bufferCache[Cache[i]] = i
                
                mostcommon = bufferCache.most_common(1)[0][0]
                Cache.remove(mostcommon)
                del bufferCache[mostcommon]


            else:
                # add block into Cache
                Cache.append(block)

            # add block into recency
            recency.append(block)


    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [30]:
s = ML_policy_2(blocktrace, 1000, NN_)

HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…

In [31]:
s

0.010261624171321879

In [232]:
l = [1,2,3,4,5,67]

In [234]:
l[-2]

5

In [12]:
c = Counter(a=4, b=2, c=0, d=-2)

In [13]:
c

Counter({'a': 4, 'b': 2, 'c': 0, 'd': -2})

In [17]:
c.most_common(1)[0][1]

4

In [24]:
c = Counter()

In [26]:
c.most_common(1)

IndexError: list index out of range

In [52]:
#!/usr/bin/env python
# coding: utf-8

# In[458]:

from tqdm import tqdm as tqdm 
import numpy as np
from collections import deque, defaultdict
import timeit
import pandas as pd
import random
from collections import Counter
from sklearn import preprocessing
from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import sys

# dummy maxmimum position variable. assign the position of blocks that 
# will never get accessed a value greater than this value. this way OPT
# can be fooled to think that the block will be accessed but at a position
# far-far-away in time.

maxpos = 1000000000000

num_params = 3
sampling_freq = 1000 # number of samples skipped
cache_size = 1000    # default cache size
eviction = int(0.1 * cache_size)  # number of blocks evicted
filename = "ikki-110108-112108.1.blkparse"
#filename = "cheetah.1000"

df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', 'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
df.head()

# In[460]:

blocktrace = df['blockNo'].tolist()

timestamp = df['timestamp'].tolist()

le = preprocessing.LabelEncoder()

le.fit(df['pid'].tolist())

pid = le.transform(df['pid'].tolist())


# In[466]:


#LRU(blocktrace, 500)


# In[467]:


def LFU(blocktrace, frame):
    
    cache = set()
    cache_frequency = defaultdict(int)
    frequency = defaultdict(int)
    
    hit, miss = 0, 0
    
    for block in tqdm(blocktrace):
        frequency[block] += 1
        
        if block in cache:
            hit += 1
            cache_frequency[block] += 1
        
        elif len(cache) < frame:
            cache.add(block)
            cache_frequency[block] += 1
            miss += 1

        else:
            e, f = min(cache_frequency.items(), key=lambda a: a[1])
            cache_frequency.pop(e)
            cache.remove(e)
            cache.add(block)
            cache_frequency[block] = frequency[block]
            miss += 1
    
    hitrate = hit / ( hit + miss )
    return hitrate

'''
    given C, use LFUDict to find eviction number of blocks from the Cache
    compare it with Y_OPT and store number of places the two differ
'''
lruCorrect = 0
lruIncorrect = 0

def lruPredict(C,LRUQ,Y_OPT):
    global lruCorrect, lruIncorrect
    Y_current = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(LRUQ) - i
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lruCorrect+=1
        else:
            lruIncorrect+=1
    return Y_current

# returns sequence of blocks in prioirty order

def Y_getBlockSeq(Y_pred_prob):
    x = []
    for i in range(len(Y_pred_prob)):
        x.append(Y_pred_prob[i][0])
    x = np.array(x)
    idx = np.argsort(x)
    idx = idx[:eviction]
    return idx

def Y_getMinPredict(Y_pred_prob):
    x = []
    for i in range(len(Y_pred_prob)):
        x.append(Y_pred_prob[i][0])
    x = np.array(x)
    idx = np.argpartition(x, eviction)
    
    Y_pred = np.zeros(len(Y_pred_prob), dtype=int)
    for i in range(eviction):
        Y_pred[idx[i]] = 1
    assert(Counter(Y_pred)[1] == eviction)
    return Y_pred

'''
    given C, use LFUDict to find eviction number of blocks from the Cache
    compare it with Y_OPT and store number of places the two differ

    The number of correct and incorrect predictions with respect to OPT.
'''

lfuCorrect = 0
lfuIncorrect = 0

def lfuPredict(C,LFUDict,Y_OPT):
    global lfuCorrect, lfuIncorrect
    Y_current = []
    KV = defaultdict()
    for e in C:
        KV[e] = LFUDict[e]
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lfuCorrect+=1
        else:
            lfuIncorrect+=1
    return Y_current

# return "eviction" blocks that are being accessed furthest
# from the cache that was sent to us.

def getY(C,D):
    assert(len(C) == len(D))
    Y_current = []
    KV_sorted = Counter(D)
    evict_dict = dict(KV_sorted.most_common(eviction))
    assert(len(evict_dict) == eviction)
    all_vals = evict_dict.values()
    for e in C:
        if e in evict_dict.values():
            Y_current.append(1)
        else:
            Y_current.append(0)
    #print (Y_current.count(1))
    assert(Y_current.count(1) == eviction)
    assert((set(all_vals)).issubset(set(C)))
    return Y_current

def getLFURow(LFUDict, C):
    x_lfurow = []
    for e in C:
        x_lfurow.append(LFUDict[e])
    norm = x_lfurow / np.linalg.norm(x_lfurow)
    return norm
    
def getLRURow(LRUQ, C):
    x_lrurow = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(C) - i
    for e in C:
        x_lrurow.append(KV[e])
    norm = x_lrurow / np.linalg.norm(x_lrurow)
    return norm

def normalize(feature, blocks):
    x_feature = []
    for i in range(len(blocks)):
        x_feature.append(feature[blocks[i]])
    return x_feature / np.linalg.norm(x_feature)

def getX(LRUQ, LFUDict, C, CacheTS, CachePID):
    X_lfurow = getLFURow(LFUDict, C)
    X_lrurow = getLRURow(LRUQ, C)
    X_bno    = C / np.linalg.norm(C)
#     X_ts     = normalize(CacheTS, C)
#     X_pid    = normalize(CachePID, C)
    return (np.column_stack((X_lfurow, X_lrurow, X_bno)))
    
# appends OPT sample to X, Y arrays

X = np.array([], dtype=np.int64).reshape(0,num_params)
Y = np.array([], dtype=np.int64).reshape(0,1)

# C - cache, LFUDict - dictionary containing block-> access frequency
# LRUQ - order of element access in Cache.

def populateData(LFUDict, LRUQ, C, D, CacheTS, CachePID):
    global X,Y
    C = list(C)
    Y_current = getY(C, D)
    X_current = getX(LRUQ, LFUDict, C, CacheTS, CachePID)

    Y = np.append(Y, Y_current)
    X = np.concatenate((X,X_current))
    assert(Y_current.count(1) == eviction)
    return Y_current

#D - dictionary for faster max() finding among available blocks
#this dictionary contains next_position -> block_number of blocks in Cache
#LFUDict - dictionary containing {block -> access_frequencies}
#LRUQ - deque of all elements in cache based on recency of access

def belady_opt(blocktrace, frame):
    global maxpos
    OPT = defaultdict(deque)
    D = defaultdict(int)
    LFUDict = defaultdict(int)
    LRUQ = []
    CacheTS = defaultdict(int)
    CachePID = defaultdict(int)

    for i, block in enumerate(tqdm(blocktrace, desc="OPT: building index")):
        OPT[block].append(i)

    hit, miss = 0, 0

    C = []
    count=0
    seq_number = 0
    for block in tqdm(blocktrace, desc="OPT"):
#    for block in blocktrace: 
        LFUDict[block] +=1

        if len(OPT[block]) is not 0 and OPT[block][0] == seq_number:
            OPT[block].popleft()
        CacheTS [blocktrace[seq_number]] = timestamp[seq_number]
        CachePID [blocktrace[seq_number]] = pid[seq_number]
        if block in C:
            hit+=1
            LRUQ.remove(block)
            LRUQ.append(block)
            assert( seq_number in D)
            del D[seq_number]
            if len(OPT[block]) is not 0:
                D[OPT[block][0]] = block
                OPT[block].popleft()
            else:
                D[maxpos] = block
                maxpos -= 1
        else:
            miss+=1
            if len(C) == frame:
                assert(len(D) == frame)
                evictpos = max(D)
                C.remove(D[evictpos])
                LRUQ.remove(D[evictpos])
                del CacheTS [D[evictpos]]
                del CachePID [D[evictpos]]
                del D[evictpos]
            if len(OPT[block]) is not 0:
                D[OPT[block][0]] = block
                OPT[block].popleft()
            else:
                D[maxpos] = block
                maxpos -= 1
            C.append(block)
            LRUQ.append(block)
            if (seq_number % sampling_freq +1 == sampling_freq and len(C) == frame):
                Y_OPT = populateData(LFUDict, LRUQ, C, D, CacheTS, CachePID)
                lruPredict(C,LRUQ,Y_OPT)
                lfuPredict(C,LFUDict,Y_OPT)
        seq_number += 1

    hitrate = hit / (hit + miss)
    print(hitrate)
    return hitrate

belady_opt(blocktrace, cache_size)

print ("size of X " + str(len(X)))

# round off so that train, test splits are cache size aligned
X = X[0:len(X)-(len(X)%(cache_size * 10))]
Y = Y[0:len(Y)-(len(Y)%(cache_size * 10))]

print ("Test Y")

for i in range(int(len(X) / 1000)):
   y = Y[i*1000:(i+1) *1000]
   assert(Counter(y)[1] == eviction)

print ("size of X " + str(len(X)))
print ("size of Y " + str(len(Y)))

#Train-Test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y ,test_size=0.3, random_state=0, shuffle=False)

print ("Test Y_test")

for i in range(int(len(X_test) / cache_size)):
   y = Y_test[i*cache_size:(i+1) *cache_size]
   assert(Counter(y)[1] == eviction)

print ("size of X_train " + str(len(X_train)))
print ("size of X_test " + str(len(X_test)))

#Fitting Logistic Regression Model
#logreg = LogisticRegression(solver='lbfgs')
#‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
logreg = LogisticRegression(solver='saga')
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)
print("======================================")
print(logreg.predict_proba([X_test[0]]))
print(Y_test[0])
print("=======================================")
#print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

#confusion_matrix = confusion_matrix(Y_test,Y_pred)
#print (confusion_matrix)

print (logreg.coef_)

print ("LFU Correct / Incorrect Ratio")
total = lfuCorrect + lfuIncorrect
print ( lfuCorrect / total )

print ("LRU Correct / Incorrect Ratio")
total = lruCorrect + lruIncorrect
print ( lruCorrect / total )

c=0

logRegIncorrect = 0
logRegCorrect = 0

for i in range(int(len(X_test)/cache_size)):
    Y_pred_prob = logreg.predict_proba(X_test[i*cache_size:(i+1)*cache_size])
    Y_pred_current = Y_getMinPredict(Y_pred_prob)
    Y_test_current = Y_test[i*cache_size:(i+1)*cache_size]
    assert(Counter(Y_test_current)[1] == eviction)
    for j in range(len(Y_test_current)):
        if np.equal(Y_test_current[j], Y_pred_current[j]):
            logRegCorrect +=1
        else:
            logRegIncorrect +=1

print ("logRegCorrect = " + str(logRegCorrect))
print ("logRegInorrect = " + str(logRegIncorrect))
print ("correct = " + str(logRegCorrect / ( logRegCorrect + logRegIncorrect)))


def hitRate(blocktrace, frame):
    LFUDict = defaultdict(int)
    LRUQ = []
    CacheTS = defaultdict(int)
    CachePID = defaultdict(int)

    hit, miss = 0, 0

    C = []
    evictCacheIndex = np.array([])
    count=0
    seq_number = 0
    for block in tqdm(blocktrace, desc="OPT"):
        LFUDict[block] +=1
        CacheTS [blocktrace[seq_number]] = timestamp[seq_number]
        CachePID [blocktrace[seq_number]] = pid[seq_number]
        if block in C:
            hit+=1
            #if C.index(block) in evictCacheIndex:
            #    np.delete(evictCacheIndex, C.index(block))
                
            LRUQ.remove(block)
            LRUQ.append(block)
        else:
            evictPos = -1
            miss+=1
            if len(C) == frame:
                if len(evictCacheIndex) == 0: # call eviction candidates
                    X_test = getX(LRUQ, LFUDict, C, CacheTS, CachePID)
                    Y_pred_prob = logreg.predict_proba(X_test)
                    # index of cache blocks that should be removed
                    evictCacheIndex = Y_getBlockSeq(Y_pred_prob)

                # evict from cache
                evictPos = evictCacheIndex[0]
                evictBlock = C[evictPos]
                LRUQ.remove(evictBlock)
                del CacheTS [evictBlock]
                del CachePID [evictBlock]
            if evictPos is -1:
                C.append(block)
            else:
                C[evictPos] = block
                np.delete(evictCacheIndex, 0)
            LRUQ.append(block)
            CacheTS [blocktrace[seq_number]] = timestamp[seq_number]
            CachePID [blocktrace[seq_number]] = pid[seq_number]
        seq_number += 1

    hitrate = hit / (hit + miss)
    print(hitrate)
    return hitrate

x = blocktrace[-int(0.3 * len(blocktrace)):]

belady_opt(x, cache_size)
hitRate(x, cache_size)
LFU(x, cache_size)
# get LFU hit rate.!!!!!
# OPT HIT RATE: 0.07700060725633524




OPT: building index: 100%|████████████████████████████████████████████████| 652710/652710 [00:00<00:00, 1090176.44it/s]
OPT: 100%|██████████████████████████████████████████████████████████████████| 652710/652710 [00:29<00:00, 22316.71it/s]


0.22377472384366717
size of X 518000
Test Y
size of X 510000
size of Y 510000
Test Y_test
size of X_train 357000
size of X_test 153000
[[0.94833497 0.05166503]]
0
[[  1.21755115  -4.20131699 121.72285148]]
LFU Correct / Incorrect Ratio
0.8208185328185328
LRU Correct / Incorrect Ratio
0.8216756756756757
logRegCorrect = 151454
logRegInorrect = 1546
correct = 0.9898954248366013


OPT: building index: 100%|████████████████████████████████████████████████| 195813/195813 [00:00<00:00, 1356152.06it/s]
OPT: 100%|██████████████████████████████████████████████████████████████████| 195813/195813 [00:08<00:00, 22437.72it/s]


0.3134112648291993


OPT: 100%|██████████████████████████████████████████████████████████████████| 195813/195813 [00:04<00:00, 40499.39it/s]


0.024257837835077345


100%|███████████████████████████████████████████████████████████████████████| 195813/195813 [00:12<00:00, 15658.94it/s]


0.25379315980042183

In [48]:
filename = "ikki-110108-112108.1.blkparse"

df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']

blocktrace = df['blockNo'].tolist()

len(blocktrace)

652710

In [49]:
belady_opt(blocktrace:, cache_size)
hitRate(blocktrace, cache_size)
LFU(blocktrace, cache_size)

SyntaxError: invalid syntax (<ipython-input-49-e7a59cc91dcd>, line 1)

In [40]:
652710*0.3

195813.0