In [2]:
from collections import defaultdict, deque, Counter
import numpy as np
from tqdm import tqdm_notebook as tqdm 
import pandas as pd

#from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load the block-I/O trace: a space-delimited text file without a header row.
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"
df = pd.read_csv(filename, header=None, sep=' ')

# Name the nine columns of the trace.
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo', 'blockSize',
    'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# Keep just the ordered sequence of requested block numbers as a plain list.
blocktrace = df['blockNo'].tolist()
len(blocktrace)

1322890

In [4]:
def belady_opt(blocktrace, frame, sample_every=1000):
    '''
    Simulate Belady's optimal replacement (evict the cached block whose
    next request lies farthest in the future) over a block trace, and
    record snapshots of the cache state together with the block that was
    evicted, for use as supervised training data.

    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache (must be >= 1)
    sample_every = sampling period for the dataset: a snapshot is taken
                   when an eviction happens at a request index i with
                   i % sample_every == sample_every - 1.
                   None or 0 disables sampling. Default 1000.

    OUTPUT
    ==========
    hitrate = hits / total requests (0.0 for an empty trace)
    dataset = float array of shape (n_samples, 3*frame + 1). Each row is
              (blockNo, recency position, frequency) for every cached
              block, flattened, followed by the evicted block number.
              Cached blocks appear in set-iteration order, which is not
              guaranteed to be stable from one row to the next.

    RAISES
    ==========
    ValueError if frame < 1
    '''
    if frame < 1:
        # With frame == 0 the "cache is full" test would match an empty
        # cache, nothing could be evicted, and the cache would then grow
        # without bound, so reject it up front.
        raise ValueError("frame must be a positive integer")

    # Sentinel "next use" keys for blocks that are never requested again.
    # They start above every real trace index and are incremented after
    # each use so that keys in upcoming_index stay unique.
    infinite_index = 100 * len(blocktrace)

    # block -> deque of the positions in blocktrace where it is requested
    block_index = defaultdict(deque)

    # next-request position -> cached block (one entry per cached block)
    upcoming_index = {}

    # block -> number of times it has been requested so far
    frequency = defaultdict(int)

    # cached blocks ordered from least to most recently requested
    recency = list()

    # blocks currently held in the cache
    Cache = set()

    # sampled rows; stacked once at the end instead of growing an array
    # inside the loop (which copies the whole dataset on every sample)
    samples = []
    row_width = 3 * frame + 1

    hit, miss = 0, 0

    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)

    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):

        frequency[block] += 1

        # Drop the current position if it is still at the head of the
        # queue; afterwards `future` only holds positions after i.
        future = block_index[block]
        if future and future[0] == i:
            future.popleft()

        if block in Cache:
            hit += 1

            # move the block to the most-recent end
            recency.remove(block)
            recency.append(block)

            # the entry keyed by i was this block's "next use"; replace
            # it with the following use (or a sentinel if there is none)
            if i in upcoming_index:
                del upcoming_index[i]

                if future:
                    upcoming_index[future.popleft()] = block
                else:
                    upcoming_index[infinite_index] = block
                    infinite_index += 1

        else:
            miss += 1

            # cache full: evict the block requested farthest in the future
            if len(Cache) == frame and upcoming_index:
                max_index = max(upcoming_index)
                victim = upcoming_index[max_index]

                if sample_every and i % sample_every == sample_every - 1:
                    cached = list(Cache)
                    # one pass over recency instead of list.index per block
                    position = {b: p for p, b in enumerate(recency)}
                    features = np.column_stack((
                        np.array(cached),
                        np.array([position[b] for b in cached]),
                        np.array([frequency[b] for b in cached]),
                    )).reshape(-1)
                    samples.append(np.append(features, victim))

                Cache.remove(victim)
                recency.remove(victim)
                del upcoming_index[max_index]

            # register the next use of the incoming block
            if future:
                upcoming_index[future.popleft()] = block
            else:
                upcoming_index[infinite_index] = block
                infinite_index += 1

            Cache.add(block)
            recency.append(block)

    if samples:
        dataset = np.vstack(samples).astype(float)
    else:
        dataset = np.empty((0, row_width))

    # guard against an empty trace
    total = hit + miss
    hitrate = hit / total if total else 0.0

    return hitrate, dataset

In [5]:
# Run the simulation over the full trace with a 500-block cache.
hitrate, dataset = belady_opt(blocktrace=blocktrace, frame=500)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



In [6]:
# Display the sampled rows returned by belady_opt (numpy abbreviates large arrays).
dataset

array([[2.83193344e+08, 3.90000000e+01, 2.00000000e+00, ...,
        2.95000000e+02, 2.00000000e+00, 2.83197056e+08],
       [2.83193344e+08, 3.90000000e+01, 2.00000000e+00, ...,
        2.95000000e+02, 2.00000000e+00, 2.83210688e+08],
       [2.83193344e+08, 3.60000000e+01, 2.00000000e+00, ...,
        4.15000000e+02, 1.00000000e+00, 2.83217632e+08],
       ...,
       [2.77180416e+08, 3.27000000e+02, 5.00000000e+00, ...,
        3.26000000e+02, 5.00000000e+00, 7.47143120e+07],
       [2.77180416e+08, 3.27000000e+02, 5.00000000e+00, ...,
        3.26000000e+02, 5.00000000e+00, 7.47233840e+07],
       [2.77180416e+08, 3.27000000e+02, 5.00000000e+00, ...,
        3.26000000e+02, 5.00000000e+00, 7.48279760e+07]])

In [7]:
# (number of sampled evictions, 3*frame + 1 columns)
dataset.shape

(1254, 1501)

In [None]:
# Every column but the last is a feature; the last column is the block that
# was evicted, cast to int so it can be used as a class label.
# shuffle=False keeps the rows in their original order, so the final 30% of
# the samples form the held-out test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset[:, :-1],
    dataset[:, -1].astype(int),
    test_size=0.3,
    random_state=None,
    shuffle=False,
)

#Fitting Logistic Regression Model
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

# Predict once and reuse the predictions for both metrics, so the accuracy
# and the confusion matrix are computed from the same Y_pred.
Y_pred = logreg.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy_score(Y_test, Y_pred)))

print(confusion_matrix(Y_test,Y_pred))