In [1]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [7]:
from collections import defaultdict, deque, Counter
import numpy as np
from tqdm import tqdm_notebook as tqdm 
import pandas as pd

#from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [8]:
# dummy maxmimum position variable. assign the position of blocks that 
# will never get accessed a value greater than this value. This way OPT
# can be fooled to think that the block will be accessed but at a position
# far-far-away in time.
infinite_index = 1000000000 
sample_interval = 1500
num_params = 3
eviction = 1

In [9]:
filename = "cheetah.cs.fiu.edu-110108-113008.1.blkparse"
df = pd.read_csv(filename, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
blocktrace = df['blockNo'].tolist()
len(blocktrace)

1322890

filename2 = 'cheetah.cs.fiu.edu-110108-113008.8.blkparse'
df = pd.read_csv(filename2, sep=' ',header = None)
df.columns = ['timestamp','pid','pname','blockNo', \
              'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash']
blocktrace2 = df['blockNo'].tolist()
len(blocktrace2)

In [64]:
lruCorrect = 0
lruIncorrect = 0

def lruPredict(C,LRUQ,Y_OPT):
    global lruCorrect, lruIncorrect
    Y_current = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(LRUQ) - i
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lruCorrect+=1
        else:
            lruIncorrect+=1
    return Y_current

'''
    given C, use LFUDict to find eviction number of blocks from the Cache
    compare it with Y_OPT and store number of places the two differ
    The number of correct and incorrect predictions with respect to OPT.
'''

lfuCorrect = 0
lfuIncorrect = 0

def lfuPredict(C,LFUDict,Y_OPT):
    global lfuCorrect, lfuIncorrect
    Y_current = []
    KV = defaultdict()
    for e in C:
        KV[e] = LFUDict[e]
    KV_sorted = Counter(KV)
    evict_dict = dict(KV_sorted.most_common(eviction))
    for e in C:
        if e in evict_dict:
            Y_current.append(1)
        else:
            Y_current.append(0)
    for i in range(len(Y_current)):
        if Y_current[i] is Y_OPT[i]:
            lfuCorrect+=1
        else:
            lfuIncorrect+=1
    return Y_current

# return "eviction" blocks that are being accessed furthest
# from the cache that was sent to us.

def getY(C,D):
    global infinite_index, eviction
    #KV = defaultdict(int)
    Y_current = []
#     for e in C:
#         if len(OPT[e]) is not 0:
#             KV[e] = OPT[e][0]
#         else:
#             KV[e] = infinite_index
#             infinite_index-=1
    # extract "eviction" blocks from KV_sorted hashmap
    KV_sorted = Counter(D)
    evict_dict = dict(KV_sorted.most_common(eviction))
#     print(D)
#     print(C)
    for e in C:
        if e in evict_dict.values():
            #print('jaaa')
            Y_current.append(1)
        else:
            Y_current.append(0)
    return Y_current

def getLFURow(LFUDict, C):
    x_lfurow = []
    for e in C:
        x_lfurow.append(LFUDict[e])
    norm = x_lfurow / np.linalg.norm(x_lfurow)
    return norm
    
def getLRURow(LRUQ, C):
    x_lrurow = []
    KV = defaultdict(int)
    for i in range(len(LRUQ)):
        KV[LRUQ[i]] = len(C) - i
    for e in C:
        x_lrurow.append(KV[e])
    norm = x_lrurow / np.linalg.norm(x_lrurow)
    return norm

def getX(LRUQ, LFUDict, C):
    X_lfurow = getLFURow(LFUDict, C)
    X_lrurow = getLRURow(LRUQ, C)
    X_bno    = C / np.linalg.norm(C)
    return (np.column_stack((X_lfurow, X_lrurow, X_bno)))

# appends OPT sample to X, Y arrays

X = np.array([], dtype=np.int64).reshape(0,num_params)
Y = np.array([], dtype=np.int64).reshape(0,1)

# C - cache, LFUDict - dictionary containing block-> access frequency
# LRUQ - order of element access in Cache.

def populateData(frequency, recency, Cache, upcoming_index):
    '''
    Input: 
    Output:
    '''
    global X,Y
    Cache = list(Cache)
    Y_current = getY(Cache, upcoming_index)
    X_current = getX(recency, frequency, Cache)

    Y = np.append(Y, Y_current)
    X = np.concatenate((X,X_current))
    return Y_current


In [65]:
def belady_opt(blocktrace, frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    global sample_interval # interval of choice for sampling
    global infinite_index # should be a large integer
    
    block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = set()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
            block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            if i in upcoming_index:
                
                # delete old index
                del upcoming_index[i]
        
                if len(block_index[block]) is not 0:
                    # add new upcoming index
                    upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                    block_index[block].popleft()
                else:
                    # add a large integer as index
                    upcoming_index[infinite_index] = block
                    # increament large integer
                    infinite_index-=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                
                # evict the farthest block in future request from cache
                if len(upcoming_index) != 0:
                    
                    # find the farthest i.e. max_index in upcoming_index
                    max_index = max(upcoming_index)
                     #### regression extra part
                    if (i % sample_interval +1 == sample_interval):
                        Y_OPT = populateData(frequency, recency, Cache, upcoming_index)
                        #print(len(Y_OPT))
                        lruPredict(Cache,recency,Y_OPT)
                        lfuPredict(Cache,frequency,Y_OPT)
                        return upcoming_index
                    # remove the block with max_index from cache
                    Cache.remove(upcoming_index[max_index])
                    
                    # remove the block with max_index from recency dict
                    recency.remove(upcoming_index[max_index])
                    
                   
                    # remove max_index element from upcoming_index
                    del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
            if len(block_index[block]) != 0:
                
                # add upcoming index of block
                upcoming_index[block_index[block][0]] = block
               
                # remove the index from block_index 
                block_index[block].popleft()
            
            else:
                
                # add a large integer as index
                upcoming_index[infinite_index] = block
                
                # increament high number
                infinite_index -= 1
                
            
            # add block into Cache
            Cache.add(block)
            
            # add block into recency
            recency.append(block)
            
            
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [66]:
belady_opt(blocktrace, 1000)

HBox(children=(IntProgress(value=0, description='buidling index', max=1322890, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='sequence', max=1322890, style=ProgressStyle(description_width…



defaultdict(int,
            {3007: 488846200,
             3335: 508500288,
             3336: 508500296,
             3337: 508500304,
             7068: 508516672,
             7069: 508516680,
             7070: 508516688,
             1024680: 395126368,
             522462: 3822584,
             522463: 3822592,
             522464: 3822600,
             522465: 3822608,
             522466: 3822616,
             522467: 3822624,
             522468: 3822632,
             522469: 3822640,
             524147: 3822648,
             524148: 3822656,
             524149: 3822664,
             524150: 3822672,
             524151: 3822680,
             524152: 3822688,
             524153: 3822696,
             524154: 3822704,
             524155: 3822712,
             524156: 3822720,
             524157: 3822728,
             524158: 3822736,
             524159: 3822744,
             524160: 3822752,
             524161: 3822760,
             524162: 3822768,
             524163:

In [58]:
X_main = np.array([], dtype=np.int64).reshape(0,1000*num_params)
Y_main = np.array([], dtype=np.int64)
def convert_x(frame):
    global X, Y, X_main, Y_main
    for i in range(int(X.shape[0]/frame)):
        X_new = X[i*frame:(i+1)*frame].reshape(1,frame*num_params)
        Y_new = Y[i*frame:(i+1)*frame]
#         print(Y_new)
#         break
        Y_lab = np.where(Y_new==1)[0]
        X_main = np.concatenate((X_main,X_new))
        Y_main = np.append(Y_main,Y_lab)    

In [59]:
convert_x(1000)

In [60]:
X

array([[0.03143473, 0.04674045, 0.032247  ],
       [0.03143473, 0.03152518, 0.03224724],
       [0.03143473, 0.01466796, 0.03224747],
       ...,
       [0.03143473, 0.03163464, 0.03224723],
       [0.03143473, 0.02769399, 0.02845418],
       [0.03143473, 0.01477743, 0.03224747]])

In [61]:
Y_main

array([143], dtype=int64)

In [17]:
X_main.shape

(826, 3000)

In [18]:
Y_main.shape

(826,)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X_main, Y_main ,test_size=0.3, random_state=None,shuffle=True)

#Fitting Logistic Regression Model
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

print(confusion_matrix(Y_test,Y_pred))

Accuracy of logistic regression classifier on test set: 0.32
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 3]]


In [20]:
confusion_matrix(Y_test,Y_pred).diagonal()

array([0, 0, 0, 0, 0, 1, 0, 1, 3, 0, 0, 2, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 8, 0, 0, 0, 4, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,
       0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0, 0,
       0, 0, 4, 0, 3, 7, 0, 0, 0, 2, 0, 3, 0, 0, 1, 0, 5, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, 3], dtype=int64)

In [67]:
def getX2(LRUQ, LFUDict, C, frame):
    C = set(C)
    C = list(C)
    X_lfurow = getLFURow(LFUDict, C)
    X_lrurow = getLRURow(LRUQ, C)
    X_bno    = C / np.linalg.norm(C)
    oldreturn = np.concatenate((X_lfurow, X_lrurow, X_bno))
    pred =  logreg.predict(oldreturn.reshape(1,frame*num_params))
    return C[pred[0]]

In [68]:
def ourcache(blocktrace,frame):
    '''
    INPUT
    ==========
    blocktrace = list of block request sequence
    frame = size of the cache
    
    OUTPUT
    ==========
    hitrate 
    '''
    #global sample_interval # interval of choice for sampling
    global infinite_index # should be a large integer
    
    #block_index = defaultdict(deque) 
    # dictionary with block number as key and list
    # of index value in blocktrace
    
    #upcoming_index = defaultdict(int)
    # dictionary with index number as key and value as block
    
    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it's been requested so far
    
    recency = list()
    # list of block in order of their request
    
    Cache = deque()
    # Cache with block
    
    hit, miss = 0, 0
    
    # populate the block_index
    #for i, block in enumerate(tqdm(blocktrace, \
      #                        desc="buidling index", leave=False)):
     #   block_index[block].append(i)
        
    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):
        
        # increament the frequency number for the block
        frequency[block] += 1
        
        # make sure block has the value in block_index dictionary 
        # as current seq_number
        #if len(block_index[block]) != 0 and block_index[block][0] == i:
            
            # if yes, remove the first element of block_index[block]
        #    block_index[block].popleft()
        
        # if block exist in current cache
        if block in Cache:
            
            # increment hit
            hit += 1
            
            # update the recency
            recency.remove(block)
            recency.append(block)
            
            # update upcoming_index
            #if i in upcoming_index:
                
                # delete old index
             #   del upcoming_index[i]
        
              #  if len(block_index[block]) is not 0:
                    # add new upcoming index
               #     upcoming_index[block_index[block][0]] = block
                    # remove index from block_index
                #    block_index[block].popleft()
#                 else:
#                     # add a large integer as index
#                     upcoming_index[infinite_index] = block
#                     # increament large integer
#                     infinite_index+=1
           
        # block not in current cache
        else:
            
            # increament miss
            miss += 1
            
            # if cache has no free space
            if len(Cache) == frame:
                

                X_current = getX2(recency, frequency, Cache, frame)
                
                Cache.remove(X_current)
                
                # evict the farthest block in future request from cache
#                 if len(upcoming_index) != 0:
                    
#                     # find the farthest i.e. max_index in upcoming_index
#                     max_index = max(upcoming_index)
                    
#                     # remove the block with max_index from cache
#                     Cache.remove(upcoming_index[max_index])
                    
#                     # remove the block with max_index from recency dict
#                     recency.remove(upcoming_index[max_index])
                    
#                     # remove max_index element from upcoming_index
#                     del upcoming_index[max_index]
                    
            # add upcoming request of current block in upcoming_index
#             if len(block_index[block]) != 0:
                
#                 # add upcoming index of block
#                 upcoming_index[block_index[block][0]] = block
               
#                 # remove the index from block_index 
#                 block_index[block].popleft()
            
#             else:
                
#                 # add a large integer as index
#                 upcoming_index[infinite_index] = block
                
#                 # increament high number
#                 infinite_index += 1
                
            
            # add block into Cache
            Cache.append(block)
            
            # add block into recency
            recency.append(block)
            
            #### regression extra part
#             if (i % sample_interval +1 == sample_interval):
#                 Y_OPT = populateData(frequency, recency, Cache, block_index)
#                 lruPredict(Cache,recency,Y_OPT)
#                 lfuPredict(Cache,frequency,Y_OPT)
            
    # calculate hitrate
    hitrate = hit / (hit + miss)

    return hitrate

In [69]:
ourcache(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, description='sequence', max=20000, style=ProgressStyle(description_width='…



0.03115

In [24]:
def LRU(blocktrace, frame):
    
    cache = set()
    recency = deque()
    hit, miss = 0, 0
    
    for block in tqdm(blocktrace, leave=False):
        
        if block in cache:
            recency.remove(block)
            recency.append(block)
            hit += 1
            
        elif len(cache) < frame:
            cache.add(block)
            recency.append(block)
            miss += 1
            
        else:
            cache.remove(recency[0])
            recency.popleft()
            cache.add(block)
            recency.append(block)
            miss += 1
    
    hitrate = hit / (hit + miss)
    return hitrate

In [None]:
0.03785

In [25]:
LRU(blocktrace[:20000],1000)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))



0.0401

### Neural Network

In [26]:
from sklearn.neural_network import MLPClassifier

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X_main, Y_main ,test_size=0.3, random_state=None,shuffle=True)

nn = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(102, ), random_state=1)

In [32]:
nn.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(102,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [33]:
Y_pred = nn.predict(X_test)


In [34]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(nn.score(X_test, Y_test)))

print(confusion_matrix(Y_test,Y_pred))

Accuracy of logistic regression classifier on test set: 0.48
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 2]]
