## Dependencies

In [5]:
!pip install tqdm
from tqdm import tqdm_notebook as tqdm 
import numpy as np
from collections import deque, defaultdict
import timeit
import pandas as pd
import random
from collections import Counter
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import sys



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


### Global Variables

In [6]:
# Training trace: read the space-separated `.blkparse` file, name its nine
# columns, and keep the first half of the requested block numbers.
train = "DATA/cheetah.cs.fiu.edu-110108-113008.3.blkparse"

df = pd.read_csv(train, sep=' ', header=None)
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo',
    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# Slice before converting so only the retained half becomes a Python list.
trainBlockTrace = df['blockNo'].iloc[:len(df) // 2].tolist()

len(trainBlockTrace)

11687258

In [7]:
# Test trace: same parsing as the training trace, applied to a different
# `.blkparse` file; only the first half of the block numbers is kept.
test = "DATA/cheetah.cs.fiu.edu-110108-113008.4.blkparse"

df = pd.read_csv(test, sep=' ', header=None)
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo',
    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# Slice before converting so only the retained half becomes a Python list.
testBlockTrace = df['blockNo'].iloc[:len(df) // 2].tolist()

len(testBlockTrace)

10344955

In [38]:
# Cache capacity in blocks; passed as `frame` to the simulators below.
cache_size = 1000
# Number of eviction candidates `hitRate2` takes from one model prediction
# (20% of the cache capacity).
eviction = int(cache_size * 0.2)

### Files (Do not run unnecessarily)

In [16]:
# Reload of the training trace (same file and same steps as the cell under
# "Global Variables"); re-run only if `trainBlockTrace` has to be rebuilt.
train = "DATA/cheetah.cs.fiu.edu-110108-113008.3.blkparse"

df = pd.read_csv(train, sep=' ', header=None)
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo',
    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# First half of the block-number column, as a plain Python list.
trainBlockTrace = df['blockNo'].iloc[:len(df) // 2].tolist()

len(trainBlockTrace)

11687258

In [17]:
# Reload of the test trace (same file and same steps as the cell under
# "Global Variables"); re-run only if `testBlockTrace` has to be rebuilt.
test = "DATA/cheetah.cs.fiu.edu-110108-113008.4.blkparse"

df = pd.read_csv(test, sep=' ', header=None)
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo',
    'blockSize', 'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# First half of the block-number column, as a plain Python list.
testBlockTrace = df['blockNo'].iloc[:len(df) // 2].tolist()

len(testBlockTrace)

10344955

## LFU

In [9]:
def LFU(blocktrace, frame):
    '''
    Simulate a least-frequently-used cache over a block trace.

    INPUT
    ============
    blocktrace - iterable of (hashable) blocks in sequence of request
    frame - int value for capacity of the cache

    OUTPUT
    ============
    hitrate (float) - hits / requests; 0.0 when the trace is empty

    A block's frequency is the number of times it has been requested since
    the start of the trace, so a block that is evicted and later re-admitted
    keeps its earlier count. Ties are broken in favour of the block that was
    admitted to the cache first.
    '''

    # requests seen so far for every block, cached or not
    frequency = defaultdict(int)
    # block -> frequency for the blocks currently cached; the keys are the
    # cache contents, so no separate set is required
    cache_frequency = {}

    hit, miss = 0, 0

    for block in tqdm(blocktrace, leave=False):
        frequency[block] += 1

        if block in cache_frequency:
            hit += 1
            cache_frequency[block] += 1
            continue

        miss += 1

        # a cache with no capacity can never admit a block
        if frame <= 0:
            continue

        if len(cache_frequency) >= frame:
            # evict the cached block with the smallest frequency
            victim = min(cache_frequency, key=cache_frequency.get)
            del cache_frequency[victim]

        cache_frequency[block] = frequency[block]

    total = hit + miss
    # guard against an empty trace instead of raising ZeroDivisionError
    return hit / total if total else 0.0

## LRU

In [10]:
def LRU(blocktrace, frame):
    '''
    Simulate a least-recently-used cache over a block trace.

    INPUT
    ============
    blocktrace - iterable of (hashable) blocks in sequence of request
    frame - int value for capacity of the cache

    OUTPUT
    ============
    hitrate (float) - hits / requests; 0.0 when the trace is empty
    '''
    # Local import: the notebook's import cell only pulls deque/defaultdict
    # from collections.
    from collections import OrderedDict

    # Keys are the cached blocks, ordered from least to most recently used.
    # move_to_end / popitem are O(1), unlike deque.remove, which scans the
    # whole recency queue on every hit.
    cache = OrderedDict()
    hit, miss = 0, 0

    for block in tqdm(blocktrace, leave=False):

        if block in cache:
            cache.move_to_end(block)
            hit += 1
            continue

        miss += 1

        # a cache with no capacity can never admit a block
        if frame <= 0:
            continue

        if len(cache) >= frame:
            # drop the least recently used block
            cache.popitem(last=False)

        cache[block] = None

    total = hit + miss
    # guard against an empty trace instead of raising ZeroDivisionError
    return hit / total if total else 0.0

# Second Approach (Maharshi)

In [11]:
def belady_opt_2(blocktrace, frame):
    '''
    Simulate Belady's optimal replacement (evict the cached block whose next
    request is farthest in the future) and record labelled eviction samples.

    INPUT
    ============
    blocktrace - list of blocks in sequence of request
    frame - int value for capacity of the cache

    OUTPUT
    ============
    (1) hitrate (float) - hits / requests; 0.0 when the trace is empty
    (2) dataset (np.array, shape (n_samples, 3*frame + 1)) - one row per
        sampled eviction. For every cache slot, in slot order, the row holds
        (block number, recency position, frequency), each L2-normalised
        across the cache; the last column is the slot index that was
        evicted. A sample is taken only when an eviction happens at a
        request index i with i % 1000 == 999.
    '''

    infinite_index = 100 * len(blocktrace)
    # stand-in "next request" index for blocks that are never requested
    # again; larger than every real index and decremented after each use so
    # that the keys of upcoming_index stay unique

    block_index = defaultdict(deque)
    # dictionary with block number as key and queue
    # of its index values in blocktrace

    upcoming_index = defaultdict(int)
    # dictionary with next-request index as key and cached block as value

    frequency = defaultdict(int)
    # dictionary of block as key and number
    # of times it has been requested so far

    recency = list()
    # cached blocks ordered from least to most recently requested

    Cache = deque()
    # cached blocks; the position of a block is its slot number

    samples = []
    # sampled rows; stacked once at the end rather than re-allocating the
    # whole array with np.vstack on every sample

    hit, miss = 0, 0

    # populate the block_index
    for i, block in enumerate(tqdm(blocktrace, \
                              desc="buidling index", leave=False)):
        block_index[block].append(i)

    # sequential block requests start
    for i, block in enumerate(tqdm(blocktrace, desc="sequence", leave=False)):

        # increment the frequency number for the block
        frequency[block] += 1

        # drop the current request from the block's queue of pending indices
        if len(block_index[block]) != 0 and block_index[block][0] == i:
            block_index[block].popleft()

        # if block exists in current cache
        if block in Cache:

            hit += 1

            # move the block to the most-recent end
            recency.remove(block)
            recency.append(block)

            # replace the block's old next-request entry with its new one
            if i in upcoming_index:

                del upcoming_index[i]

                # `!= 0` (value comparison); the original `is not 0` tested
                # object identity against an int literal
                if len(block_index[block]) != 0:
                    upcoming_index[block_index[block][0]] = block
                    block_index[block].popleft()
                else:
                    upcoming_index[infinite_index] = block
                    infinite_index -= 1

        # block not in current cache
        else:

            miss += 1

            # if cache has no free space
            if len(Cache) == frame:

                # the farthest next request identifies the block to evict
                max_index = max(upcoming_index)
                victim = upcoming_index[max_index]
                victim_slot = Cache.index(victim)

                if i % 1000 == 999:
                    # every block appears once in recency, so one pass gives
                    # all positions instead of a list.index scan per slot
                    position = {b: k for k, b in enumerate(recency)}

                    blockNo = np.array([b for b in Cache])
                    blockNo = blockNo / np.linalg.norm(blockNo)
                    recency_ = np.array([position[b] for b in Cache])
                    recency_ = recency_ / np.linalg.norm(recency_)
                    frequency_ = np.array([frequency[b] for b in Cache])
                    frequency_ = frequency_ / np.linalg.norm(frequency_)
                    stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)
                    samples.append(np.append(stack, victim_slot))

                # overwrite the evicted block's slot with the new block
                Cache[victim_slot] = block

                # forget the evicted block's recency and next-request entry
                recency.remove(victim)
                del upcoming_index[max_index]

            else:

                # free slot available: add block into Cache
                Cache.append(block)

            # the new block is the most recently requested
            recency.append(block)

            # register the next request of the current block
            if len(block_index[block]) != 0:
                upcoming_index[block_index[block][0]] = block
                block_index[block].popleft()
            else:
                upcoming_index[infinite_index] = block
                infinite_index -= 1

    # calculate hitrate (an empty trace yields 0.0 instead of dividing by 0)
    total = hit + miss
    hitrate = hit / total if total else 0.0

    if samples:
        dataset = np.vstack(samples)
    else:
        dataset = np.array([]).reshape(0,3*frame+1)

    return hitrate, dataset

### TrainHitrate - create train data for training ML
#### Make sure to clear global variables

In [None]:
# Run OPT over the training trace; every row of `trainData` is a feature
# vector followed by the evicted slot index in the last column.
trainHitrate, trainData = belady_opt_2(trainBlockTrace, cache_size)
X_train, Y_train = trainData[:, :-1], trainData[:, -1].astype(int)

HBox(children=(IntProgress(value=0, description='buidling index', max=11687258, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='sequence', max=11687258, style=ProgressStyle(description_widt…

In [13]:
# X = trainData[:,:-1]
# Y = trainData[:,-1].astype(int)

### TestHitrate - create test data for testing ML
#### Make sure to clear global variables

In [None]:
# Run OPT over the test trace; every row of `testData` is a feature vector
# followed by the evicted slot index in the last column.
testHitrate, testData = belady_opt_2(testBlockTrace, cache_size)
X_test, Y_test = testData[:, :-1], testData[:, -1].astype(int)

#### Train-Test Split

In [52]:
# `X` and `Y` were previously assigned only in a commented-out cell, so this
# cell raised NameError on a fresh kernel. Build them from `trainData` here.
X = trainData[:, :-1]
Y = trainData[:, -1].astype(int)

# NOTE(review): this rebinds X_train/Y_train/X_test/Y_test, replacing the
# arrays derived from the separate train and test traces above - confirm
# which evaluation set-up is intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, \
                                                    random_state=None, shuffle=True)

### Machine Learning

#### Logistic Regression

In [17]:
# Fit a logistic-regression classifier that predicts the evicted slot index
# from the cache features, then report its score and confusion matrix on
# the held-out data.
logreg = LogisticRegression().fit(X_train, Y_train)

print(logreg.score(X_test, Y_test))
print(confusion_matrix(Y_test, logreg.predict(X_test)))

0.029234386180108353
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 5 ... 0 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 1]]


#### KNN 

In [63]:
# Fit a 3-nearest-neighbour classifier on the same features and labels, then
# report its score and confusion matrix on the held-out data.
KNN = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

print(KNN.score(X_test, Y_test))
print(confusion_matrix(Y_test, KNN.predict(X_test)))

0.019114790963916998
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 0 0 2]
 ...
 [0 1 5 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 2]]


#### Neural Net

In [23]:
# Fit a multi-layer perceptron with default settings on the same features
# and labels, then report its score and confusion matrix on the held-out
# data.
NN = MLPClassifier().fit(X_train, Y_train)

print(NN.score(X_test, Y_test))
print(confusion_matrix(Y_test, NN.predict(X_test)))



0.032505366452008584
[[ 0  0  0 ...  0  0  1]
 [ 0  0  1 ...  0  0  0]
 [ 1  0  4 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 2  0  0 ...  0  0  0]
 [ 0 12  0 ...  0  3  1]]


## ML Policy

In [18]:
def hitRate2(blocktrace, frame, model, n_evict=None):
    '''
    Simulate a cache whose eviction victims are chosen by a trained
    classifier and return the resulting hit rate.

    INPUT
    ============
    blocktrace - iterable of blocks in sequence of request
    frame - int value for capacity of the cache
    model - fitted classifier exposing predict_proba(X); when it also has a
            classes_ attribute, the probability columns are translated to
            cache slots through it
    n_evict - number of eviction candidates taken from one prediction;
              defaults to the notebook-level `eviction` setting

    OUTPUT
    ============
    hitrate (float) - hits / requests; 0.0 when the trace is empty
                      (the value is also printed)

    When the cache is full and no candidates are pending, the model is asked
    once for slot probabilities and the n_evict most probable slots are
    queued. The following misses consume that queue one slot at a time
    without consulting the model again.
    '''
    if n_evict is None:
        # fall back to the global configured in the settings cell
        n_evict = eviction

    LFUDict = defaultdict(int)
    # number of requests seen so far for every block

    LRUQ = []
    # cached blocks ordered from least to most recently requested

    C = []
    # cache contents; the position of a block is its slot number

    evictCacheIndex = np.array([], dtype=int)
    # pending slot numbers to evict, most probable first

    hit, miss = 0, 0

    for block in tqdm(blocktrace, desc="OPT"):
        LFUDict[block] += 1

        if block in C:
            hit += 1
            LRUQ.remove(block)
            LRUQ.append(block)
            continue

        miss += 1

        if len(C) == frame:
            if len(evictCacheIndex) == 0:
                # every cached block appears once in LRUQ, so one pass gives
                # all recency positions instead of a list.index scan per slot
                position = {b: k for k, b in enumerate(LRUQ)}

                blockNo = np.asarray(C) / np.linalg.norm(C)
                recency_ = np.array([position[b] for b in C])
                recency_ = recency_ / np.linalg.norm(recency_)
                frequency_ = np.array([LFUDict[b] for b in C])
                frequency_ = frequency_ / np.linalg.norm(frequency_)
                stack = np.column_stack((blockNo, recency_, frequency_)).reshape(1,frame*3)

                Y_pred_prob = model.predict_proba(stack)
                # keep at least one candidate so a small n_evict cannot leave
                # the queue empty
                ranked = Y_pred_prob.argsort()[0][::-1][:max(1, n_evict)]

                # predict_proba columns follow model.classes_, which omits
                # any slot that never occurred as a training label; map the
                # column indices back to real slot numbers
                classes = getattr(model, "classes_", None)
                if classes is None:
                    evictCacheIndex = ranked
                else:
                    evictCacheIndex = np.asarray(classes)[ranked]

            # take the next pending slot and replace its block
            evictPos = int(evictCacheIndex[0])
            evictCacheIndex = np.delete(evictCacheIndex, 0)
            LRUQ.remove(C[evictPos])
            C[evictPos] = block
        else:
            # free slot available
            C.append(block)

        LRUQ.append(block)

    total = hit + miss
    # guard against an empty trace instead of raising ZeroDivisionError
    hitrate = hit / total if total else 0.0
    print(hitrate)
    return hitrate

#### OPT, LRU, LFU

In [19]:
# Hit rate returned by belady_opt_2 for the test trace.
testHitrate

0.05530570215143517

In [20]:
# LRU hit rate on the test trace with the same cache capacity.
LRU(testBlockTrace, cache_size)

HBox(children=(IntProgress(value=0, max=10344955), HTML(value='')))

0.050023707208006224

In [21]:
# LFU hit rate on the test trace with the same cache capacity.
LFU(testBlockTrace, cache_size)

HBox(children=(IntProgress(value=0, max=10344955), HTML(value='')))

0.04888237793204514

#### Logistic Regression Results

In [36]:
# Hit rate on the test trace when evictions are chosen by the
# logistic-regression model.
LRhitrate = hitRate2(testBlockTrace, cache_size, logreg)

HBox(children=(IntProgress(value=0, description='OPT', max=10344955, style=ProgressStyle(description_width='in…

0.04976039045119094


#### KNN Results

In [45]:
# Hit rate on the test trace when evictions are chosen by the KNN model.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, so no KNN
# hit rate was produced.
KNNhitrate = hitRate2(testBlockTrace, cache_size, KNN)

HBox(children=(IntProgress(value=0, description='OPT', max=10344955, style=ProgressStyle(description_width='in…

KeyboardInterrupt: 

#### NeuralNet Results

In [37]:
# Hit rate on the test trace when evictions are chosen by the neural-network
# model.
NNhitrate = hitRate2(testBlockTrace, cache_size, NN)

HBox(children=(IntProgress(value=0, description='OPT', max=10344955, style=ProgressStyle(description_width='in…

0.04967165154415848
