<a href="https://colab.research.google.com/github/kjiyun/Deepcache-LSTM/blob/main/Deepcache_Dataset2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import random
import math
import csv
import sys
import os


""""###########################  initialization  ####################################################################"""
# Module-level configuration and shared mutable state for the trace generator.
# The functions below read and update these names directly.
NUM_OF_OBJECTS = 1000                            # requested number of objects; overwritten by
                                                # generateObjectsIntroductionInfo() with the rounded total
lambdaFile = 'hourly_request_ratio.csv'         # file with hourly request ratio
lambdas = []                                    # stores the diurnal ratios for non-homogeneous Poisson
curTime = []                                    # for each object shows the last timeStamp that it was requested
objectPopularities = []                         # Contains object popularity
M = 178310                                      # max frequency for HPC dataset
traceType = 'HPC'                               # key into the K table used by generatePopularities()
hourly_request_function_degree = 2              # the degree for the function that sets the objects per bin pattern, X^2
dayGaps = []                                    # interarrival between days
numOfObjectsIntroduced = []                     # number of objects generated in each day
interArrivals = []                              # generates the interarrival time between objects introduced in a day
lifeSpanType = []                               # for each object it holds the type of its lifeSpan
ObjectsLifeSpan = []                            # the length of lifeSpan value for each object
requestGenInfo = {}                             # for each object it holds the info about requests
startTimes = {}                                 # object ID -> introduction time (day count + bin interval)
introductionOrder = []                          # random order for introducing objects in a day
sortedOnIntoTime = []                           # object IDs sorted by their startTimes value
requests = []                                   # generated requests
objectLengths = []                              # rows of [objID, startDay, lifeSpan, endDay, freq]
# Start from the smallest representable integer so that the first endDay seen
# by generateObjects() always replaces it (sys.maxint only exists on Python 2).
if sys.version_info[0] < 3:
    maxEndDay = -sys.maxint - 1
else:
    maxEndDay = -sys.maxsize - 1
WITH_INTRODUCTION = True                        # flag to allow objects to be introduced at any time
WITH_DAY_GAPS_INTRODUCTION = False              # If True, introduce gaps between the objects introduction days,
                                                # otherwise objects are introduced each day
GENERATE_NEW_HOURLY_REQUEST_RATIO = False       # If True, a new 'hourly_request_ratio.csv' is generated
MIN_REQ_PER_DAY_THRESHOLD = 1500                # min number of requests to be generated for each object in a day
MIN_OBJ_INTRODCUED_PER_DAY_THRESHOLD = 0.0035 * NUM_OF_OBJECTS  # min number of objects to be generated in a day
MAX_OBJ_INTRODCUED_PER_DAY_THRESHOLD = 0.0095 * NUM_OF_OBJECTS  # max number of objects to be generated in a day

# Create the output directory if it does not exist.
OUTPUTDIR = './Datasets'
if not os.path.isdir(OUTPUTDIR):
    os.makedirs(OUTPUTDIR)

# A missing ratio file forces regeneration; use lambdaFile so the path that is
# checked, written and later read by loadDiurnalRatios() is always the same.
if not os.path.isfile(lambdaFile):
    GENERATE_NEW_HOURLY_REQUEST_RATIO = True

if GENERATE_NEW_HOURLY_REQUEST_RATIO:
    print('Generating hourly request ratio file ...')
    # 24 random weights normalised so that they sum to 1.
    rands = np.random.randint(1, 100, 24)
    rands = rands/float(np.sum(rands))
    index = np.arange(1, 25)

    res = lambdaFile
    # One "hour,ratio" row per line, with no trailing newline after the last
    # row. The context manager closes the file even if a write fails.
    with open(res, 'w') as f:
        f.write('\n'.join(str(hour) + ',' + str(ratio) for hour, ratio in zip(index, rands)))


def initialize():
    """
    runs the whole generation pipeline: loads the diurnal ratios, builds the
    objects, then generates and saves their requests
    """
    global curTime

    loadDiurnalRatios()

    print('Generating Objects for Dataset ...')
    generateObjectsIntroductionInfo(traceType)
    generatePopularities(traceType, int(NUM_OF_OBJECTS))
    generateObjects()

    print('Generating Requests for Dataset ...')
    # One "last request time" slot per object; 0 means "not requested yet".
    curTime = [0 for _ in range(NUM_OF_OBJECTS)]
    generateRequests()


"""################################ Load diurnal ratios #############################################################"""
def loadDiurnalRatios():
    """
    appends the hourly request ratio (second CSV column) of every row in
    lambdaFile to the module-level list lambdas
    """
    # Read-only mode: the file is never written here, so it must not require
    # write permission. The with-block closes the file on exit.
    with open(lambdaFile, "r") as fi:
        for line in fi:
            fields = line.strip().split(',')
            if len(fields) < 2:
                # Blank or malformed row (e.g. a trailing newline): skip it.
                continue
            lambdas.append(float(fields[1]))


"""###########################  Object Popularity  ##################################################################"""
K = {'HPC': 30, 'HCL': 7}


def generatePopularities(traceType, N):
    """
    appends N popularity (frequency) values, for ranks 1..N, to
    objectPopularities; rank 1 gets the maximum frequency M and the values
    decay with exponent 0.8, flattened by the k-transformation K[traceType]
    """
    zipalpha = 0.8
    k = K[traceType]
    Mk = ((M-1)/k)+1
    objectPopularities.extend(
        (((float(Mk)/(math.pow((float(rank+k-1)/k), zipalpha)))-1)*k)+1
        for rank in range(1, N+1)
    )


"""########################  Object Type  ###########################################################################"""
def getObjectType():
    """
    draws the object type at random: 'news' when a uniform(0, 1) draw is at
    most 0.1 (about 10 % of objects), 'regular' otherwise
    """
    return 'news' if random.uniform(0, 1) <= 0.1 else 'regular'


"""##################### generating random variates #################################################################"""
def generatePoissonVariate(rand, lambda_poisson):
    """
    inverse-transform sample used for diurnal access generation:
    returns -ln(1 - rand) / lambda_poisson
    """
    logSurvival = math.log(1 - rand)
    return -1 * logSurvival / lambda_poisson


def generateParetoVariate(rand, alpha):
    """ returns (1/rand)^(1/alpha) for a uniform draw rand and shape alpha """
    base = 1/rand
    exponent = 1/alpha
    return math.pow(base, exponent)


def generateParetoScaledVariate(rand, alpha, beta):
    """ inverse of F(x) = 1 - (b/x)^a, x >= b: returns beta / (1 - rand)^(1/alpha) """
    tail = math.pow((1 - rand), (1/alpha))
    return beta / tail


def generateNormalVariate(mu, sigma):
    """
    RV generated using rejection method: an exponential candidate x is
    accepted with probability exp(-(x-1)^2/2), then mu + sigma*x or
    mu - sigma*x is returned depending on a third uniform draw
    """
    while True:
        u1 = random.uniform(0, 1)
        u2 = random.uniform(0, 1)
        x = -1*math.log(u1)
        if u2 > math.exp(-1*math.pow((x-1), 2)/2):
            # candidate rejected, draw a new pair
            continue
        # accepted: pick the sign of the deviation with a fresh draw
        if random.uniform(0, 1) > 0.5:
            return mu+(sigma*x)
        return mu-(sigma*x)


def generateLogNormalVariate(mu, sigma):
    """
    RV generated using rejection method: an exponential candidate x is
    accepted with probability exp(-(x-1)^2/2) and exp(mu + sigma*x) is
    returned (x is never negated here, so the exponent is at least mu for
    sigma >= 0)
    """
    while True:
        u1 = random.uniform(0, 1)
        u2 = random.uniform(0, 1)
        x = -1*math.log(u1)
        threshold = math.exp(-1*math.pow((x-1), 2)/2)
        if not u2 > threshold:
            return math.exp(mu+(sigma*x))


def generateExponentialVariate(rand, a):
    """ inverse-transform exponential sample with rate a: -(1/a) * ln(1 - rand) """
    negatedMean = -(1/a)
    return negatedMean*math.log(1-rand)


def generateRandVariate(dist, params, numOfVariates):
    """
    returns a list of numOfVariates random variates drawn from dist

    dist:   one of 'pareto', 'paretoScaled', 'normal', 'logNormal', 'exp',
            'poisson'; any other name yields an empty list
    params: dict with the distribution parameters: 'alpha' (pareto),
            'alpha' and 'beta' (paretoScaled), 'mu' and 'sigma'
            (normal, logNormal), 'mu' (exp, poisson)
    raises KeyError if a required parameter is missing from params
    """
    variates = []

    # Compare by value (==). `is` tests object identity, which only works for
    # string literals by accident of interning and raises a SyntaxWarning.
    if dist == 'pareto':
        alpha = params['alpha']
        for _ in range(numOfVariates):
            rand = random.uniform(0, 1)
            variates.append(generateParetoVariate(rand, alpha))

    elif dist == 'paretoScaled':
        alpha = params['alpha']
        beta = params['beta']
        for _ in range(numOfVariates):
            rand = random.uniform(0, 1)
            variates.append(generateParetoScaledVariate(rand, alpha, beta))

    elif dist == 'normal':
        mu = params['mu']
        sigma = params['sigma']
        for _ in range(numOfVariates):
            variates.append(generateNormalVariate(mu, sigma))

    elif dist == 'logNormal':
        mu = params['mu']
        sigma = params['sigma']
        for _ in range(numOfVariates):
            variates.append(generateLogNormalVariate(mu, sigma))

    elif dist == 'exp':
        mu = params['mu']
        for _ in range(numOfVariates):
            rand = random.uniform(0, 1)
            variates.append(generateExponentialVariate(rand, mu))

    elif dist == 'poisson':
        mu = params['mu']
        for _ in range(numOfVariates):
            rand = random.uniform(0, 1)
            variates.append(generatePoissonVariate(rand, mu))

    return variates


"""####################  Object Introduction Info  ##################################################################"""
def generateObjectsIntroductionInfo(typeMode):
    """
    generates gaps between introduction days based on either pareto or exponential distribution,
    the number of objects introduced on each day and the interarrival times between them

    typeMode: 'HPC' uses pareto day gaps (or a constant gap of 1 when
              WITH_DAY_GAPS_INTRODUCTION is False); any other value uses
              exponential day gaps
    Fills dayGaps, numOfObjectsIntroduced and interArrivals, and resets
    NUM_OF_OBJECTS to the sum of the rounded per-day object counts.
    """
    global NUM_OF_OBJECTS
    global numOfObjectsIntroduced

    tempNumOfObjectsIntroduced = []
    while sum(tempNumOfObjectsIntroduced) < NUM_OF_OBJECTS:
        # Compare by value; `is` on a string literal tests identity only.
        if typeMode == 'HPC':
            if WITH_DAY_GAPS_INTRODUCTION:
                pareto_alpha_objectIntro_hpc = 1.0164
                object_intro_days_gap = generateRandVariate('pareto', {'alpha': pareto_alpha_objectIntro_hpc}, 1)[0]
                # cap the gap at 20 days
                if object_intro_days_gap > 20:
                    object_intro_days_gap = 20
                dayGaps.append(object_intro_days_gap)
            else:
                dayGaps.append(1)

        else:
            exponential_mu_objectIntro_hpl = 4.2705
            object_intro_days_gap = generateRandVariate('exp', {'mu': exponential_mu_objectIntro_hpl}, 1)[0]
            dayGaps.append(object_intro_days_gap)

        # number of new objects generated in each introduction day Pareto dist
        pareto_alpha_numOfObjectsGeneration = 0.8
        pareto_beta_numOfObjectsGeneration = MIN_OBJ_INTRODCUED_PER_DAY_THRESHOLD
        numOfObjects_intro_in_day = generateRandVariate('paretoScaled', {'alpha': pareto_alpha_numOfObjectsGeneration,
                                                        'beta': pareto_beta_numOfObjectsGeneration}, 1)[0]
        if numOfObjects_intro_in_day > MAX_OBJ_INTRODCUED_PER_DAY_THRESHOLD:
            numOfObjects_intro_in_day = MAX_OBJ_INTRODCUED_PER_DAY_THRESHOLD
        tempNumOfObjectsIntroduced.append(numOfObjects_intro_in_day)

    # sort generated items
    tempNumOfObjectsIntroduced.sort()
    extra_days = 0
    if len(tempNumOfObjectsIntroduced) % 7 != 0:
        extra_days = len(tempNumOfObjectsIntroduced) % 7
        for i in range(extra_days):
            # generate random int to add these objects to other introduction days to generate full weeks of data
            # NOTE(review): this retries until a day with enough headroom is
            # drawn, so it never terminates if no such day exists.
            added = False
            while not added:
                u = random.randint(extra_days+1, len(tempNumOfObjectsIntroduced) - 1)
                if tempNumOfObjectsIntroduced[i] + tempNumOfObjectsIntroduced[u] < MAX_OBJ_INTRODCUED_PER_DAY_THRESHOLD:
                    tempNumOfObjectsIntroduced[u] += tempNumOfObjectsIntroduced[i]
                    added = True

    # Exclude the extra days after being added to other days
    tempNumOfObjectsIntroduced = tempNumOfObjectsIntroduced[extra_days:]
    tempNumOfObjectsIntroduced.sort()

    # Fill in the days by dividing the sorted data as following
    # This induces that more objects are introduced on Friday then Saturday, and so on.
    # The least number of objects are introduced on Tuesday.
    # Fri 1, Sat 2, Sun 3, Thu 4, Wed 5, Mon 6, Tuesday 7
    weeks = len(tempNumOfObjectsIntroduced) // 7
    FriIndex = weeks * 6
    SatIndex = weeks * 5
    SunIndex = weeks * 4
    MonIndex = weeks * 1
    TuesIndex = weeks * 0
    WedIndex = weeks * 2
    ThuIndex = weeks * 3

    # Each week is emitted in calendar order Mon..Sun, taking one value from
    # the slice of the sorted list that belongs to that weekday.
    weekdayOffsets = (MonIndex, TuesIndex, WedIndex, ThuIndex, FriIndex, SatIndex, SunIndex)
    for i in range(weeks):
        for offset in weekdayOffsets:
            numOfObjectsIntroduced.append(tempNumOfObjectsIntroduced[offset + i])

    # interarrivalTime for objects introduction in a day
    pareto_alpha_interArrival = 1.0073
    numOfDays = len(numOfObjectsIntroduced)
    # Round once instead of re-rounding the whole list for every day.
    roundedCounts = np.round(numOfObjectsIntroduced)
    for i in range(numOfDays):
        objectsCountInDay = int(roundedCounts[i])
        if WITH_INTRODUCTION:
            interArrivals.append(generateRandVariate('pareto', {'alpha': pareto_alpha_interArrival}, objectsCountInDay))
        else:
            interArrivals.append([0]*objectsCountInDay)
    NUM_OF_OBJECTS = int(sum(roundedCounts))


def generateObjectIntroductionOrder():
    """ returns a random permutation of the object IDs 1..len(objectPopularities) """
    zeroBasedIds = range(len(objectPopularities))
    shuffled = np.random.permutation(zeroBasedIds)
    return shuffled+1


"""#########################  Object lifespan  ######################################################################"""
def generateLifeSpans(numOfObjects, objMode):
    """
    returns a list of (objID, lifeSpan) tuples for objIDs 1..numOfObjects

    objMode: sequence holding 'regular' or 'news' for each object;
             'regular' objects draw a logNormal lifespan, 'news' objects a
             pareto lifespan. The distribution parameters are themselves
             drawn once per call from normal distributions.
    A lifespan above 80 is replaced by a random integer in [2, 80].
    raises ValueError for any other object type
    """
    logNormal_mu_mean = 3.0935
    logNormal_mu_std = 0.9612
    logNormal_sigma_mean = 1.1417
    logNormal_sigma_std = 0.3067
    pareto_alpha_mean = 1.7023
    pareto_alpha_std = 0.2092
    lifeSpans = []

    logNormalMu = generateRandVariate('normal', {'mu': logNormal_mu_mean, 'sigma': logNormal_mu_std}, 1)[0]
    logNormalSigma = generateRandVariate('normal', {'mu': logNormal_sigma_mean, 'sigma': logNormal_sigma_std}, 1)[0]

    paretoAlpha = generateRandVariate('normal', {'mu': pareto_alpha_mean, 'sigma': pareto_alpha_std}, 1)[0]

    for i in range(numOfObjects):
        # Compare by value; `is` on strings tests identity and can silently
        # fail for equal strings that are distinct objects.
        if objMode[i] == 'regular':
            tmpLifeSpan = generateRandVariate('logNormal', {'mu': logNormalMu, 'sigma': logNormalSigma}, 1)[0]
        elif objMode[i] == 'news':
            tmpLifeSpan = generateRandVariate('pareto', {'alpha': paretoAlpha}, 1)[0]
        else:
            # Previously an unknown type reused the preceding object's
            # lifespan (or hit an unbound local on the first object).
            raise ValueError('unknown object type: {!r}'.format(objMode[i]))
        if tmpLifeSpan > 80:
            tmpLifeSpan = random.randint(2, 80)
        lifeSpans.append((i+1, tmpLifeSpan))
    return lifeSpans


"""#########################  Object Generation  ####################################################################"""
def normalizePopularities():
    """ returns objectPopularities as a numpy array divided by its maximum value """
    peak = max(objectPopularities)
    return np.array(objectPopularities)/peak


def getBinInterval(time):
    """ returns the whole number of 3600-unit bins contained in time, divided by 23 """
    wholeBins = math.floor(time/float(3600))
    return wholeBins/float(23)


def generateObjects():
    """
    assigns every object its type, lifespan, introduction day/time and
    request frequency

    Fills lifeSpanType, requestGenInfo, startTimes and objectLengths,
    rebinds ObjectsLifeSpan, introductionOrder and sortedOnIntoTime, and
    raises maxEndDay to the latest end day seen. Objects whose
    frequency/lifespan ratio is below MIN_REQ_PER_DAY_THRESHOLD get a new
    frequency and lifespan so that the ratio equals the threshold.
    """
    global ObjectsLifeSpan
    global introductionOrder
    global sortedOnIntoTime
    global maxEndDay
    normalizedPop = normalizePopularities()

    for _ in range(len(normalizedPop)):
        lifeSpanType.append(getObjectType())
    # tuple (objID, LifeSpan), objID from 1 to N
    ObjectsLifeSpan = generateLifeSpans(len(objectPopularities), lifeSpanType)
    introductionOrder = generateObjectIntroductionOrder()   # objectIntroductionOrder from 1 to N
    for i in range(1, len(objectPopularities)+1):
        requestGenInfo[i] = {'startDay': 0, 'lifeSpan': 0, 'endDay': 0, 'arrivalTime': 0, 'type': '', 'freq': 0,
                             'unitPerDay': 0} # From 1 to N
        startTimes[i] = 0

    # Round the per-day counts once instead of re-rounding the whole list for
    # every day.
    objectsPerDay = np.round(numOfObjectsIntroduced)

    objCnt = 0
    dayCnt = 0
    for i in range(len(numOfObjectsIntroduced)):
        dayTime = 0
        dayCnt = dayCnt+round(dayGaps[i])
        for j in range(int(objectsPerDay[i])):
            objIntroduced = introductionOrder[objCnt]
            info = requestGenInfo[objIntroduced]    # record mutated in place below
            dayTime = dayTime+interArrivals[i][j]
            info['startDay'] = dayCnt
            info['arrivalTime'] = dayTime
            info['lifeSpan'] = ObjectsLifeSpan[objIntroduced-1][1]
            info['type'] = lifeSpanType[objIntroduced-1]
            info['freq'] = objectPopularities[objIntroduced-1]

            # Generating at least a minimum number of requests per day
            if info['freq'] / info['lifeSpan'] < MIN_REQ_PER_DAY_THRESHOLD:
                # generate a random number for which number to update
                decision = random.uniform(0, 1)
                if decision <= 0.5:
                    # draw a new life-span and derive the frequency from it
                    life_span = random.randint(10, 80)
                    info['freq'] = life_span * MIN_REQ_PER_DAY_THRESHOLD
                    info['lifeSpan'] = life_span
                else:
                    # draw a new frequency and derive the life-span from it
                    freq = random.randint(MIN_REQ_PER_DAY_THRESHOLD, 80*MIN_REQ_PER_DAY_THRESHOLD)
                    info['freq'] = freq
                    info['lifeSpan'] = freq / MIN_REQ_PER_DAY_THRESHOLD

            startTimes[objIntroduced] = dayCnt+getBinInterval(dayTime)

            info['endDay'] = info['lifeSpan'] + info['startDay']
            info['totalDens'] = math.pow(info['lifeSpan'], hourly_request_function_degree)

            objectLengths.append([objIntroduced, info['startDay'], info['lifeSpan'], info['endDay'],
                                  info['freq']])

            if info['endDay'] > maxEndDay:
                maxEndDay = info['endDay']
            objCnt = objCnt+1

    sortedOnIntoTime = sorted(startTimes, key=startTimes.get)


def generateDiurnalAccess(obj, diurnalRatio, dayCnt):
    """
    generates the requests of object obj for day dayCnt and appends them to
    requests as (obj, timestamp) tuples, advancing curTime[obj-1]

    The day's request count is the object's frequency weighted by a
    difference of powers of the distance to its end day, divided by
    totalDens. With more than one day of lifespan left every entry of
    diurnalRatio is used and the lifespan is reduced by one; otherwise only
    the first floor(lifeSpan*10) entries are used and the lifespan is set
    to 0.
    """
    global requests

    info = requestGenInfo[obj]
    degree = hourly_request_function_degree
    lastDay = info['endDay']

    if info['lifeSpan'] > 1:
        weight = math.pow(dayCnt-lastDay, degree) - math.pow(lastDay-dayCnt+1, degree)
        objCount = abs(info['freq']*(weight/info['totalDens']))
        info['lifeSpan'] = info['lifeSpan']-1
        binsToGenerate = range(len(diurnalRatio))
    else:
        weight = math.pow(lastDay-dayCnt, degree) - math.pow(lastDay-(dayCnt+info['lifeSpan']), degree)
        objCount = abs(info['freq']*(weight/info['totalDens']))
        binsToGenerate = range(int(math.floor(info['lifeSpan']*10)))
        info['lifeSpan'] = 0

    for binIdx in binsToGenerate:
        tmpCount = int(np.round(objCount*diurnalRatio[binIdx]))
        if tmpCount == 0:
            continue
        # tmpCount exponential interarrivals with rate tmpCount/3600
        tmpLambda = (tmpCount/float(3600))
        for tmpInter in generateRandVariate('exp', {'mu': tmpLambda}, tmpCount):
            stamp = curTime[obj-1]+tmpInter
            requests.append((obj, stamp))
            curTime[obj-1] = stamp


"""#########################  Requests Generation  ##################################################################"""
def generateRequests():
    """
    generates the requests of all objects day by day and appends them to the
    output CSV under OUTPUTDIR

    The CSV header is written only when the file does not exist yet. The
    buffered requests are sorted by timestamp and flushed every 20 days, and
    once more after the last day so that the tail of the trace is not lost.
    """
    global requests
    global curTime

    OUTPUTFILENAME = '{0}/mediSynDataset_x{1}_O{2}.csv'.format(OUTPUTDIR, hourly_request_function_degree, NUM_OF_OBJECTS)
    if not os.path.isfile(OUTPUTFILENAME):
        with open(OUTPUTFILENAME, 'w') as fi:
            fi.write('object_ID,request_time\n')

    dayCount = requestGenInfo[sortedOnIntoTime[0]]['startDay']
    reqGendf = pd.DataFrame.from_dict(requestGenInfo, orient='index')
    reqGendf['objID'] = reqGendf.index

    while dayCount <= maxEndDay:
        # objects that are alive on this day
        objList = list(reqGendf[(reqGendf['startDay'] <= dayCount) & (reqGendf['endDay'] >= dayCount)]['objID'])
        for obj in objList:
            if curTime[obj-1] == 0:
                # first request of the object: start at its arrival time within the day
                curTime[obj-1] = (dayCount*86400) + requestGenInfo[obj]['arrivalTime']

            generateDiurnalAccess(obj, lambdas, dayCount)

        dayCount = dayCount + 1
        if dayCount % 20 == 0:
            requests.sort(key=lambda x: x[1])
            saveRequestsToFile(OUTPUTFILENAME)
            requests = []
            print('{} Days Processed of {} Total Days'.format(dayCount, int(maxEndDay)))

    # Flush the days generated after the last multiple-of-20 checkpoint;
    # without this the final (up to 19) days never reach the file.
    if requests:
        requests.sort(key=lambda x: x[1])
        saveRequestsToFile(OUTPUTFILENAME)
        requests = []
    print('MediSyn Dataset Saved to Output file: {}'.format(OUTPUTFILENAME))


def saveRequestsToFile(OUTPUTFILENAME):
    """ appends every tuple currently held in requests as one CSV row to OUTPUTFILENAME """
    with open(OUTPUTFILENAME, 'a') as sink:
        csv.writer(sink, dialect='excel').writerows(requests)


"""##################################################################################################################"""


def main():
    """ entry point: runs the whole dataset generation """
    initialize()


if __name__ == "__main__":
    main()



Generating hourly request ratio file ...
Generating Objects for Dataset ...
Generating Requests for Dataset ...


  if dist is 'pareto':
  if dist is 'paretoScaled':
  elif dist is 'normal':
  elif dist is 'logNormal':
  elif dist is 'exp':
  elif dist is 'poisson':
  if typeMode is 'HPC':
  if objMode[i] is 'regular':
  elif objMode[i] is 'news':


20 Days Processed of 212 Total Days
40 Days Processed of 212 Total Days
60 Days Processed of 212 Total Days
80 Days Processed of 212 Total Days
100 Days Processed of 212 Total Days
120 Days Processed of 212 Total Days
140 Days Processed of 212 Total Days
160 Days Processed of 212 Total Days
180 Days Processed of 212 Total Days
200 Days Processed of 212 Total Days
MediSyn Dataset Saved to Output file: ./Datasets/mediSynDataset_x2_O1034.csv


In [None]:
from typing import ValuesView
# LSTM 학습을 위한 입력 시퀀스 생성

import pandas as pd
import numpy as np

df = pd.read_csv('Datasets/mediSynDataset_x2_O1034.csv')  # load the generated request trace

df['hour'] = df['request_time'] // 3600  # seconds -> hourly buckets

object_ids = df['object_ID'].unique()
object_ids.sort()
num_objects = len(object_ids)
print("총 객체 수: ", num_objects)

# Request counts per (hour, object); columns follow the sorted object IDs.
# NOTE(review): hours without any request produce no row here, so consecutive
# rows are not guaranteed to be consecutive hours - confirm this is intended.
pivot = df.groupby(['hour', 'object_ID']).size().unstack(fill_value=0)
pivot = pivot.reindex(columns=object_ids, fill_value=0)

# Normalise every hour into a probability vector over the objects.
probs = pivot.div(pivot.sum(axis=1), axis=0).fillna(0)

m, K = 20, 26  # use the past m = 20 hours to predict the next K = 26 hours
X = []
y = []

# A window starting at row i needs rows i .. i+m+K-1, so the last valid start
# is len(probs) - m - K; the +1 keeps that final window.
for i in range(len(probs) - m - K + 1):
  x_seq = probs.iloc[i:i+m].values  # shape: (m, num_objects)
  x_next = probs.iloc[i+m:i+m+K].values  # shape: (K, num_objects)
  X.append(x_seq)
  y.append(x_next)

X = np.array(X)
y = np.array(y)

print("X.shape =", X.shape)
print("y.shape =", y.shape)

총 객체 수:  1034
X.shape = (4738, 20, 1034)
y.shape = (4738, 26, 1034)


In [None]:
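# LSTM prediction model
# Takes the request distribution over a past time window and predicts the request distribution over a future time window.
# Encoder: consumes the (m, num_objects) sequence of request distributions of the past m hours and condenses it into its final hidden state and cell state (the context vector).
# Decoder: starting from the encoder's context vector, predicts hour by hour, as a sequence, how the requests are distributed over the objects during the next K hours.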
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, TimeDistributed, RepeatVector

def build_seq2seq_model(m, K, num_objects, hidden_units=128):
  """
  Build and compile the encoder-decoder LSTM that maps m past request
  distributions to K future request distributions.

  Parameters:
      m (int): input sequence length (past time steps)
      K (int): output sequence length (future time steps)
      num_objects (int): number of objects, i.e. the size of every
          distribution vector
      hidden_units (int): width of the encoder and decoder LSTM layers
          (default 128, the value that was previously hard-coded)

  Returns:
      A compiled Keras Model with input shape (m, num_objects) and output
      shape (K, num_objects).
  """
  # ----- Encoder -----
  encoder_inputs = Input(shape=(m, num_objects))  # (batch, m, num_objects)
  # return_state=True also returns the final hidden state and cell state.
  encoder_lstm = LSTM(hidden_units, return_state=True)
  encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
  # The final states initialise the decoder.
  encoder_states = [state_h, state_c]

  # ----- Decoder -----
  # Repeat the encoder's last output K times: (batch, K, hidden_units).
  decoder_inputs = RepeatVector(K)(encoder_outputs)
  # return_sequences=True emits one output per predicted time step.
  decoder_lstm = LSTM(hidden_units, return_sequences=True)
  decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)

  # Per-step distribution over the objects; softmax makes every step sum to 1.
  decoder_dense = TimeDistributed(Dense(num_objects, activation='softmax'))
  output_seq = decoder_dense(decoder_outputs)

  # ----- Model -----
  model = Model(encoder_inputs, output_seq)
  model.compile(optimizer='adam', loss='mse', metrics=['mae'])
  return model


# Derive the model dimensions from the prepared tensors instead of hard-coding
# them: the generator settles the final object count from random draws, so a
# literal 1034 breaks for any other generated dataset.
num_objects = X.shape[2]  # number of objects (feature dimension)
m = X.shape[1]            # input sequence length (past hours)
K = y.shape[1]            # output sequence length (future hours)

model = build_seq2seq_model(m, K, num_objects)

# X has shape (samples, m, num_objects); y has shape (samples, K, num_objects).

from sklearn.model_selection import train_test_split

# Chronological split: the samples are overlapping sliding windows, so a
# shuffled split would put near-duplicates of the test windows into the
# training set and would scramble the time order of the predictions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train on the training portion only.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.1)

# Inference on the held-out portion.
y_pred = model.predict(X_test)
print("예측 결과 형태:", y_pred.shape)
print("y_test.shape =", y_test.shape)

Epoch 1/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - loss: 2.0000e-05 - mae: 0.0016 - val_loss: 1.7732e-05 - val_mae: 0.0016
Epoch 2/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 1.8906e-05 - mae: 0.0016 - val_loss: 1.7730e-05 - val_mae: 0.0016
Epoch 3/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 1.9470e-05 - mae: 0.0016 - val_loss: 1.7727e-05 - val_mae: 0.0016
Epoch 4/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 1.9560e-05 - mae: 0.0016 - val_loss: 1.7724e-05 - val_mae: 0.0016
Epoch 5/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 1.9547e-05 - mae: 0.0016 - val_loss: 1.7720e-05 - val_mae: 0.0016
Epoch 6/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 1.8600e-05 - mae: 0.0016 - val_loss: 1.7716e-05 - val_mae: 0.0016
Epoch 7/30
[1m107/107[0m [32m━━

In [None]:
# Cache policy
# Turns the predictions into the object lists used by the DeepCache
# simulation, which is then compared with a plain caching strategy.
# NOTE(review): M rebinds the name the trace generator uses as its maximum
# frequency; re-running the generator cell in this session would pick up 10.
M = 10  # number of top objects to push into the cache
top_objects_each_t = []

# Extract the top-M objects from every predicted step.
for i in range(len(y_pred)):
    for t in range(K - 1):    # reads step t+1, so stop at K-1
        next_probs = y_pred[i, t+1]  # predicted distribution of the next step
        top_indices = next_probs.argsort()[-M:][::-1]  # column indices of the M largest values
        # Column j of the model output corresponds to object_ids[j]; store
        # the real object IDs because the simulator compares them with the
        # IDs in the request log.
        top_objects_each_t.append(object_ids[top_indices].tolist())

# Actual request log in time order.
actual_requests = df.sort_values('request_time')['object_ID'].tolist()

In [None]:
# 캐시 시뮬레이션 (LRU + 예측 기반) -> 성능 평가
from collections import deque

def simulate_deepcache(actual_requests, top_objects_each_t, cache_size=150, insert_interval=20):
  """
    Run the DeepCache cache simulation and return the hit ratio.

    The cache is an LRU cache. Every insert_interval requests the next list
    of predicted objects is replayed as "fake" requests, which insert or
    refresh those objects without being counted as hits or misses.

    Parameters:
        actual_requests (list): object IDs in request order
        top_objects_each_t (list of lists): predicted top-M objects, one
            list consumed per insert_interval requests
        cache_size (int): cache capacity; None means unbounded and a value
            <= 0 disables caching
        insert_interval (int): number of requests between two prediction
            inserts

    Returns:
        hit_ratio (float): hits / total requests, or 0 for an empty trace
  """
  # Local import so the function does not depend on the order in which the
  # notebook cells were executed.
  from collections import OrderedDict

  cache = OrderedDict()  # keys ordered from least to most recently used
  hit = 0
  total = 0
  fake_insert_idx = 0

  def touch(obj):
    """Mark obj as most recently used; return True if it was cached."""
    if obj in cache:
      cache.move_to_end(obj)
      return True
    if cache_size is not None:
      if cache_size <= 0:
        return False
      if len(cache) >= cache_size:
        cache.popitem(last=False)  # evict the least recently used entry
    cache[obj] = True
    return False

  for t, req in enumerate(actual_requests):
    # Replay the predicted objects before serving the real request.
    if t % insert_interval == 0 and fake_insert_idx < len(top_objects_each_t):
      for obj in top_objects_each_t[fake_insert_idx]:
        touch(obj)
      fake_insert_idx += 1

    total += 1
    if touch(req):
      hit += 1

  hit_ratio = hit / total if total > 0 else 0
  return hit_ratio

# Replay the real trace through the prediction-assisted cache and report the result.
deepcache_hit = simulate_deepcache(
    actual_requests,
    top_objects_each_t,
    cache_size=150,
    insert_interval=20,
)
print(f"DeepCache 기반 캐시 hit ratio: {deepcache_hit:.4f}")

DeepCache 기반 캐시 hit ratio: 0.7779


In [None]:
# 기본 LRU 캐시와 성능 비교
# LRU 캐시 시뮬레이터 코드
from collections import OrderedDict

class LRUCache:
  """Least-recently-used cache simulator that counts hits and misses."""

  def __init__(self, capacity):
    self.capacity = capacity
    self.cache = OrderedDict()  # keys ordered from least to most recently used
    self.hit = 0
    self.miss = 0

  def request(self, obj_id):
    """Serve one request: refresh obj_id on a hit, insert it (evicting if full) on a miss."""
    if obj_id in self.cache:
      self.hit += 1
      self.cache.move_to_end(obj_id)
      return

    self.miss += 1
    if len(self.cache) >= self.capacity:
      self.cache.popitem(last=False)  # drop the least recently used entry
    self.cache[obj_id] = True  # newest entry goes to the end

  def get_hit_ratio(self):
    """Return hits / (hits + misses), or 0 when nothing was requested."""
    total = self.hit + self.miss
    if total > 0:
      return self.hit / total
    return 0

In [None]:
# 요청 로그를 읽어서 LRU 시뮬레이션 수행
import pandas as pd

df = pd.read_csv('Datasets/mediSynDataset_x2_O1034.csv')
# Replay the trace in time order: the file is sorted only within each flushed
# batch, and the DeepCache run uses the time-sorted trace as well.
requests = df.sort_values('request_time')['object_ID'].tolist()

# Same capacity as the DeepCache run (cache_size=150); comparing a 5-entry LRU
# with a 150-entry DeepCache says nothing about the policies.
cache_size = 150
lru = LRUCache(cache_size)

for obj_id in requests:
  lru.request(obj_id)

print("LRU 캐시 hit ratio:", round(lru.get_hit_ratio(), 4))

In [None]:
# 비교 결과 시각화
import matplotlib.pyplot as plt

ratios = [
    deepcache_hit,        # DeepCache: reuse the ratio computed by the simulation cell instead of replaying the whole trace again
    lru.get_hit_ratio()   # LRU: read the hit ratio from the existing LRU object
]
labels = ['DeepCache', 'LRU']

plt.bar(labels, ratios, color=['skyblue', 'salmon'])
plt.ylabel('Hit Ratio')
plt.title('DeepCache vs LRU Performance')
plt.ylim(0, 1)  # hit ratios are fractions in [0, 1]
plt.show()

NameError: name 'simulate_deepcache' is not defined