In [6]:
import numpy as np
import pandas as pd

In [7]:
data_path = '/home/hskim/source/origin/det_train3.CSV'
data = pd.read_csv(data_path)

# Count how often each event_id occurs and attach it to every row as `freq`.
# rename_axis + reset_index(name=...) yields the columns ['event_id', 'freq']
# on every pandas version; the previous rename of 'index'/'event_id' only
# worked before pandas 2.0, where value_counts() started naming its result
# 'count' and its index after the source column.
event_cnt = (
    data['event_id']
    .value_counts()
    .rename_axis('event_id')
    .reset_index(name='freq')
)
data = data.merge(event_cnt, on='event_id')

# Remove fully identical rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

In [3]:
# Work on an independent copy so `data` itself stays untouched
# (DataFrame.copy() is a deep copy by default).
df = data.copy()

In [6]:
# Build `iter_cnt` collections, each a random draw of `sample_size` rows from
# `df`, tagged with a running `collection_id`.
iter_cnt = 5000
features = ['event_id', 'risk', 'freq', 'pp_mtime', 'pp_stime', 'score']
sample_size = 10  # fixed size; a random size in [5, 10) was tried previously
total_sampling_size = 0  # never updated in the visible cells; kept in case a later cell reads it

# NOTE(review): sampling is unseeded, so every run produces different data.
# Collect the per-collection frames first and concatenate once: calling
# pd.concat inside the loop re-copies the growing frame on every iteration.
# .assign() returns a new frame, which also avoids a SettingWithCopyWarning
# from writing a column into a slice of the sampled rows.
sampled_frames = [
    df.sample(sample_size)[features].assign(collection_id=i)
    for i in range(iter_cnt)
]

# With iter_cnt == 0 there is nothing to concatenate; keep the old result (None).
train = pd.concat(sampled_frames, axis=0, ignore_index=True) if sampled_frames else None

In [7]:
# Preview the first rows of the sampled training frame.
train.head()

Unnamed: 0,event_id,risk,freq,pp_mtime,pp_stime,score,collection_id
0,cheditor,7.0,4,8,9,110,0
1,flood,7.0,5,11,12,176,0
2,sql,7.0,93,16,46,147,0
3,drupal,7.0,75,3,58,123,0
4,E031,7.6,23,15,58,149,0


In [8]:
# Persist the sampled collections as CSV, without writing the row index.
train.to_csv(path_or_buf="data/det_priority_train.csv", index=False)

### 데이터 확인

In [5]:
import pandas as pd

In [6]:
# Read the CSV back in to inspect it; note this rebinds the name `data`.
data = pd.read_csv(filepath_or_buffer="data/det_priority_train.csv")

In [13]:
# Keep the rows whose event_id contains an upper-case 'E'.
# The vectorised, literal (regex=False) substring match replaces a row-wise
# lambda; na=False treats missing or non-string ids as "no match" instead of
# raising TypeError the way `'E' in x` does on NaN.
check = data[data['event_id'].str.contains('E', regex=False, na=False)]

In [15]:
# Show one row (the first occurrence) for each distinct event_id in `check`.
check.drop_duplicates(subset='event_id', keep='first')

Unnamed: 0,event_id,risk,freq,pp_mtime,pp_stime,score,collection_id
4,E031,7.6,23,15,58,149,0
5,E033,5.8,2,7,36,181,0
7,E034,5.8,101,23,21,157,0
21,E022,7.0,35,15,23,181,2
45,E035,5.8,11,11,35,160,4


#### 어차피 환경 모델이 주어진 템플릿에서 데이터를 끌어오기 때문에, 학습 데이터에는 risk 및 playtime 정보가 들어가지 않아도 됨

# 데이터 재생성

In [1]:
import ast, random, pickle
import pandas as pd
from itertools import combinations
from tqdm import tqdm

In [2]:
# Read the event-state template as raw text. The context manager closes the
# file handle, which the bare open(...).read() left open.
with open('templates/event_handling_priority/event_state.json', 'r') as evt_file:
    evt_data = evt_file.read()

In [3]:
# Parse the template text as a Python literal and keep only its top-level keys.
# NOTE(review): the file is named .json but is parsed with ast.literal_eval;
# JSON-only tokens (true/false/null) would fail here - confirm the file format.
# This cell rebinds `evt_data` from text to a list, so it cannot be re-run
# without first re-running the cell that reads the file.
evt_data = [*ast.literal_eval(evt_data).keys()]

In [5]:
# Number of distinct event ids in the template.
len(set(evt_data))

35

In [6]:
# Lazy iterator over every 10-element combination of the event ids.
combi_data = combinations(evt_data, r=10)

In [7]:
# Materialise every combination into a list, with a tqdm progress bar.
# NOTE(review): this holds all combinations in memory at once (the recorded
# run iterated 183,579,396 of them) although only a small random subset is
# used afterwards - consider sampling combinations directly.
train_data = list(tqdm(combi_data))

183579396it [00:46, 3923386.94it/s]


In [34]:
# Draw 5000 distinct combinations, without replacement, from the full list.
# NOTE(review): the `random` module is not seeded in the visible cells, so
# the selection differs between runs.
real_train_data = random.sample(population=train_data, k=5000)

In [35]:
# Sanity check: number of sampled combinations.
len(real_train_data)

5000

In [8]:
# Unbind the full combination list so it can be reclaimed
# (assuming nothing else still references it).
del train_data

In [39]:
# Flatten the sampled combinations into long format: one row per
# (collection_id, event_id) pair, where collection_id is the position of the
# combination in `real_train_data`.
# The frame is built in a single constructor call; concatenating inside the
# loop re-copied the growing frame on every iteration. The comprehension also
# avoids rebinding the notebook-level name `data`, which the old loop
# variable overwrote with a tuple.
collection_rows = [
    (collection_id, evt)
    for collection_id, events in enumerate(real_train_data)
    for evt in events
]
# An empty input now yields an empty two-column frame rather than None.
train_df = pd.DataFrame(collection_rows, columns=['collection_id', 'event_id'])

In [41]:
# Sanity check: (rows, columns) of the long-format training frame.
train_df.shape

(50000, 2)

In [42]:
# Write the regenerated training set to CSV without the row index.
# This is the same path the earlier sampled `train` frame was saved to,
# so that file is overwritten.
train_df.to_csv(path_or_buf="data/det_priority_train.csv", index=False)

### environment 확인

In [1]:
from Environment import EventEnvironment

In [2]:
# Instantiate the project's EventEnvironment with its default arguments.
env = EventEnvironment()

In [4]:
# Reset the environment and display whatever reset() returns.
env.reset()

(array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 7.        , 12.        , 45.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.   