In [6]:
import numpy as np
import rrcf

# A (robust) random cut tree can be instantiated from a point set (n x d)
X = np.random.randn(100, 2)
tree = rrcf.RCTree(X)

# A random cut tree can also be instantiated with no points
tree = rrcf.RCTree()

Inserting points

In [7]:
tree = rrcf.RCTree()

for i in range(6):
    x = np.random.randn(2)
    tree.insert_point(x, index=i)

Deleting points

In [8]:
tree.forget_point(2) 

Leaf(2)

Anomaly score

In [9]:
# Seed tree with zero-mean, normally distributed data
X = np.random.randn(100,2)
tree = rrcf.RCTree(X)

# Generate an inlier and outlier point
inlier = np.array([0, 0])
outlier = np.array([4, 4])

# Insert into tree
tree.insert_point(inlier, index='inlier')
tree.insert_point(outlier, index='outlier')

Leaf(outlier)

streaming anomaly detection

In [18]:
import numpy as np
import rrcf

# Generate data
n = 730
A = 50
center = 100
phi = 30
T = 2*np.pi/100
t = np.arange(n)
sin = A*np.sin(T*t-phi*T) + center
sin[235:255] = 80

# Set tree parameters
num_trees = 40
shingle_size = 4
tree_size = 256

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()            
    forest.append(tree)
    
# Use the "shingle" generator to create rolling window
points = rrcf.shingle(sin, size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points): #enumerate(): (인덱스 번호, 컬렉션의 원소) tuple을 반환
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size, drop the oldest point (FIFO)
        if len(tree.leaves) > tree_size:
            tree.forget_point(index - tree_size)
        # Insert the new point into the tree
        tree.insert_point(point, index=index)
        # Compute codisp on the new point and take the average among all trees
        if not index in avg_codisp:  # 공분산 계수 계산
            avg_codisp[index] = 0
        avg_codisp[index] += tree.codisp(index) / num_trees 

In [26]:
sin.shape

(730,)

Epilogue

In [27]:
import pandas as pd

dataset=pd.read_csv('./input/intial_trainig_stage.csv',index_col='date',parse_dates=['date'])

In [28]:
dataset=dataset.iloc[:,0].values
dataset

array([367., 397., 388., ..., 412., 395., 431.])

In [30]:
import numpy as np

def shingle(data, shingle_size):
    num_data = len(data)
    shingled_data = np.zeros((num_data-shingle_size, shingle_size))
    
    for n in range(num_data - shingle_size):
        shingled_data[n] = data[n:(n+shingle_size)]
    return shingled_data

# single data with shingle size=48 (one day)
shingle_size = 48
prefix_shingled = 'sagemaker/randomcutforest_shingled'
dataset_shingled = shingle(dataset, shingle_size)
print(dataset_shingled)

[[367. 397. 388. ... 389. 392. 449.]
 [397. 388. 341. ... 392. 449. 442.]
 [388. 341. 361. ... 449. 442. 400.]
 ...
 [452. 430. 387. ... 418. 439. 383.]
 [430. 387. 441. ... 439. 383. 412.]
 [387. 441. 441. ... 383. 412. 395.]]


[pip install sagemaker] 설치 먼저 해주기

In [4]:
from sagemaker import RandomCutForest
session = sagemaker.Session()

# specify general training job information
rcf = RandomCutForest(role=execution_role,
                      train_instance_count=1,
                      train_instance_type='ml.m4.xlarge',
                      data_location='s3://{}/{}/'.format(bucket, prefix_shingled),
                      output_path='s3://{}/{}/output'.format(bucket, prefix_shingled),
                      num_samples_per_tree=512,
                      num_trees=50)

# automatically upload the training data to S3 and run the training job
rcf.fit(rcf.record_set(dataset_shingled))

NameError: name 'sagemaker' is not defined