In [1]:
import numpy as np
import pandas as pd

In [2]:
# function to detect outliers, given an array of values
def outliers_modified_z_score(ys, threshold=3.5):
    """Return the indices of values whose modified z-score exceeds ``threshold``.

    The modified z-score of a value ``y`` is ``0.6745 * (y - median) / MAD``,
    where MAD is the median absolute deviation from the median.

    When MAD is zero (at least half of the values are identical) dividing by
    it would produce ``nan``/``inf`` scores and flag every value that differs
    from the median. In that case the score falls back to the mean absolute
    deviation: ``(y - median) / (1.253314 * MeanAD)``. If that is zero as
    well (all values identical) there is no spread to measure against, so
    nothing is flagged.

    Parameters
    ----------
    ys : array-like of numbers
        Values to test.
    threshold : float, optional
        Cut-off applied to the absolute modified z-score (default 3.5).
        Raise it to flag fewer points (fewer false positives); lower it to
        flag more points.

    Returns
    -------
    tuple of numpy.ndarray
        Same shape as the result of ``np.where``: for 1-D input, a 1-tuple
        whose only element holds the indices of the flagged values.
    """
    values = np.asarray(ys, dtype=float)
    if values.size == 0:
        # Nothing to test; avoid the nan + warning np.median gives for empty input.
        return (np.array([], dtype=np.intp),)

    deviations = values - np.median(values)
    abs_deviations = np.abs(deviations)
    median_absolute_deviation = np.median(abs_deviations)

    if median_absolute_deviation > 0:
        modified_z_scores = 0.6745 * deviations / median_absolute_deviation
    else:
        # MAD is zero (or nan): fall back to the mean absolute deviation.
        mean_absolute_deviation = np.mean(abs_deviations)
        if not mean_absolute_deviation > 0:
            # All values identical (or nan present): no usable scale.
            return (np.array([], dtype=np.intp),)
        modified_z_scores = deviations / (1.253314 * mean_absolute_deviation)

    return np.where(np.abs(modified_z_scores) > threshold)

In [3]:
# --- file locations -------------------------------------------------------
# inputs: feature vectors at the two time points, plus the cluster assignments
t0_data_path = 'data/featureVectors_t0.csv'
t1_data_path = 'data/featureVectors_t1.csv'
cluster_assignments_path = 'data/cluster_assignments.npy'

# output: CSV listing the plans flagged as possible outliers
outliers_path = 'data/possibleOutliers.csv'

# --- load inputs ----------------------------------------------------------
# each record is read downstream as (plan id, cluster label)
cluster_assignments = np.load(cluster_assignments_path)
# feature vectors at t0 and t1; the 'ct_planid' and 'ct_value' columns are used later
t0 = pd.read_csv(t0_data_path)
t1 = pd.read_csv(t1_data_path)

In [4]:
# create dictionary where key:cluster (int) and value:list of planIDs (str)
clusterMembers = {}
# The loop variable is deliberately not called `tuple`: at notebook scope that
# would rebind the builtin `tuple` for every cell executed afterwards.
for assignment in cluster_assignments:
    plan_id = assignment[0].decode("utf-8")  # plan IDs are stored as bytes
    cluster = int(assignment[1])
    # setdefault creates the list the first time a cluster is seen
    clusterMembers.setdefault(cluster, []).append(plan_id)
display(clusterMembers)

{0: ['000EB31E-B7AD-48E5-BC03-7A04E1383B33',
  '003CFB64-7574-4D4C-8742-BF2414B24E2D',
  '00D0F0C5-81A7-4CF9-A75E-D5EFD453C763',
  '01E2AB57-150D-433D-93DF-EF38B6B72263',
  '0243ADA9-09E0-4C81-9703-44CB17734C79',
  '02A7A421-84AD-4650-8ED1-0666E360D5DF',
  '0384DFA2-815D-42E3-A6C0-C44826BB376D',
  '040ACD50-AB5F-42EF-9355-E2B09B0B244D',
  '0699570C-54B0-497C-BC9A-0FB49DB66F3C',
  '06F764CF-78A0-4B6D-A88C-CD89595D99C2',
  '071523F7-CEC1-418B-A2F8-1F98D20BBD70',
  '08B407DB-7382-46E2-B8F8-AB6DF0149AF4',
  '0A60E98B-F043-4C5D-AAD3-1E26E590EE3C',
  '0AAC897E-435B-4D83-8D67-6F2BAEFC95E5',
  '0C555FFC-5302-43E7-B857-BC4B77DA3904',
  '0D134DCC-00F6-4E05-85CE-DFEAC044351F',
  '0DBCA26E-71ED-44CD-AD0E-C0031FB94A3C',
  '0E9FAFD5-E0DF-47FE-BCF3-EBCD9F821F75',
  '1075CA7D-F6A9-4A80-B586-15668D8BA1B7',
  '11E4DEE4-287D-443D-8BD4-AC8CBF460A63',
  '12953652-CB35-47F4-9CBC-01F332D84A25',
  '130D1BDF-2575-47FE-8005-C8DBA1F80180',
  '14263F7A-F125-4D31-A401-F960FEB6A9EE',
  '155D93C3-5E5F-4AA5-8276-012C

In [5]:
# Lookup tables for each snapshot: planID -> plan value.
# If a planID occurs more than once in a frame, the last row wins.
t0_dict = {
    plan_id: plan_value
    for plan_id, plan_value in zip(t0['ct_planid'], t0['ct_value'])
}
t1_dict = {
    plan_id: plan_value
    for plan_id, plan_value in zip(t1['ct_planid'], t1['ct_value'])
}
display(t1_dict)

{'EFF1DFCC-1AD9-4513-8005-61284852D602': 101155984.06999999,
 '0872FD71-10D8-420D-9FD7-7C2BDA809541': 17847022.34,
 'B426231C-0C25-4F47-ADC6-C97486DC0BE5': 2163165.0,
 'ACB0FFD7-95BE-4C72-B470-E34117B75CA7': 59335206.670000002,
 'F85C4311-D8FF-4A5C-87CB-AD1826FE9704': 20172550.18,
 'BAF5BB8B-E068-4671-BC69-9DC27BA94DE3': 30156837.469999999,
 '03A20D9C-F287-432A-90CA-A0EDF50A9412': 866841.26000000001,
 'A0D83A44-9BA8-474F-9696-27BBB82AFF27': 5503033.0,
 '46C27BE4-3933-494E-A9A3-A82DBF6B59AF': 63135.790000000001,
 'D38CEAE4-2DA8-4DC9-9453-5D0F5C7526E4': 0.0,
 'E44F562C-EAD8-4F53-A967-5DC073063555': 13136974.0,
 '0458766D-364D-44E4-A8E6-5AD032AE37CB': 654378258.71000004,
 'F139B4EF-990C-4799-967B-C3587C1AECDD': 5982823.0999999996,
 'AAB647AE-0671-4813-B4CE-C2212D9C2F34': 4853905.5300000003,
 '1C37EE24-9E01-4503-A087-8F491BEE9400': 170131499.30000001,
 '1E77B8E8-903B-4076-9D94-0C1F3B304A0B': 173507249.69,
 '3897D11A-52F8-4F7C-8C97-C6290F511770': 11795511.460000001,
 '206AC017-F2EF-4934-B32

In [7]:
# For every cluster: compute each plan's relative change in value between t0
# and t1, flag the plans whose change is an outlier within that cluster, and
# collect (planID, change, cluster median, cluster std dev) for each of them.
outlier_rows = []
for cluster, plan_ids in clusterMembers.items():
    deltaList = []   # relative change of every usable plan in this cluster
    planIdList = []  # planIDs, index-aligned with deltaList
    for plan_id in plan_ids:  # `plan_id`, not `id`, to avoid shadowing the builtin
        # only plans present in both snapshots can be compared
        if plan_id not in t0_dict or plan_id not in t1_dict:
            continue
        t0_val = t0_dict[plan_id]  # plan value at t0
        t1_val = t1_dict[plan_id]  # plan value at t1
        if t0_val == 0:
            continue  # relative change is undefined for a zero starting value
        delta = (t1_val - t0_val) / t0_val
        if not np.isfinite(delta):
            # A missing (NaN) value at either time point would otherwise turn
            # the cluster median into NaN and silently disable detection for
            # the whole cluster.
            continue
        deltaList.append(delta)
        planIdList.append(plan_id)

    # if cluster contains fewer than 2 plans that had values at t0 and t1, no outliers can be found
    if len(deltaList) <= 1:
        continue

    outliers = outliers_modified_z_score(deltaList)
    median = np.median(deltaList)
    stddev = np.std(deltaList)  # population standard deviation (numpy default ddof=0)
    for outlier in outliers[0]:  # outliers[0] holds indices into deltaList
        # planIdList shares deltaList's indexing, so the same index gives the planID
        outlier_rows.append((planIdList[outlier], deltaList[outlier], median, stddev))

# write possible outliers to csv
# NOTE: '% Change' holds the fractional change (0.1 means +10%); it is not multiplied by 100.
result = pd.DataFrame(outlier_rows, columns = ('PlanID','% Change','Median','Standard Deviation'))
result.to_csv(outliers_path, sep=',', index=False)
