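This notebook performs per-agent outlier detection using the Local Outlier Factor (LOF) and compares the results against a random-guessing baseline.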

Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import auc

Read in data and group by agent

In [None]:
# Load the feature table from disk; the functions below read it through the
# module-level name `df`.
df = pd.read_feather(path='generated_data/reduced_kinematics.feather')

# Bucket the rows by agent so that a single agent's rows can be fetched by name.
groups = df.groupby(by='agent_name')

The following function performs LOF outlier detection, treating a single agent's trips as "normal" data and injecting randomly chosen trips from other agents as outliers.

In [None]:
def outlier_detection(grp):
    """Evaluate LOF outlier detection for one agent with injected foreign trips.

    The rows of agent ``grp`` (looked up in the module-level ``groups``) are
    treated as normal data. Roughly 3% as many rows, drawn uniformly at random
    with replacement from the other agents' rows in the module-level ``df``,
    are appended as known outliers. All rows are then ranked by their Local
    Outlier Factor score and the area under the resulting precision-recall
    curve is computed.

    Args:
        grp: Value of the ``agent_name`` column identifying the agent.

    Returns:
        A two-element list ``[num_trips, auc]``: the number of rows belonging
        to the agent (before injection) and the precision-recall AUC. The AUC
        is ``nan`` when the agent has so few rows that 3% rounds to zero
        outliers, because recall is undefined without any outliers.

    Raises:
        ValueError: If outliers are required but ``df`` contains no rows from
            any other agent.
    """
    # Get dataframe of this agent's trips
    own_trips = groups.get_group(grp).reset_index(drop=True)
    num_trips = len(own_trips)

    # Get approximately 3% of original data as outliers
    num_outliers = round(0.03 * num_trips)
    if num_outliers == 0:
        # Nothing to inject, so recall (found / num_outliers) would divide by
        # zero; report an undefined score instead of crashing.
        return [num_trips, float('nan')]

    # Positions in df of every row that belongs to a different agent
    foreign_rows = (df['agent_name'] != grp).to_numpy().nonzero()[0]
    n_foreign = len(foreign_rows)
    if n_foreign == 0:
        # Rejection sampling over df would never terminate in this situation.
        raise ValueError(
            f'no rows from other agents are available to inject for {grp!r}'
        )

    # Draw the outliers (with replacement) and append them in one step;
    # concatenating whole frames avoids repeated single-row appends.
    picks = [foreign_rows[random.randrange(n_foreign)] for _ in range(num_outliers)]
    temp = pd.concat([own_trips, df.iloc[picks]], ignore_index=True)

    # Drop unnecessary columns; the remaining ones are the LOF features
    # (assumed to be numeric -- TODO confirm against the feather schema)
    data = temp.drop(columns=['Agent_ID', 'Start_time', 'End_time', 'modality', 'agent_name'])
    data = data.to_numpy()

    # Apply LOF
    clf = LocalOutlierFactor()
    clf.fit(data)

    # Lower negative_outlier_factor_ means more abnormal, so an ascending sort
    # puts the most outlying rows first.
    temp['score'] = clf.negative_outlier_factor_
    temp = temp.sort_values(by='score')

    # Walk down the ranking and record precision/recall after each row,
    # starting from the conventional (recall=0, precision=1) point.
    p = [1]
    r = [0]
    num_found = 0
    for rank, is_foreign in enumerate((temp['agent_name'] != grp).tolist(), start=1):
        if is_foreign:
            num_found += 1
        p.append(num_found / rank)
        r.append(num_found / num_outliers)

        # Every injected outlier has been recovered; recall cannot grow further
        if num_found == num_outliers:
            break

    auc_precision_recall = auc(r, p)

    # Return (1) Size of group, (2) AUC score
    return [num_trips, auc_precision_recall]

Running 10 trials per agent, we perform outlier detection and compile the results

In [None]:
# Run the LOF evaluation repeatedly for every agent. Rows are gathered in a
# plain list and turned into a DataFrame once at the end, which avoids growing
# the frame one row at a time and lets pandas infer column dtypes from the
# complete data.
trial_rows = []

for agent in groups.groups:
    # 10 iterations per agent
    for _ in range(10):
        num_trips, auc_score = outlier_detection(agent)
        trial_rows.append([agent, num_trips, auc_score])

results = pd.DataFrame(trial_rows, columns=['agent_name', 'num_trips', 'auc'])

We can observe some statistics about our results

In [None]:
# Summary statistics of the AUC column across all LOF trials
print(results.loc[:, 'auc'].describe())

The following function performs outlier detection by random guessing: it treats a single agent's trips as "normal" data, injects randomly chosen trips from other agents as outliers, and ranks all trips in random order.

In [None]:
def random_guess(grp):
    """Evaluate a random-ranking baseline for one agent with injected trips.

    The rows of agent ``grp`` (looked up in the module-level ``groups``) are
    treated as normal data. Roughly 3% as many rows, drawn uniformly at random
    with replacement from the other agents' rows in the module-level ``df``,
    are appended as known outliers. The combined rows are shuffled, the
    shuffled order is used as the outlier ranking, and the area under the
    resulting precision-recall curve is computed.

    Args:
        grp: Value of the ``agent_name`` column identifying the agent.

    Returns:
        A two-element list ``[num_trips, auc]``: the number of rows belonging
        to the agent (before injection) and the precision-recall AUC. The AUC
        is ``nan`` when the agent has so few rows that 3% rounds to zero
        outliers, because recall is undefined without any outliers.

    Raises:
        ValueError: If outliers are required but ``df`` contains no rows from
            any other agent.
    """
    # Get dataframe of this agent's trips
    own_trips = groups.get_group(grp).reset_index(drop=True)
    num_trips = len(own_trips)

    # Get approximately 3% of original data as outliers
    num_outliers = round(0.03 * num_trips)
    if num_outliers == 0:
        # Nothing to inject, so recall (found / num_outliers) would divide by
        # zero; report an undefined score instead of crashing.
        return [num_trips, float('nan')]

    # Positions in df of every row that belongs to a different agent
    foreign_rows = (df['agent_name'] != grp).to_numpy().nonzero()[0]
    n_foreign = len(foreign_rows)
    if n_foreign == 0:
        # Rejection sampling over df would never terminate in this situation.
        raise ValueError(
            f'no rows from other agents are available to inject for {grp!r}'
        )

    # Draw the outliers (with replacement) and append them in one step;
    # concatenating whole frames avoids repeated single-row appends.
    picks = [foreign_rows[random.randrange(n_foreign)] for _ in range(num_outliers)]
    temp = pd.concat([own_trips, df.iloc[picks]], ignore_index=True)

    # Randomly shuffle rows; the shuffled order plays the role of the ranking
    temp = temp.sample(frac=1).reset_index(drop=True)

    # Walk down the ranking and record precision/recall after each row,
    # starting from the conventional (recall=0, precision=1) point.
    p = [1]
    r = [0]
    num_found = 0
    for rank, is_foreign in enumerate((temp['agent_name'] != grp).tolist(), start=1):
        if is_foreign:
            num_found += 1
        p.append(num_found / rank)
        r.append(num_found / num_outliers)

        # Every injected outlier has been recovered; recall cannot grow further
        if num_found == num_outliers:
            break

    auc_precision_recall = auc(r, p)

    # Return (1) Size of group, (2) AUC score
    return [num_trips, auc_precision_recall]

Running 10 trials per agent, we apply the random-guess baseline and compile the results

In [None]:
# Run the random-guess baseline repeatedly for every agent. Rows are gathered
# in a plain list and turned into a DataFrame once at the end, which avoids
# growing the frame one row at a time and lets pandas infer column dtypes from
# the complete data.
baseline_rows = []

for agent in groups.groups:
    # 10 iterations per agent
    for _ in range(10):
        num_trips, auc_score = random_guess(agent)
        baseline_rows.append([agent, num_trips, auc_score])

rand_results = pd.DataFrame(baseline_rows, columns=['agent_name', 'num_trips', 'auc'])

We can observe some statistics about our results

In [None]:
# Summary statistics of the AUC column across all random-guess trials
print(rand_results.loc[:, 'auc'].describe())