In [None]:
from glob import glob
import pandas as pd

In [None]:
# Disabled cell: counts the training frames per scene for the table used in the report.
# The code is wrapped in a string literal so it does not run, because it needs the
# extracted training images on disk and those may not be downloaded (to save space).
# NOTE(review): as the only expression in the cell, the string itself is echoed as the cell output.
'''video_dir = "../shanghaitech/training/videos/*/"

all_frames = {}
for f in glob(f"{video_dir}"):
  scene = f.split("/")[-2].split('_')[0]
  frames = len(glob(f + '*.jpg'))

  try:
    all_frames[scene] = all_frames[scene] + frames
  except:
    all_frames[scene] = frames'''

In [None]:
#all_frames_df = pd.DataFrame.from_dict(all_frames, orient='index', columns=['frames'])

In [None]:
# Load the training detections table; the `video` and `personID` columns are used below.
df = pd.read_csv('./data/stc-train_tracked_poses_l.csv')

In [None]:
# Disabled cell (string literal, not executed): builds the per-scene table of training
# frames and training detections. It needs `all_frames_df`, which is only created by the
# commented-out line above, so it stays off while the frame-count code is disabled.
'''df['Scene'] = df['video'].apply(lambda x: x.split('_')[0])
train_dets = df.groupby('Scene').personID.count()

all_frames_df.rename(columns={'frames': 'Training frames'}, inplace=True)
frame_counts = all_frames_df.merge(train_dets, left_index=True, right_index=True)
frame_counts.rename(columns={'personID': 'Training detections'}, inplace=True)
frame_counts['Train pose detections per frame'] = frame_counts['Training detections']/frame_counts['Training frames']
frame_counts'''

In [None]:
# Scene id = the part of the video name before the first underscore.
df['Scene'] = [video_name.partition('_')[0] for video_name in df['video']]
# Number of non-null personID entries (pose detections) per scene in the training table.
train_dets = df.groupby('Scene')['personID'].count()
# Starting point of the per-scene summary table that later cells extend by merging.
frame_counts = train_dets

# VSA Encoding (TRAIN)

In [None]:
from src.vsa_encoding import *

In [None]:
# Sorted, de-duplicated two-character prefixes of the training video names.
scenes = sorted({video_name[:2] for video_name in df['video'].unique()})

In [None]:
import gc
gc.collect()

# Per-scene training output:
#   scene prefix -> (fine_tuned_prototypes, thresholds, aspect_thresh, time_thresh)
inference_components = {}

# Size arguments passed to hd.random below; `bins` is also forwarded to
# build_encoding_df and encode_vsa.
d = 10000
bins = 5


# Random codebooks, one hd.random call per role.
# NOTE(review): presumably each call returns one d-dimensional hypervector per row — confirm against `hd`.
# NOTE(review): no random seed is set in the visible code, so these draws (and everything
# derived from them) likely differ between runs — confirm and seed if reproducibility matters.
# NOTE(review): Joints, Speeds and Wrists are not referenced again in the visible notebook code;
# removing them would change the order of random draws for the codebooks that follow.
Joints = hd.random(5,d)
Speeds = hd.random(bins,d)
Aspect = hd.random(bins,d)
Pos = hd.random((bins*bins)+1,d)
Times = hd.random(bins,d)
Wrists = hd.random(5,d)
Postures = hd.random(5, d)
Features = hd.random(6,d)

# Number of clusters requested from cluster() for every scene.
clusters = 10

for scene in scenes:
    print(f"Processing scene: {scene}")
    # All training detections whose video name starts with this scene prefix.
    detections = df[df['video'].str.startswith(scene)]

    # Thresholds are passed as None here and returned by the call; the returned values
    # are stored per scene below and passed back to build_encoding_df at test time.
    detections, aspect_thresh, time_thresh = build_encoding_df(detections, bins=bins, aspect_thresh=None, time_thresh=None)

    vsa = encode_vsa(detections, Features, Pos, Aspect, Times, Postures, bins=bins)

    # build_temporal_enc returns a sequence of tensors, stacked here into a single tensor.
    temporal_vecs = build_temporal_enc(detections, vsa)
    temporal_vecs = torch.stack(temporal_vecs)


    prototypes, kmeans = cluster(temporal_vecs, clusters)

    # cluster() may return no k-means model; in that case every vector gets label 0.
    if kmeans is None:
            labels = np.array([0]*len(temporal_vecs))
    else:
            labels = kmeans.labels_

    # NOTE(review): the result of this first call is overwritten two statements below without
    # being read in between; it is redundant unless compute_cluster_thresholds has side effects.
    thresholds = compute_cluster_thresholds(prototypes, temporal_vecs, labels)
    fine_tuned_prototypes = fine_tune_prototypes(prototypes, temporal_vecs, labels)
    thresholds = compute_cluster_thresholds(fine_tuned_prototypes, temporal_vecs, labels)


    # Store only what is needed at inference time
    inference_components[scene] = (fine_tuned_prototypes, thresholds,
                                   aspect_thresh, time_thresh)

    # Free memory from intermediate steps.
    # `aspect_thresh` and `time_thresh` are not deleted, so the values of the last
    # scene remain defined as notebook globals after this loop.
    del detections, vsa, temporal_vecs, prototypes, kmeans, thresholds
    gc.collect()

In [None]:
import pickle

# Persist the per-scene inference components dictionary to disk.
# Pickle files can execute code when loaded, so only load this file from a trusted source.
with open('./data/shanghaitech/inference_components.pkl', 'wb') as f:
    pickle.dump(inference_components, f)

# Test

In [None]:
# Load the test detections table; the `video`, `frameID` and `personID` columns are used below.
test = pd.read_csv('./data/stc-test_tracked_poses_l.csv')

In [None]:
# Sorted unique test video names. Despite the variable name, each entry is a full
# `video` value rather than a two-character scene prefix.
scenes = sorted(set(test['video'].unique()))

In [None]:
import gc
import time

gc.collect()

# Maps the first `video` value of each processed group to
# [output of evaluate_test_vectors, list of frameID values of its detections].
preds = {}

start = time.time()
# `scenes` holds full test video names here, so this iterates video by video.
for scene in scenes:
    print(f"Processing scene: {scene}")
    test_detections = test[test['video'].str.startswith(scene)]

    # Fetch the components trained for this video's scene (first two characters of the
    # name) BEFORE encoding. The thresholds must belong to the current video: reading them
    # after build_encoding_df would bin each video with the thresholds of the previously
    # processed video (or, on the first pass, with whatever the training loop left behind).
    scene_key = scene[:2]
    if scene_key not in inference_components:
        raise KeyError(f"No trained inference components for scene {scene_key!r} (video {scene!r})")
    prototypes, thresholds, aspect_thresh, time_thresh = inference_components[scene_key]

    # `bins` is passed explicitly so the test encoding uses the same setting as the training call.
    test_detections, _, _ = build_encoding_df(test_detections, bins=bins, aspect_thresh=aspect_thresh, time_thresh=time_thresh)
    test_vsa = encode_vsa(test_detections, Features, Pos, Aspect, Times, Postures, bins=bins)

    # build_temporal_enc returns a sequence of tensors, stacked here into a single tensor.
    test_temporal_vecs = build_temporal_enc(test_detections, test_vsa)
    test_temporal_vecs = torch.stack(test_temporal_vecs)

    scores = evaluate_test_vectors(prototypes, thresholds, test_temporal_vecs)

    preds[test_detections['video'].iloc[0]] = [scores, test_detections['frameID'].tolist()]

    # Free memory from intermediate steps
    del test_detections, test_vsa, test_temporal_vecs
    gc.collect()

end = time.time()

# Wall-clock seconds for the whole loop (includes the per-video gc.collect() calls).
latency = end - start

latency

In [None]:
# Load the ground-truth labels; the `Video`, `Frame` and `anomalyPresence` columns are used below.
truth = pd.read_csv('./data/ShanghaiTech-Labels.csv')

In [None]:
# Scene id = the part of the video id before the first underscore.
truth['Scene'] = [video.partition('_')[0] for video in truth['Video']]

# Count labelled frames per (scene, anomalyPresence) pair, then spread the two label
# values into the columns 'No Anomaly' (0) and 'Anomaly' (1), one row per scene.
# 'Scene' is kept both as a column and as the index.
anomaly_counts = (
    truth.groupby(['Scene', 'anomalyPresence'])['Frame']
    .count()
    .reset_index()
    .pivot(columns='anomalyPresence', index='Scene', values='Frame')
    .reset_index()
    .rename(columns={0: 'No Anomaly', 1: 'Anomaly'})
    .set_index('Scene', drop=False)
)
# Extend the running per-scene summary. This rebinds `frame_counts`, so re-running the
# cell without re-running the earlier ones merges the counts in a second time.
frame_counts = anomaly_counts.merge(frame_counts, left_index=True, right_index=True)

In [None]:
# Scene id = the part of the test video name before the first underscore.
test['Scene'] = [video.partition('_')[0] for video in test['video']]
# Number of non-null personID entries (pose detections) per scene in the test table.
# 'Scene' is kept both as a column and as the index.
test_frames = (
    test.groupby('Scene')['personID']
    .count()
    .reset_index()
    .rename(columns={'personID': 'Test pose detections'})
    .set_index('Scene', drop=False)
)
# Extend the running per-scene summary (rebinding `frame_counts`, so not idempotent on re-run).
frame_counts = frame_counts.merge(test_frames, left_index=True, right_index=True)

Morais et al. (2019) filtering: the following videos are removed so that only human-related (HR) anomalies are evaluated (see the written work for the full citation).

- Camera 01: Videos 0130, 0135 and 0136;
- Camera 06: Videos 0144 and 0145;
- Camera 12: Video 0152.

In [None]:
# HR filtering (Morais et al., 2019): drop the six listed videos from the ground truth.
# Every id must be its own list element: adjacent string literals without a comma are
# concatenated ('0144' '0145' -> '01440145'), which would silently keep both videos.
hr_excluded_videos = ['01_0130', '01_0135', '01_0136', '06_0144', '06_0145', '12_0152']
# `Video` is split on '_' elsewhere in this notebook to obtain the scene, so ids presumably
# look like '<camera>_<video>' (confirm against the labels file); the bare video numbers
# are matched as well in case they do not.
hr_excluded_ids = hr_excluded_videos + [video.split('_')[1] for video in hr_excluded_videos]
# .copy() so that columns added to `truth` later do not write into a slice of the unfiltered frame.
truth = truth[~truth['Video'].isin(hr_excluded_ids)].copy()

In [None]:
# Flatten the per-video predictions into one record per detection.
# Each `preds` value is [outputs, frame_ids]; outputs[0] supplies AnomalyScore and
# outputs[2] supplies Thresh, both indexed by the detection's position in frame_ids.
recs = []
for video_name, entry in preds.items():
    outputs, frame_ids = entry[0], entry[1]
    anomaly_scores = outputs[0]
    frame_threshes = outputs[2]
    recs.extend(
        {"vidID": video_name, "frameID": frame_id, "AnomalyScore": anomaly_scores[pos], "Thresh": frame_threshes[pos]}
        for pos, frame_id in enumerate(frame_ids)
    )

all_res = pd.DataFrame(recs)
all_res

In [None]:
# One row per (video, frame): keep the minimum AnomalyScore over all detections in that frame.
results = (
    all_res.groupby(['vidID', 'frameID'])[['AnomalyScore']]
    .min()
    .reset_index()
)

In [None]:
# Duplicate the key columns under the names used by `results`, so both tables can be
# joined on ['vidID', 'frameID'].
truth = truth.assign(vidID=truth['Video'], frameID=truth['Frame'])

In [None]:
# Remove every '_frames' occurrence from the prediction video ids before joining with the truth table.
results['vidID'] = [video_id.replace('_frames', '') for video_id in results['vidID']]

In [None]:
# Left join: every ground-truth frame is kept; frames without a matching prediction
# get NaN in AnomalyScore.
comparison = pd.merge(truth, results, how='left', on=['vidID', 'frameID'])

In [None]:
# Display the ground-truth rows joined with the per-frame predictions
# (AnomalyScore is NaN where the left join found no prediction).
comparison

In [None]:
def custom_fill(group):
    """Fill missing AnomalyScore values within one video's rows.

    If the first row has no score it is set to 1; every remaining gap is then
    forward-filled from the closest preceding row. The input frame is left
    untouched and a filled copy is returned.
    """
    filled = group.copy()
    score_col = filled.columns.get_loc('AnomalyScore')
    # Seed the first row so the forward fill always has a value to propagate.
    if pd.isna(filled.iat[0, score_col]):
        filled.iat[0, score_col] = 1
    filled['AnomalyScore'] = filled['AnomalyScore'].ffill()
    return filled

# Order frames within each video first (the forward fill depends on row order),
# then fill the score gaps video by video.
comparison = (
    comparison.sort_values(['vidID', 'frameID'])
    .groupby('vidID', group_keys=False)
    .apply(custom_fill)
)

In [None]:
# Smooth the per-frame score inside each video with a centred 25-row rolling mean.
# min_periods=1 lets the window shrink at the start and end of a video.
ordered_scores = comparison.sort_values(['vidID', 'frameID'])  # rolling needs frame order
comparison['smoothed_score'] = ordered_scores.groupby('vidID')['AnomalyScore'].transform(
    lambda video_scores: video_scores.rolling(window=25, center=True, min_periods=1).mean()
)

In [None]:
# Invert the smoothed score (1 - x). Re-running this cell on its own inverts it back,
# so re-run the smoothing cell first.
comparison['smoothed_score'] = 1 - comparison['smoothed_score']

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Frame-level ROC AUC over all rows of `comparison`.
roc_auc_score(
    y_true=comparison['anomalyPresence'],
    y_score=comparison['smoothed_score'],
)

In [None]:
# Fraction of rows whose AnomalyScore is still missing.
comparison['AnomalyScore'].isna().sum() / len(comparison)

In [None]:
# Scene id = first two characters of the video id.
comparison['Scene'] = [video[:2] for video in comparison['Video']]

# ROC AUC per scene, visited in order of first appearance.
auc = {}
for scene, scene_rows in comparison.groupby('Scene', sort=False):
    auc[scene] = roc_auc_score(scene_rows['anomalyPresence'], scene_rows['smoothed_score'])

In [None]:
# One-column frame of per-scene AUC values, indexed by scene.
auc_df = pd.DataFrame(list(auc.items()), columns=['Scene', 'AUC']).set_index('Scene')

# Extend the running per-scene summary (rebinding `frame_counts`, so not idempotent on re-run).
frame_counts = frame_counts.merge(auc_df, left_index=True, right_index=True)

In [None]:
# Display the per-scene summary table built up by the merges so far.
frame_counts

In [None]:
# Test pose detections divided by the number of labelled frames (anomalous + normal) per scene.
labelled_frames = frame_counts['Anomaly'] + frame_counts['No Anomaly']
frame_counts['Test detections per frame'] = frame_counts['Test pose detections'] / labelled_frames

In [None]:
# Write the per-scene summary table to CSV.
frame_counts.to_csv('data/shanghaitech/frame_corr_data.csv')