# Compute Dataset statistics

1. General stats
  - number of matches (each match has a different background and camera angle)
  - number of rally videos
  - number of frames
  - number of hits
2. Performance of object annotation pipeline
  - visibility, accuracy of near player, far player, shuttle
  - also precision (P) and recall (R) for the shuttle
  
* Only frames labelled in-play = 1 are used, because I think some videos contain a short part of a second rally after the main one, which might add noise.

In [1]:
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## General stats

In [2]:
# Root folder of the ground-truth (GT) CSVs,
# e.g. input_features/gt/domain/domain_pro/test_match3/1_05_02_x.csv
gt_rootdir = 'input_features/gt/domain/'
# Root folder of the annotation-pipeline CSVs,
# e.g. input_features/filtered2/domain/domain_pro/match1/1_02_02_x.csv
annot_rootdir = 'input_features/filtered2/domain/'

In [3]:
def calc_gen_stats(path_y):
    """Summarise the in-play part of one rally label file.

    Args:
        path_y: Path to a label CSV containing ``inplay`` and ``player_hit``
            columns. The file name and the name of its parent directory are
            reported as the video and match identifiers.

    Returns:
        dict with keys ``match`` (parent directory name), ``vid`` (file
        name), ``numframes`` (rows with ``inplay == 1``) and ``numhit``
        (in-play rows whose ``player_hit`` is 1 or 2).
    """
    labels = pd.read_csv(path_y)

    # os.path splits on the platform's separator, so paths assembled with
    # os.path.join are parsed correctly on every OS (a literal '/' split
    # is not).
    match_name = os.path.basename(os.path.dirname(path_y))
    video_name = os.path.basename(path_y)

    # only in-play frames contribute to the statistics
    in_play = labels[labels['inplay'] == 1]

    # a hit by either side is recorded as player_hit == 1 or player_hit == 2
    num_hits = int(in_play['player_hit'].isin([1, 2]).sum())

    return {
        'match': match_name,
        'vid': video_name,
        'numframes': len(in_play),
        'numhit': num_hits,
    }

### pro

In [4]:
# Collect the general stats of every label file (*_y.csv) in the pro domain.
prefix = 'pro'
gt_rootdir_pro = os.path.join(gt_rootdir, 'domain_' + prefix)

list_dicts = []
for matchdir in sorted(os.listdir(gt_rootdir_pro)):
    match_path = os.path.join(gt_rootdir_pro, matchdir)
    for gtcsv in os.listdir(match_path):
        # anything that is not a label file is ignored
        if not gtcsv.endswith('_y.csv'):
            continue
        gt_y = os.path.join(match_path, gtcsv)
        gen_dict = calc_gen_stats(gt_y)
        list_dicts.append(gen_dict.copy())

# one row per rally video
df_gen_pro = pd.DataFrame(list_dicts)
df_gen_pro.head()

Unnamed: 0,match,vid,numframes,numhit
0,match1,1_02_00_y.csv,484,18
1,match1,1_06_08_y.csv,67,4
2,match1,1_06_06_y.csv,138,4
3,match1,1_03_06_y.csv,221,8
4,match1,1_02_01_y.csv,165,5


### am_singles

In [5]:
# Collect the general stats of every label file (*_y.csv) in the am_singles domain.
prefix = 'am_singles'
gt_rootdir_am_singles = os.path.join(gt_rootdir, 'domain_' + prefix)

list_dicts = []
for matchdir in sorted(os.listdir(gt_rootdir_am_singles)):
    match_path = os.path.join(gt_rootdir_am_singles, matchdir)
    for gtcsv in os.listdir(match_path):
        # anything that is not a label file is ignored
        if not gtcsv.endswith('_y.csv'):
            continue
        gt_y = os.path.join(match_path, gtcsv)
        gen_dict = calc_gen_stats(gt_y)
        list_dicts.append(gen_dict.copy())

# one row per rally video
df_gen_am_singles = pd.DataFrame(list_dicts)
df_gen_am_singles.head()

Unnamed: 0,match,vid,numframes,numhit
0,match24,1_01_04_y.csv,203,6
1,match24,1_04_05_y.csv,44,1
2,match24,1_02_04_y.csv,703,19
3,match24,1_05_05_y.csv,113,3
4,match24,1_01_03_y.csv,276,8


### am-doubles

In [6]:
# Collect the general stats of every label file (*_y.csv) in the am_doubles domain.
prefix = 'am_doubles'
gt_rootdir_am_doubles = os.path.join(gt_rootdir, 'domain_' + prefix)

list_dicts = []
for matchdir in sorted(os.listdir(gt_rootdir_am_doubles)):
    match_path = os.path.join(gt_rootdir_am_doubles, matchdir)
    for gtcsv in os.listdir(match_path):
        # anything that is not a label file is ignored
        if not gtcsv.endswith('_y.csv'):
            continue
        gt_y = os.path.join(match_path, gtcsv)
        gen_dict = calc_gen_stats(gt_y)
        list_dicts.append(gen_dict.copy())

# one row per rally video
df_gen_am_doubles = pd.DataFrame(list_dicts)
df_gen_am_doubles.head()

Unnamed: 0,match,vid,numframes,numhit
0,match_china,doubles1_y.csv,282,11
1,match_china,doubles2_y.csv,288,10
2,match_china,doubles3_y.csv,276,10
3,match_china,doubles0_y.csv,248,6
4,match_clementi,doubles5_y.csv,440,17


### Observe gen stats

#### vid level

In [7]:
# Domain-wide totals. sum() concatenates the string columns ('match', 'vid'),
# so only the 'numframes' and 'numhit' entries are meaningful.
df_gen_pro.sum()

match        match1match1match1match1match1match1match1matc...
vid          1_02_00_y.csv1_06_08_y.csv1_06_06_y.csv1_03_06...
numframes                                                70982
numhit                                                    2587
dtype: object

In [8]:
# Non-null entries per column; each row is one label file, i.e. one rally video.
df_gen_pro.count()

match        173
vid          173
numframes    173
numhit       173
dtype: int64

In [9]:
# Domain-wide totals. sum() concatenates the string columns ('match', 'vid'),
# so only the 'numframes' and 'numhit' entries are meaningful.
df_gen_am_singles.sum()

match        match24match24match24match24match24match24matc...
vid          1_01_04_y.csv1_04_05_y.csv1_02_04_y.csv1_05_05...
numframes                                                11393
numhit                                                     331
dtype: object

In [10]:
# Non-null entries per column; each row is one label file, i.e. one rally video.
df_gen_am_singles.count()

match        35
vid          35
numframes    35
numhit       35
dtype: int64

In [11]:
# Domain-wide totals. sum() concatenates the string columns ('match', 'vid'),
# so only the 'numframes' and 'numhit' entries are meaningful.
df_gen_am_doubles.sum()

match        match_chinamatch_chinamatch_chinamatch_chinama...
vid          doubles1_y.csvdoubles2_y.csvdoubles3_y.csvdoub...
numframes                                                 7115
numhit                                                     264
dtype: object

In [12]:
# Non-null entries per column; each row is one label file, i.e. one rally video.
df_gen_am_doubles.count()

match        20
vid          20
numframes    20
numhit       20
dtype: int64

#### match level

In [13]:
# Per-match totals of frames and hits. numeric_only=True keeps the string
# column 'vid' out of the aggregation on every pandas version (pandas >= 2.0
# would otherwise concatenate the file names into the result).
df_gen_pro.groupby('match').sum(numeric_only=True)

Unnamed: 0_level_0,numframes,numhit
match,Unnamed: 1_level_1,Unnamed: 2_level_1
match1,2602,86
match10,2471,82
match11,2594,91
match12,3036,98
match13,2186,83
match14,2170,83
match15,2282,95
match16,1835,73
match17,2275,81
match18,2328,100


In [14]:
# Per-match totals of frames and hits. numeric_only=True keeps the string
# column 'vid' out of the aggregation on every pandas version (pandas >= 2.0
# would otherwise concatenate the file names into the result).
df_gen_am_singles.groupby('match').sum(numeric_only=True)

Unnamed: 0_level_0,numframes,numhit
match,Unnamed: 1_level_1,Unnamed: 2_level_1
match24,2876,75
match25,2863,79
match26,2640,74
match_china2,1052,33
match_clementi,1205,48
match_yewtee,757,22


In [15]:
# Per-match totals of frames and hits. numeric_only=True keeps the string
# column 'vid' out of the aggregation on every pandas version (pandas >= 2.0
# would otherwise concatenate the file names into the result).
df_gen_am_doubles.groupby('match').sum(numeric_only=True)

Unnamed: 0_level_0,numframes,numhit
match,Unnamed: 1_level_1,Unnamed: 2_level_1
match_china,1094,37
match_clementi,3187,122
match_msia,1946,70
match_yewtee,888,35


#### train-test level

In [16]:
# train vids: every match that is not one of the three held-out test matches
test_matches = ['test_match1', 'test_match2', 'test_match3']
df_gen_pro[~df_gen_pro['match'].isin(test_matches)].sum()

match        match1match1match1match1match1match1match1matc...
vid          1_02_00_y.csv1_06_08_y.csv1_06_06_y.csv1_03_06...
numframes                                                60379
numhit                                                    2222
dtype: object

In [17]:
# test vids: only the three held-out test matches
test_matches = ['test_match1', 'test_match2', 'test_match3']
df_gen_pro[df_gen_pro['match'].isin(test_matches)].sum()

match        test_match1test_match1test_match1test_match1te...
vid          2_03_10_y.csv1_07_03_y.csv1_06_03_y.csv1_09_07...
numframes                                                10603
numhit                                                     365
dtype: object

## Annot pipeline performance

In [18]:
def calc_visibility(num_vis, num_tot):
    """Return the visible fraction ``num_vis / num_tot`` (meant for GT only)."""
    visible_fraction = num_vis / num_tot
    return visible_fraction

def calc_acc(num_correct, num_vis):
    """Return ``num_correct / num_vis`` (annotation compared against GT)."""
    accuracy = num_correct / num_vis
    return accuracy

def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Widths and heights are computed as ``x2 - x1 + 1`` / ``y2 - y1 + 1``.
    NOTE(review): that +1 matches inclusive integer pixel coordinates;
    confirm the box columns are not stored in normalised units.
    """
    # corners of the overlap rectangle
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # an empty overlap is clamped to zero width / height
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, bottom - top + 1)
    overlap = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # union = sum of both areas minus the part that was counted twice
    union = float(area_a + area_b - overlap)
    return overlap / union

In [19]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def calc_stats(df_gt, df_y, df_annot, thresh_iou=0.5, shuttle_dist_thresh=0.01):
    """Compare pipeline annotations with ground truth on the in-play frames.

    Args:
        df_gt: Ground-truth table with columns ``Frame``, ``vis_near``,
            ``vis_far``, ``near_x1``/``near_y1``/``near_x2``/``near_y2``,
            ``far_x1``/``far_y1``/``far_x2``/``far_y2``, ``ball_x`` and
            ``ball_y``.
        df_y: Label table with columns ``frame`` and ``inplay``; only rows
            with ``inplay == 1`` are evaluated.
        df_annot: Pipeline output with the same ``Frame``, box and ball
            columns as ``df_gt``.
        thresh_iou: A box counts as correct when its IoU with the
            ground-truth box is strictly greater than this value.
        shuttle_dist_thresh: A shuttle detection counts as a true positive
            when its Euclidean distance to the ground-truth position is
            strictly below this value (same units as ``ball_x``/``ball_y``).

    Returns:
        dict with the keys ``near_vis``, ``far_vis``, ``near_acc``,
        ``far_acc``, ``shuttle_vis``, ``shuttle_prec``, ``shuttle_rec`` and
        ``shuttle_acc``. A ratio whose denominator is zero (e.g. no in-play
        frames, no visible box, no shuttle detection) is reported as NaN
        instead of raising.

    Raises:
        IndexError: if an in-play frame has no row in ``df_gt`` or
            ``df_annot``.
    """
    near_cols = ['near_x1', 'near_y1', 'near_x2', 'near_y2']
    far_cols = ['far_x1', 'far_y1', 'far_x2', 'far_y2']
    ball_cols = ['ball_x', 'ball_y']

    # get in-play frames
    frames_inplay = df_y.loc[df_y['inplay'] == 1, 'frame'].astype(int).values
    tot_frames = len(frames_inplay)

    # Visibility is counted on exactly the in-play frames (membership rather
    # than a first..last range), so the counts share their denominator with
    # tot_frames even when the in-play frames are unsorted or have gaps.
    gt_inplay = df_gt[df_gt['Frame'].isin(frames_inplay)]
    num_near_bbox_vis = int((gt_inplay['vis_near'] == 1).sum())
    num_far_bbox_vis = int((gt_inplay['vis_far'] == 1).sum())
    # here the shuttle is visible when both coordinates are non-zero
    num_shuttle_vis = int(((gt_inplay['ball_x'] != 0) & (gt_inplay['ball_y'] != 0)).sum())

    # NOTE(review): the "correct" counters below run over every in-play frame,
    # while the accuracy denominators only count GT-visible frames - confirm
    # that this pairing is intended.
    num_near_bbox_correct = 0
    num_far_bbox_correct = 0
    tp, tn, fp, fn = 0, 0, 0, 0
    for fr in frames_inplay:
        gt_rows = df_gt[df_gt['Frame'] == fr]
        annot_rows = df_annot[df_annot['Frame'] == fr]

        # process bbox ([0] = first row recorded for this frame)
        bbox_near_gt = gt_rows[near_cols].to_numpy()[0]
        bbox_far_gt = gt_rows[far_cols].to_numpy()[0]
        bbox_near_annot = annot_rows[near_cols].to_numpy()[0]
        bbox_far_annot = annot_rows[far_cols].to_numpy()[0]

        if bb_intersection_over_union(bbox_near_gt, bbox_near_annot) > thresh_iou:
            num_near_bbox_correct += 1
        if bb_intersection_over_union(bbox_far_gt, bbox_far_annot) > thresh_iou:
            num_far_bbox_correct += 1

        # process shuttle: a position with any non-zero coordinate is "present"
        shuttle_gt = gt_rows[ball_cols].to_numpy()[0]
        shuttle_annot = annot_rows[ball_cols].to_numpy()[0]
        gt_present = np.any(shuttle_gt)
        annot_present = np.any(shuttle_annot)

        if gt_present and annot_present:
            if np.linalg.norm(shuttle_gt - shuttle_annot) < shuttle_dist_thresh:
                tp += 1
            else:
                # detected, but too far from the true position
                fp += 1
        elif gt_present:
            fn += 1
        elif annot_present:
            fp += 1
        else:
            tn += 1

    # compute stats (NaN whenever a denominator is zero)
    return {
        'near_vis': _safe_ratio(num_near_bbox_vis, tot_frames),
        'far_vis': _safe_ratio(num_far_bbox_vis, tot_frames),
        'near_acc': _safe_ratio(num_near_bbox_correct, num_near_bbox_vis),
        'far_acc': _safe_ratio(num_far_bbox_correct, num_far_bbox_vis),
        'shuttle_vis': _safe_ratio(num_shuttle_vis, tot_frames),
        'shuttle_prec': _safe_ratio(tp, tp + fp),
        'shuttle_rec': _safe_ratio(tp, tp + fn),
        'shuttle_acc': _safe_ratio(tp + tn, tot_frames),
    }

In [20]:
# Same two roots as in the general-stats section, assigned again here.
# GT CSVs, e.g. input_features/gt/domain/domain_pro/test_match3/1_05_02_x.csv
gt_rootdir = 'input_features/gt/domain/'
# Annotation-pipeline CSVs, e.g. input_features/filtered2/domain/domain_pro/match1/1_02_02_x.csv
annot_rootdir = 'input_features/filtered2/domain/'

### pro

In [21]:
# Score the annotation pipeline against GT for every pro-domain video.
prefix = 'pro'
gt_rootdir_pro = os.path.join(gt_rootdir, 'domain_' + prefix)
annot_rootdir_pro = os.path.join(annot_rootdir, 'domain_' + prefix)

list_dicts = []
for matchdir in sorted(os.listdir(gt_rootdir_pro)):
    for gtcsv in os.listdir(os.path.join(gt_rootdir_pro, matchdir)):
        # only the feature files (*_x.csv) drive the evaluation
        if not gtcsv.endswith('_x.csv'):
            continue

        # the annotation file mirrors the GT path under the other root
        annot_csv = os.path.join(annot_rootdir_pro, matchdir, gtcsv)
        gt_csv = os.path.join(gt_rootdir_pro, matchdir, gtcsv)
        # "<name>_x.csv" -> "<name>_y.csv" (label file next to the GT file)
        gt_y = gt_csv[:-len('x.csv')] + 'y.csv'

        print(gt_csv)

        df_gt = pd.read_csv(gt_csv)
        df_y = pd.read_csv(gt_y)
        df_annot = pd.read_csv(annot_csv)

        stats_dict = calc_stats(df_gt, df_y, df_annot, thresh_iou=0.5, shuttle_dist_thresh=0.01)
        # identify the row as "<match>/<video name without the _x.csv suffix>"
        stats_dict['file'] = os.path.join(matchdir, gtcsv[:-len('_x.csv')])
        list_dicts.append(stats_dict.copy())

df_stats_pro = pd.DataFrame(list_dicts)
df_stats_pro.head()

input_features/gt/domain/domain_pro/match1/1_02_02_x.csv
input_features/gt/domain/domain_pro/match1/1_02_00_x.csv
input_features/gt/domain/domain_pro/match1/1_03_05_x.csv
input_features/gt/domain/domain_pro/match1/1_02_04_x.csv
input_features/gt/domain/domain_pro/match1/1_02_01_x.csv
input_features/gt/domain/domain_pro/match1/1_02_03_x.csv
input_features/gt/domain/domain_pro/match1/1_06_06_x.csv
input_features/gt/domain/domain_pro/match1/1_03_04_x.csv
input_features/gt/domain/domain_pro/match1/1_01_00_x.csv
input_features/gt/domain/domain_pro/match1/1_06_09_x.csv
input_features/gt/domain/domain_pro/match1/1_03_06_x.csv
input_features/gt/domain/domain_pro/match1/1_06_08_x.csv
input_features/gt/domain/domain_pro/match10/2_04_02_x.csv
input_features/gt/domain/domain_pro/match10/1_03_01_x.csv
input_features/gt/domain/domain_pro/match10/1_03_03_x.csv
input_features/gt/domain/domain_pro/match10/2_14_08_x.csv
input_features/gt/domain/domain_pro/match10/1_12_16_x.csv
input_features/gt/domain/d

Unnamed: 0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc,file
0,1.0,1.0,1.0,1.0,0.870813,1.0,1.0,1.0,match1/1_02_02
1,1.0,1.0,1.0,1.0,0.919421,1.0,1.0,1.0,match1/1_02_00
2,1.0,1.0,1.0,1.0,0.935933,1.0,1.0,1.0,match1/1_03_05
3,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,match1/1_02_04
4,1.0,1.0,1.0,1.0,0.909091,1.0,1.0,1.0,match1/1_02_01


### am-singles

In [22]:
# Score the annotation pipeline against GT for every am_singles video.
prefix = 'am_singles'
gt_rootdir_am_singles = os.path.join(gt_rootdir, 'domain_' + prefix)
annot_rootdir_am_singles = os.path.join(annot_rootdir, 'domain_' + prefix)

list_dicts = []
for matchdir in sorted(os.listdir(gt_rootdir_am_singles)):
    for gtcsv in os.listdir(os.path.join(gt_rootdir_am_singles, matchdir)):
        # only the feature files (*_x.csv) drive the evaluation
        if not gtcsv.endswith('_x.csv'):
            continue

        # the annotation file mirrors the GT path under the other root
        annot_csv = os.path.join(annot_rootdir_am_singles, matchdir, gtcsv)
        gt_csv = os.path.join(gt_rootdir_am_singles, matchdir, gtcsv)
        # "<name>_x.csv" -> "<name>_y.csv" (label file next to the GT file)
        gt_y = gt_csv[:-len('x.csv')] + 'y.csv'

        print(gt_csv)

        df_gt = pd.read_csv(gt_csv)
        df_y = pd.read_csv(gt_y)
        df_annot = pd.read_csv(annot_csv)

        stats_dict = calc_stats(df_gt, df_y, df_annot, thresh_iou=0.5, shuttle_dist_thresh=0.01)
        # identify the row as "<match>/<video name without the _x.csv suffix>"
        stats_dict['file'] = os.path.join(matchdir, gtcsv[:-len('_x.csv')])
        list_dicts.append(stats_dict.copy())

df_stats_am_singles = pd.DataFrame(list_dicts)
df_stats_am_singles.head()

input_features/gt/domain/domain_am_singles/match24/1_00_01_x.csv
input_features/gt/domain/domain_am_singles/match24/1_05_05_x.csv
input_features/gt/domain/domain_am_singles/match24/1_01_04_x.csv
input_features/gt/domain/domain_am_singles/match24/1_01_03_x.csv
input_features/gt/domain/domain_am_singles/match24/1_03_05_x.csv
input_features/gt/domain/domain_am_singles/match24/1_04_05_x.csv
input_features/gt/domain/domain_am_singles/match24/1_02_04_x.csv
input_features/gt/domain/domain_am_singles/match24/1_01_02_x.csv
input_features/gt/domain/domain_am_singles/match24/1_01_01_x.csv
input_features/gt/domain/domain_am_singles/match24/1_03_04_x.csv
input_features/gt/domain/domain_am_singles/match25/1_06_03_x.csv
input_features/gt/domain/domain_am_singles/match25/1_05_03_x.csv
input_features/gt/domain/domain_am_singles/match25/1_02_00_x.csv
input_features/gt/domain/domain_am_singles/match25/1_05_01_x.csv
input_features/gt/domain/domain_am_singles/match25/1_05_00_x.csv
input_features/gt/domain/

Unnamed: 0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc,file
0,1.0,0.835106,0.808511,1.0,0.803191,1.0,1.0,1.0,match24/1_00_01
1,1.0,1.0,1.0,1.0,0.831858,1.0,1.0,1.0,match24/1_05_05
2,1.0,1.0,1.0,1.0,0.8867,1.0,1.0,1.0,match24/1_01_04
3,1.0,1.0,1.0,1.0,0.934783,1.0,1.0,1.0,match24/1_01_03
4,1.0,1.0,1.0,1.0,0.908441,1.0,1.0,1.0,match24/1_03_05


### Observe stats (plus sanity check)

Annotation pipeline working decently. 

The only problem is the Clementi singles videos: there are extra people near the court and they end up getting detected. 

In [23]:
# The match name is the first '/'-separated component of the 'file' column.
matchnames_pro = df_stats_pro['file'].str.split('/').str[0].tolist()
df_stats_pro['match'] = matchnames_pro

matchnames_am_singles = df_stats_am_singles['file'].str.split('/').str[0].tolist()
df_stats_am_singles['match'] = matchnames_am_singles

#### video level

In [24]:
# Distribution of each per-video metric over the pro domain.
df_stats_pro.describe()

Unnamed: 0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,0.998709,0.998709,0.999294,0.983277,0.873092,1.0,1.0,1.0
std,0.011997,0.011997,0.00502,0.047483,0.097217,0.0,0.0,0.0
min,0.881579,0.881579,0.937729,0.683784,0.495238,1.0,1.0,1.0
25%,1.0,1.0,1.0,0.995546,0.825843,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,0.890187,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,0.946203,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Distribution of each per-video metric over the am_singles domain.
df_stats_am_singles.describe()

Unnamed: 0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc
count,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0
mean,0.993652,0.934697,0.921722,0.954937,0.831801,0.921822,0.917373,0.917311
std,0.02087,0.091472,0.170695,0.113573,0.1406,0.179509,0.156855,0.155461
min,0.894958,0.681499,0.201527,0.578182,0.417062,0.391026,0.402985,0.487578
25%,1.0,0.860387,0.938018,1.0,0.770104,0.955882,0.909853,0.918067
50%,1.0,0.998473,1.0,1.0,0.88697,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,0.924594,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### match level

In [26]:
# Per-match mean of every metric. numeric_only=True skips the string column
# 'file'; pandas >= 2.0 no longer drops it silently and mean() would raise
# TypeError on it.
df_stats_pro.groupby('match').mean(numeric_only=True)

Unnamed: 0_level_0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc
match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
match1,1.0,1.0,1.0,1.0,0.861166,1.0,1.0,1.0
match10,1.0,1.0,1.0,1.0,0.907647,1.0,1.0,1.0
match11,1.0,1.0,1.0,0.998119,0.885028,1.0,1.0,1.0
match12,1.0,1.0,0.999725,0.959001,0.979819,1.0,1.0,1.0
match13,1.0,1.0,1.0,1.0,0.727283,1.0,1.0,1.0
match14,1.0,1.0,1.0,1.0,0.956235,1.0,1.0,1.0
match15,1.0,1.0,1.0,0.999669,0.884259,1.0,1.0,1.0
match16,1.0,1.0,0.987546,0.994505,0.922376,1.0,1.0,1.0
match17,1.0,1.0,0.996786,0.809889,0.938297,1.0,1.0,1.0
match18,1.0,1.0,1.0,1.0,0.817997,1.0,1.0,1.0


In [27]:
# issues with bbox for clementi
# Per-match mean of every metric. numeric_only=True skips the string column
# 'file'; pandas >= 2.0 no longer drops it silently and mean() would raise
# TypeError on it.
df_stats_am_singles.groupby('match').mean(numeric_only=True)

Unnamed: 0_level_0,near_vis,far_vis,near_acc,far_acc,shuttle_vis,shuttle_prec,shuttle_rec,shuttle_acc
match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
match24,1.0,0.983511,0.980851,1.0,0.895193,1.0,1.0,1.0
match25,0.99974,0.82477,1.0,1.0,0.878568,1.0,1.0,1.0
match26,1.0,0.97375,0.983036,0.998972,0.924956,1.0,1.0,1.0
match_china2,0.9713,0.912332,0.903454,1.0,0.582879,0.446149,0.601028,0.565273
match_clementi,1.0,0.999237,0.362582,0.648418,0.715135,0.888096,0.817157,0.803456
match_yewtee,0.964986,0.997438,0.743841,0.711047,0.672507,0.900993,0.689879,0.745961
