In [1]:
import ast

import numpy as np
import pandas as pd

### Explore training data

In [2]:
df = pd.read_csv('../../data/train.csv')

# Process the annotations column.
# Each value is read from the CSV as the string form of a Python list (e.g. "[]").
# Parse it with ast.literal_eval, which accepts only literals, instead of eval,
# which would execute any expression that happened to be stored in the data file.
df['annotations'] = df['annotations'].map(ast.literal_eval)
# Number of annotations attached to each image (0 for an empty list).
df['num_annotations'] = df['annotations'].map(len)

In [3]:
# (rows, columns); the column count includes the derived num_annotations column added above.
df.shape

(23501, 7)

In [4]:
# Peek at the first rows to check the column layout and how annotations look after parsing.
df.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,num_annotations
0,0,40258,0,0,0-0,[],0
1,0,40258,1,1,0-1,[],0
2,0,40258,2,2,0-2,[],0
3,0,40258,3,3,0-3,[],0
4,0,40258,4,4,0-4,[],0


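One thing to keep in mind: the dataset split needs to be done carefully. We don't want adjacent frames from the same sequence to end up on opposite sides of the train/test split, since that would leak near-duplicate images into the test set. Let's take a look at the statistics.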

#### Basic stats on videos, frames, sequences, etc.

In [5]:
# Distinct video ids present in the training data.
video_ids = df['video_id'].unique()
print(f"Unique video ids: {video_ids}")

Unique video ids: [0 1 2]


In [6]:
# Count the distinct sequence ids across all videos.
n_sequences = len(df['sequence'].unique())
print(f"Unique sequences: {n_sequences}")

Unique sequences: 20


In [11]:
# Per video: number of distinct sequences, and number of rows (non-null `sequence` values).
(
    df.groupby('video_id')
    .agg({'sequence': ['nunique', 'count']})
    .sort_index()
)

Unnamed: 0_level_0,sequence,sequence
Unnamed: 0_level_1,nunique,count
video_id,Unnamed: 1_level_2,Unnamed: 2_level_2
0,8,6708
1,8,8232
2,4,8561


In [15]:
# Row count for every (video_id, sequence) pair, i.e. how long each sequence is.
frames_per_sequence = df.groupby(['video_id', 'sequence']).agg({'sequence': 'count'})
frames_per_sequence.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,sequence
video_id,sequence,Unnamed: 2_level_1
0,996,923
0,8399,1423
0,35305,853
0,40258,480
0,45015,617
0,45518,798
0,53708,1077
0,59337,537
1,8503,2843
1,15827,770


#### Basic stats on annotations

In [None]:
# Split images by whether they carry at least one annotation.
is_empty = df["num_annotations"] == 0
print(f'Number empty images: {int(is_empty.sum())}')
print(f'Number non-empty images: {int((~is_empty).sum())}')

In [None]:
# Distribution of annotations per image, ordered by annotation count rather than by frequency.
annotation_counts = df['num_annotations'].value_counts()
annotation_counts.sort_index()

So we'll probably want to downsample the negative (empty) examples, though the downsampling rate may be a hyper-parameter that we tune. Separately, we may also need to downsample adjacent video frames.

In [None]:
# Show a few raw annotation values from images that have exactly one annotation.
df.loc[df['num_annotations'] == 1, 'annotations'].values[:3]

In [None]:
# Show a few raw annotation values from images that have exactly two annotations.
df.loc[df['num_annotations'] == 2, 'annotations'].values[:3]

So the annotations for an image are stored as a list of dicts. Cool.