In [1]:
import pandas as pd
import os
import shutil

In [2]:
df_train = pd.read_csv('../data/annotation/train_annotation.csv').drop(columns = 'Unnamed: 0')
df_test = pd.read_csv('../data/annotation/test_annotation.csv').drop(columns = 'Unnamed: 0')

## Annotation EDA

In [3]:
df_train.shape, df_test.shape

((51826, 10), (57649, 10))

In [4]:
df_train.isnull().sum()

Filename                     0
Annotation tag               0
Upper left corner X          0
Upper left corner Y          0
Lower right corner X         0
Lower right corner Y         0
Origin file                  0
Origin frame number          0
Origin track                 0
Origin track frame number    0
dtype: int64

In [5]:
df_test.isnull().sum()

Filename                     0
Annotation tag               0
Upper left corner X          0
Upper left corner Y          0
Lower right corner X         0
Lower right corner Y         0
Origin file                  0
Origin frame number          0
Origin track                 0
Origin track frame number    0
dtype: int64

In [6]:
df_train.dtypes

Filename                     object
Annotation tag               object
Upper left corner X           int64
Upper left corner Y           int64
Lower right corner X          int64
Lower right corner Y          int64
Origin file                  object
Origin frame number           int64
Origin track                 object
Origin track frame number     int64
dtype: object

In [7]:
df_test.dtypes

Filename                     object
Annotation tag               object
Upper left corner X           int64
Upper left corner Y           int64
Lower right corner X          int64
Lower right corner Y          int64
Origin file                  object
Origin frame number           int64
Origin track                 object
Origin track frame number     int64
dtype: object

In [8]:
df_train['Annotation tag'].value_counts()

go             22946
stop           18382
stopLeft        7707
goLeft          1236
Name: Annotation tag, dtype: int64

In [9]:
df_test['Annotation tag'].value_counts()

stop           25936
go             23777
stopLeft        5027
goLeft          1240
goForward        205
Name: Annotation tag, dtype: int64

## drop goForward: no goForward in train

In [10]:
df_test.drop(df_test.loc[df_test['Annotation tag'] == 'goForward'].index, inplace = True)

In [11]:
df_test['Annotation tag'].value_counts()

stop           25936
go             23777
stopLeft        5027
goLeft          1240
Name: Annotation tag, dtype: int64

## Create tag label

In [12]:
# new train, test folder
os.mkdir('../data/train_eda')
os.mkdir('../data/test_eda')

In [13]:
# train label
for i in df_train['Annotation tag'].value_counts().index:
    os.mkdir(f'../data/train_eda/{i}')

In [14]:
# test label 
for i in df_test['Annotation tag'].value_counts().index:
    os.mkdir(f'../data/test_eda/{i}')

In [17]:
def get_annotation(df,folder,annotation):
    # list_tag: store image name (no directory)
    # i is tag name: ex 'go', 'stop'
    list_tag = [i.split('/')[1] for i in df.loc[df['Annotation tag'] == annotation]['Filename']]
    list_tag_set = set(list_tag)
    list_tag = list(list_tag_set)
    print (i)
    print (list_tag[:20])
    
    # the new directory we move image into
    new_dir = f'../data/{folder}_eda/{annotation}/'
    
    # the old directory we move image from
    img_dir = f'../data/{folder}/'
    for img in list_tag:
            src = os.path.join(img_dir, img)
            dst = os.path.join(new_dir, img)
            shutil.copyfile(src,dst)

In [18]:
# train
for i in df_train['Annotation tag'].value_counts().index:
    get_annotation(df_train, 'train', i)

go
['dayClip11--00250.jpg', 'dayClip7--01900.jpg', 'dayClip13--00654.jpg', 'dayClip1--01684.jpg', 'dayClip5--01650.jpg', 'dayClip1--01628.jpg', 'dayClip5--02215.jpg', 'nightClip2--00351.jpg', 'nightClip2--00185.jpg', 'nightClip2--01086.jpg', 'dayClip7--01760.jpg', 'nightClip5--00318.jpg', 'dayClip7--01953.jpg', 'dayClip1--01881.jpg', 'nightClip4--00149.jpg', 'dayClip7--01874.jpg', 'dayClip3--00592.jpg', 'dayClip10--00001.jpg', 'dayClip1--00177.jpg', 'nightClip2--00024.jpg']
stop
['dayClip7--00225.jpg', 'dayClip1--00410.jpg', 'dayClip6--00174.jpg', 'dayClip7--00244.jpg', 'dayClip2--00178.jpg', 'nightClip4--00613.jpg', 'dayClip5--00478.jpg', 'dayClip13--00509.jpg', 'nightClip4--00751.jpg', 'dayClip7--01428.jpg', 'dayClip1--00572.jpg', 'dayClip7--01246.jpg', 'dayClip13--00362.jpg', 'dayClip1--00299.jpg', 'nightClip4--00825.jpg', 'dayClip7--00682.jpg', 'dayClip1--01271.jpg', 'dayClip8--00129.jpg', 'dayClip7--01499.jpg', 'dayClip1--00321.jpg']
stopLeft
['dayClip7--00225.jpg', 'dayClip6--001

In [19]:
# test
for i in df_test['Annotation tag'].value_counts().index:
    get_annotation(df_test, 'test', i)

stop
['nightSequence1--00648.jpg', 'nightSequence2--02054.jpg', 'nightSequence2--00585.jpg', 'nightSequence1--03346.jpg', 'nightSequence2--00221.jpg', 'nightSequence1--01084.jpg', 'nightSequence2--00129.jpg', 'nightSequence2--01749.jpg', 'nightSequence2--05175.jpg', 'nightSequence1--00554.jpg', 'daySequence1--00108.jpg', 'daySequence1--03489.jpg', 'daySequence1--00829.jpg', 'daySequence1--00183.jpg', 'nightSequence1--01344.jpg', 'nightSequence1--00375.jpg', 'daySequence1--02164.jpg', 'nightSequence2--00130.jpg', 'daySequence2--05734.jpg', 'daySequence2--03792.jpg']
go
['daySequence1--03052.jpg', 'nightSequence1--04480.jpg', 'daySequence2--02653.jpg', 'nightSequence2--02054.jpg', 'nightSequence2--00585.jpg', 'nightSequence2--02380.jpg', 'nightSequence1--00554.jpg', 'nightSequence2--04382.jpg', 'daySequence1--03400.jpg', 'daySequence1--02936.jpg', 'nightSequence2--05284.jpg', 'nightSequence1--01911.jpg', 'nightSequence1--02524.jpg', 'nightSequence1--04379.jpg', 'nightSequence1--02036.jpg

## remove train, test folder

In [20]:
shutil.rmtree('../data/train')
shutil.rmtree('../data/test')