In [1]:
import os
import json
import pandas as pd
import requests
import tensorflow as tf

## Read in Vocab csv and get labels
In the video-level features dataset, each video's labels are stored as a list of numeric indices.  
http://research.google.com/youtube8m/download.html

In [2]:
# Use the vocabulary's "Index" column as the DataFrame index so that a label's
# numeric index can be read straight from the index of its row
label_df = pd.read_csv('../data/youtube8m/vocabulary.csv', index_col="Index")
label_df

Unnamed: 0_level_0,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,788288,/m/03bt1gh,Game,https://en.wikipedia.org/wiki/Game,Games,,,"A game is structured form of play, usually und..."
1,539945,/m/01mw1,Video game,https://en.wikipedia.org/wiki/Video_game,Games,,,A video game is an electronic game that involv...
2,415890,/m/07yv9,Vehicle,https://en.wikipedia.org/wiki/Vehicle,Autos & Vehicles,,,A vehicle is a mobile machine that transports ...
3,378135,/m/01jddz,Concert,https://en.wikipedia.org/wiki/Concert,Arts & Entertainment,,,A concert is a live music performance in front...
4,286532,/m/09jwl,Musician,https://en.wikipedia.org/wiki/Musician,Arts & Entertainment,,,A musician is a person who plays a musical ins...
...,...,...,...,...,...,...,...,...
3805,131,/m/02_6gvj,Uncharted: Drake's Fortune,https://en.wikipedia.org/wiki/Uncharted:_Drake...,Games,,,Uncharted: Drake's Fortune is a 2007 action-ad...
3854,130,/m/02kh4w,Injury,https://en.wikipedia.org/wiki/Injury,Health,,,Injury is damage to the body. This may be caus...
3824,130,/m/01hbjs,Look-alike,https://en.wikipedia.org/wiki/Look-alike,(Unknown),,,"A look-alike, or double, is a person who close..."
3844,127,/m/01vzvy,Mortar (masonry),https://en.wikipedia.org/wiki/Mortar_(masonry),Business & Industrial,,,Mortar is a workable paste used to bind buildi...


#### Create label mapping
Create a dictionary that maps each final label to the list of existing vocabulary label names it covers.

In [3]:
# Final label -> vocabulary label names that count towards that final label
labels = {
    'indoors': [
        'Bedroom', 'Bathroom', 'Classroom', 'Office',
        'Kitchen', 'Living room', 'Dining room', 'Room',
    ],
    'outdoors': [
        'Landscape', 'Skyscraper', 'City', 'Mountain',
        'Beach', 'National park', 'Outdoor recreation', 'Farm',
    ],
}

The tfrecords store labels as vocabulary indices, so the label names above need to be converted to their indices.

In [4]:
# For every final label, swap each vocabulary name for the index of the first
# row of label_df whose 'Name' matches it
label_map = {
    final_label: [label_df.index[label_df['Name'] == name][0] for name in names]
    for final_label, names in labels.items()
}

label_map

{'indoors': [416, 514, 1245, 3501, 307, 380, 614, 184],
 'outdoors': [2211, 2105, 1226, 666, 248, 1238, 60, 262]}

## Read in and get needed info from downloaded TFRecords
Each video-level record has an id (which needs to be translated to a YouTube video id)  
and a list of label indices that we use to assign our classification labels  
Already downloaded records from the instructions provided here: http://research.google.com/youtube8m/download.html  
Downloaded 1/20th of data for initial run  
`curl data.yt8m.org/download.py | shard=1,20 partition=2/video/train mirror=us python`

In [5]:
# Directory that get_tfrecords() is pointed at to find the downloaded .tfrecord files
YT8M_DIRECTORY = '../data/youtube8m/'

In [6]:
def get_tfrecords(directory):
    '''
    List the .tfrecord entries found directly inside a directory

    Parameters
    ----------
    directory : str
        Directory to scan (sub-directories are not searched)

    Returns
    -------
    list of str
        `directory` joined with every entry name ending in '.tfrecord',
        sorted by entry name
    '''
    # os.listdir returns entries in arbitrary order; sorting makes the order in
    # which records are read the same from run to run
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith('.tfrecord')
    ]

How many files did we snag?

In [7]:
# Number of .tfrecord entries found in the download directory
len(get_tfrecords(YT8M_DIRECTORY))

192

In [8]:
# One dataset over every .tfrecord file found; its elements are still
# serialized and are parsed into tf.train.Example further down
raw_dataset = tf.data.TFRecordDataset(get_tfrecords(YT8M_DIRECTORY))

Look at what a single record looks like

In [9]:
# Peek at the first serialized record only: parse it and show both its
# contents and the type of the parsed object
for first_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(first_record.numpy())
    print(example, type(example), sep="\n")

features {
  feature {
    key: "id"
    value {
      bytes_list {
        value: "eACj"
      }
    }
  }
  feature {
    key: "labels"
    value {
      int64_list {
        value: 180
        value: 304
      }
    }
  }
  feature {
    key: "mean_audio"
    value {
      float_list {
        value: -1.5312055349349976
        value: -1.0285152196884155
        value: 0.15257614850997925
        value: -1.3953794240951538
        value: -0.5539141893386841
        value: 1.0660279989242554
        value: -1.8354463577270508
        value: 0.3552817106246948
        value: -0.7087097764015198
        value: 0.95269775390625
        value: -0.3510870337486267
        value: -1.0913819074630737
        value: -0.43328797817230225
        value: -0.13257357478141785
        value: 0.9500225782394409
        value: 1.6974917650222778
        value: 1.8891319036483765
        value: -0.3803924024105072
        value: -1.9713940620422363
        value: 1.7584128379821777
        value: -0

In [10]:
def get_target(label_map, label_list):
    '''
    Return every final label whose index list shares at least one index
    with the record's label list, in label_map order
    '''
    return [
        final_label
        for final_label, indices in label_map.items()
        if not set(label_list).isdisjoint(indices)
    ]

In [14]:
def yt8m_id_translate(yt8m_id, timeout=10):
    '''
    Translate a YouTube-8M record id into the corresponding YouTube video id

    Requests `<base_url><first two characters of the id>/<id>.js` and pulls the
    video id out of the response text.

    Parameters
    ----------
    yt8m_id : str
        Id taken from the "id" feature of a record
    timeout : float, optional
        Seconds to wait for the lookup service before requests raises a
        timeout error (default 10)

    Returns
    -------
    str or None
        The translated id, or None (after printing "No such video ID") when
        the service answers with its access-denied message
    '''
    base_url = 'http://data.yt8m.org/2/j/i/'
    url = base_url + f"{yt8m_id[:2]}/{yt8m_id}.js"
    # Without a timeout a single stalled connection would block forever, which
    # matters when this is called once per record in a long loop
    r = requests.get(url=url, timeout=timeout)
    if "Anonymous caller does not have storage.objects.get access" in r.text:
        print("No such video ID")
        return None

    # The id is the second comma-separated field of the response, with the
    # trailing ')' / ';' characters and the surrounding double quotes removed
    return r.text.split(",")[1].strip(");").strip('"')

# Unit Test yt8m_id_translate()
# NOTE: both checks call the live lookup service, so they need network access.
# The second one exercises the unknown-id path, which prints "No such video ID".
assert (yt8m_id_translate("nXSc") == "0sf943sWZls")
assert (yt8m_id_translate("foob") is None)

No such video ID


In [12]:
def transform_tfrecords(tf_record):
    '''
    Reduce a parsed record to the fields needed for indoor/outdoor labelling

    Parameters
    ----------
    tf_record : tf.train.Example
        Parsed record; its "id" bytes feature and "labels" int64 feature are read

    Returns
    -------
    dict or None
        Dict with the keys 'yt8m_id', 'labels', 'target' and 'yt_id'.
        None when the record's labels do not match exactly one target in
        label_map, or when the id cannot be translated.
    '''
    yt8m_id = tf_record.features.feature["id"].bytes_list.value[0].decode('utf-8')
    record_labels = list(tf_record.features.feature['labels'].int64_list.value)

    targets = get_target(label_map, record_labels)

    # Only keep items that have a single matching target
    if len(targets) != 1:
        return None

    # Translate the id once and reuse the result: every call is an HTTP request
    yt_id = yt8m_id_translate(yt8m_id)
    if yt_id is None:
        return None

    return {
        'yt8m_id': yt8m_id,
        'labels': record_labels,
        'target': targets[0],
        'yt_id': yt_id,
    }

### Read in tfrecord
As described at https://www.tensorflow.org/tutorials/load_data/tfrecord

In [13]:
# Collect the kept records in a plain list and build the DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
records = []

count = 0
for raw_record in raw_dataset:
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())

    output = transform_tfrecords(example)
    if output is not None:
        records.append(output)

    count += 1
    if count%1000 == 0:
        print(f"{count} records processed")

# Explicit columns keep the column order shown in this notebook's outputs and
# make sure the columns exist even if no record was kept
df = pd.DataFrame(records, columns=['labels', 'target', 'yt8m_id', 'yt_id'])

No such video ID
1000 records processed
2000 records processed
3000 records processed
4000 records processed
5000 records processed
6000 records processed
No such video ID
7000 records processed
8000 records processed
9000 records processed
10000 records processed
11000 records processed
12000 records processed
13000 records processed
14000 records processed
15000 records processed
No such video ID
16000 records processed
17000 records processed
No such video ID
18000 records processed
No such video ID
19000 records processed
20000 records processed
21000 records processed
22000 records processed
23000 records processed
24000 records processed
No such video ID
25000 records processed
26000 records processed
27000 records processed
28000 records processed
29000 records processed
30000 records processed
No such video ID
31000 records processed
32000 records processed
33000 records processed
No such video ID
34000 records processed
No such video ID
No such video ID
35000 records processed

#### Check our work

In [14]:
# Display the records that were kept, one row per video
df

Unnamed: 0,labels,target,yt8m_id,yt_id
0,"[88, 184, 420, 514]",indoors,k5a7,LLf2i_EK6RU
1,"[15, 18, 60, 87, 545, 1696]",outdoors,Nda7,bkK-qbwAgJo
2,"[511, 514, 1148, 2277, 2405]",indoors,Dca7,jsFiMne0Eyk
3,[60],outdoors,OEa7,p4Ub6lYdK38
4,"[11, 20, 22, 29, 176, 307, 2561]",indoors,EMa7,KW2WY3GIpt4
...,...,...,...,...
2891,"[11, 262, 370]",outdoors,sa9Y,RF_c2Zp5AhU
2892,"[184, 191, 985]",indoors,jD9Y,3GHSzGIqgTg
2893,"[60, 210, 685]",outdoors,lV9Y,xK2AdjZBNoA
2894,"[60, 2701]",outdoors,F99Y,6w6VXsszoak


#### Check target counts
Indoors has about one third as many examples as outdoors (705 vs. 2191). Not bad, but we'll normalize the class balance when splitting

In [15]:
# Number of kept records per target (count of entries in each other column)
df.groupby("target").count()

Unnamed: 0_level_0,labels,yt8m_id,yt_id
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
indoors,705,705,705
outdoors,2191,2191,2191


Write out results to split and get frames later

In [16]:
# Save the labelled ids as CSV for the later splitting and frame-extraction steps
df.to_csv('../data/yt8m_indoor_outdoor_labels_id.csv')