# Recovery of files from vox-grn
During investigation of the created database it was discovered that many of the files missing from the programs were present in the vox-grn files.
The purpose of this notebook is to recover those files.

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import pickle as pkl
from pathlib import Path
import time
import glob
import json
import requests

## Gather Data
First read in the descriptors of vox-grn and our created data.


In [10]:
# Load the item descriptions, discard the columns that are not needed and
# give the remaining ones python attribute names.
unwanted_columns = ['Unnamed: 0', 'LanguageID', 'Language', 'Track', 'Recordist', 'Size', 'Length', 'Tape Side', 'Item Type', 'composite']
attribute_names = {
    'ISO': 'iso',
    'Location': 'location',
    'Year': 'year',
    'Path': 'path',
    'Filename': 'filename',
    'Title': 'title',
    'Program Item Number': 'item_no',
    'Program': 'program',
}
items = (
    pd.read_csv('/home/jovyan/work/GRN-Notebooks/Data/all_items_for_processing.csv')
    .drop(columns=unwanted_columns)
    .rename(columns=attribute_names)
)

# The existing ID is based on the track number; build one from the item number instead:
# <program>_<item number zero-padded to three digits>.
item_suffix = items['item_no'].astype(int).apply('{:0>3d}'.format)
items['ID'] = items['program'] + '_' + item_suffix

items = items.set_index('ID')
items.columns


Index(['iso', 'program', 'location', 'year', 'path', 'filename', 'item_no',
       'title', 'start', 'end'],
      dtype='object')

In [12]:
# Load the recording file descriptors and the program items, then report their sizes.
# NOTE: this rebinds `items`, replacing the frame built from all_items_for_processing.csv.
file_descriptors = pd.read_csv("/prometheus/GRN/recording_files_with_tags_and_track.csv")
items = pd.read_csv("/prometheus/GRN/grid_program_items.csv")

print(f'File descriptors shape: {file_descriptors.shape}')
print(f'Program items shape: {items.shape}')


File descriptors shape: (210704, 12)
Program items shape: (267681, 21)


Now find the files that are listed in the file descriptors but are missing from disk.

In [4]:
def check_for_file(item_row):
    """Report whether the file described by a descriptor row is present on disk.

    The row's 'Path' and 'Filename' values are concatenated as-is (no separator
    is inserted between them) and looked up under '/media/programs/'.

    Parameters
    ----------
    item_row : mapping or pandas.Series
        Must provide string values under the keys 'Path' and 'Filename'.

    Returns
    -------
    bool
        The result of os.path.isfile for the assembled path.
    """
    relative_location = item_row['Path'] + item_row['Filename']
    candidate = '/media/programs/' + relative_location
    return os.path.isfile(candidate)

# Evaluate check_for_file once per descriptor row and store the outcome alongside the row.
exists_flags = file_descriptors.apply(check_for_file, axis=1)
file_descriptors['file exists'] = exists_flags
print(f'Files verified as existing for {sum(file_descriptors["file exists"])} out of {file_descriptors.shape[0]} records.')

Files verified as existing for 203879 out of 210704 records.


In [5]:
# Keep only the descriptors whose file was not found. The copy makes missing_files an
# independent frame, so columns added to it later do not touch file_descriptors.
is_missing = ~file_descriptors['file exists']
missing_files = file_descriptors.loc[is_missing].copy()

Read in the vox-grn data.

In [6]:

# create a dataframe using a generator
def gen_vox_grn(timeout=60):
    """Yield one DataFrame per language from the published vox-grn index.

    Downloads vox-grn.json from GitHub. Each top-level key of the JSON document
    is used as the ISO code: its value is flattened with pd.json_normalize and
    the resulting rows are tagged with an 'iso' column holding that key.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        requests can block indefinitely on an unresponsive connection.

    Yields
    ------
    pandas.DataFrame
        The flattened records of one language, with an added 'iso' column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    resp = requests.get(
        'https://raw.githubusercontent.com/johno-source/vox-grn/main/data/vox-grn.json',
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of a confusing JSON decode error
    # when the response body is an error page.
    resp.raise_for_status()
    vox_dict = json.loads(resp.text)
    for iso, records in vox_dict.items():
        lang_df = pd.json_normalize(records)
        lang_df['iso'] = iso
        yield lang_df

# Stack the per-language frames produced by the generator into a single table.
per_language_frames = gen_vox_grn()
vox_df = pd.concat(per_language_frames)

In [7]:
# pull out the program ID
vox_df['program'] = vox_df['file'].str.extract('\./Audio_MP3/[0-9]{2}/([0-9]{5})')
missing_files['prog'] = missing_files['Program'].str.extract('[AC]([0-9]{5})')
missing_files['found_prog'] = missing_files['prog'].isin(vox_df['program'])
found_files = missing_files[missing_files['found_prog']]
print(len(found_files))

6604


### File Recovery from vox-grn

First of all we currently have the missing files, but what we really need is the subset of vox_df that we want to load.

In [8]:
# Flag the vox-grn rows whose program number belongs to one of the missing files.
# dropna(): Series.isin treats NaN as matching NaN, so without it every vox-grn row with
# no extracted program number would be selected as soon as any missing file also lacks one.
vox_df['recovery_candidate'] = vox_df['program'].isin(missing_files['prog'].dropna())
vox_files = vox_df[vox_df['recovery_candidate']]
print(f'Recovery files: {len(vox_files)}')

Recovery files: 7015


Now set up access to AWS S3.

In [10]:
import logging
import boto3
from botocore.exceptions import ClientError
import getpass

In [11]:
# Prompt for the AWS credentials without echoing them, first the key and then the secret,
# so they never appear in the notebook source or its saved output.
access_key, secret = (getpass.getpass(prompt) for prompt in ('key', 'secret'))

In [12]:
# Create an S3 client for the us-east-1 region, authenticated with the
# access key and secret held in `access_key` and `secret`.
session = boto3.session.Session()
client = session.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret,
)

In [13]:
def extract_vox_grn_key(vox_grn_dir):
    """Turn a vox-grn relative file path into the corresponding object key.

    A leading './' is removed. A path that does not start with './' is returned
    unchanged; slicing off the first two characters unconditionally would
    silently corrupt such a key.

    Parameters
    ----------
    vox_grn_dir : str
        File path as listed in vox-grn, e.g. './Audio_MP3/01/name.mp3'.

    Returns
    -------
    str
        The path without the leading './'.
    """
    prefix = './'
    if vox_grn_dir.startswith(prefix):
        return vox_grn_dir[len(prefix):]
    return vox_grn_dir

def prepare_vox_grn_file(vox_grn_key):
    """Work out the local destination for a key and make sure its directory exists.

    A key containing at least one '/' is placed under '/media/programs/vox_grn/',
    mirroring the key's directory part; that directory is created (including
    parents) when absent. A key without any '/' is mapped to the current
    directory and nothing is created.

    Parameters
    ----------
    vox_grn_key : str
        Object key, with '/' separating its path components.

    Returns
    -------
    str
        The local file path for the key.
    """
    parent_key, separator, leaf_name = vox_grn_key.rpartition('/')
    if not separator:
        # Single component: no directory to prepare.
        return './' + leaf_name

    target_dir = '/media/programs/vox_grn/' + parent_key
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    return target_dir + '/' + leaf_name

Now iterate through all the files that could be recovered and download them from the Amazon S3 bucket.

In [14]:
# now iterate over all the candidates and download them if possible
start_time = time.time()
total = len(vox_files)
failed_keys = []  # keys that could not be downloaded, kept for inspection after the run
count = 0
for count, candidate in enumerate(vox_files.itertuples(), start=1):
    vox_grn_key = extract_vox_grn_key(candidate.file)
    filename = prepare_vox_grn_file(vox_grn_key)
    # Files already on disk are skipped, so re-running this cell resumes an interrupted run.
    if not os.path.exists(filename):
        try:
            client.download_file('grn-media', vox_grn_key, filename)
        except ClientError as err:
            # A single failing object must not abort a run that takes hours;
            # record it and carry on with the remaining candidates.
            logging.error('Could not download %s: %s', vox_grn_key, err)
            failed_keys.append(vox_grn_key)
    if count % 10 == 0:
        print('.', end='')
    if count % 100 == 0:
        print(f'{count} out of {total} completed in {time.time() - start_time} seconds.')

if failed_keys:
    print(f'\n{len(failed_keys)} of {total} downloads failed; the keys are listed in failed_keys.')


..........100 out of 7015 completed in 444.0606791973114 seconds.
..........200 out of 7015 completed in 2105.9120016098022 seconds.
..........300 out of 7015 completed in 3766.7452392578125 seconds.
..........400 out of 7015 completed in 4903.281210899353 seconds.
..........500 out of 7015 completed in 6172.015818119049 seconds.
..........600 out of 7015 completed in 7420.355749607086 seconds.
..........700 out of 7015 completed in 8376.8984105587 seconds.
..........800 out of 7015 completed in 9615.405744552612 seconds.
..........900 out of 7015 completed in 10789.719019889832 seconds.
..........1000 out of 7015 completed in 11266.099214792252 seconds.
..........1100 out of 7015 completed in 12378.761924266815 seconds.
..........1200 out of 7015 completed in 13178.961907863617 seconds.
..........1300 out of 7015 completed in 13633.576668262482 seconds.
..........1400 out of 7015 completed in 14511.333701372147 seconds.
..........1500 out of 7015 completed in 15624.468387842178 second

OK - Now that the files have been recovered we need to put them back into the file descriptors. 

This will be done in FileDescriptorsFromVoxGRN.ipynb.
