# Investigation
I wonder where the other languages have gone? In the original items there were how many languages? Why do some languages have so few segments?

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import pickle as pkl
from pathlib import Path
import time
import glob
import json
import requests
import math



## Number of Languages
First lets answer the question of where have all the languages gone.

How many languages were there in the original input file? 3267 according to the analysis below. This is considerably less than the number of languages we believe are in the data set (~4500). Where have the other languages gone?

In [2]:
# Now read in the description of the input and remove the unwanted columns and rename the rest to be python attribute names.
orig_items = pd.read_csv('/home/jovyan/work/GRN-Notebooks/Data/all_items_for_processing.csv')
orig_items = orig_items.drop(columns=['Unnamed: 0', 'LanguageID', 'Language', 'Track', 'Recordist', 'Size', 'Length', 'Tape Side', 'Item Type', 'composite'])
orig_items.rename(inplace=True, columns={ 'ISO' : 'iso', 'Location' : 'location', 'Year' : 'year', 'Path' : 'path', 'Filename' : 'filename', 'Title' : 'title', 'Program Item Number' : 'item_no', 'Program' : 'program' })

# the existing ID is based on track number rather than item number. We want to use the item number.
orig_items['ID'] = orig_items['program'] + '_' + orig_items['item_no'].astype(int).apply('{:0>3d}'.format)

orig_items.set_index('ID', inplace=True)
orig_items.columns


Index(['iso', 'program', 'location', 'year', 'path', 'filename', 'item_no',
       'title', 'start', 'end'],
      dtype='object')

How many languages in this set?

In [3]:
langs1 = orig_items['iso'].value_counts()
len(langs1)

3267

So there were only 3267 languages in the original set. Is this because some languages did not have there item type labelled?

Go back to the way the data was formed and see what is happening to the languages.

In [4]:
# read the data into a pandas data frame
file_descriptors = pd.read_csv("/prometheus/GRN/recording_files_with_tags_and_track.csv")
print(f'File descriptors shape: {file_descriptors.shape}')
items = pd.read_csv("/prometheus/GRN/grid_program_items.csv")
print(f'Program items shape: {items.shape}')


File descriptors shape: (210704, 12)
Program items shape: (267681, 21)


In [5]:
print(f'File descriptors columns include:\n{list(file_descriptors.columns)}')
print("-"*30)
print(f'Program Items columns include:\n{list(items.columns)}')


File descriptors columns include:
['LanguageID', 'ISO', 'Language', 'Program', 'Track', 'Recordist', 'Location', 'Year', 'Path', 'Filename', 'Size', 'Length']
------------------------------
Program Items columns include:
['Program Number', 'Program Item Number', 'Tape Side', 'Track Number', 'Original Recording Number', 'Original Item Number', 'Title', 'Vernacular Item Title', 'Language Number', 'Language Name', 'Item Start Time', 'Item Time', 'Finish Time', 'Original Time', 'Script Number', 'Script Name', 'Picture Number', 'Item Type', 'Comments', 'Entered By', 'Enter On Date']


So how many iso codes in the file descriptors? And how many language numbers in the items?

In [6]:
print(f'Number of iso codes in file descriptors: {len(file_descriptors["ISO"].value_counts())}')
print(f'Number of language numbers in file descriptors: {len(file_descriptors["LanguageID"].value_counts())}')
print(f'Number of language names in file descriptors: {len(file_descriptors["Language"].value_counts())}')
print(f'Number of file descriptors without an ISO code: {sum(file_descriptors["ISO"].isna())}')
print(f'Number of language numbers in program items: {len(items["Language Number"].value_counts())}')
print(f'Number of language names in program items: {len(items["Language Name"].value_counts())}')

Number of iso codes in file descriptors: 3972
Number of language numbers in file descriptors: 6148
Number of language names in file descriptors: 6130
Number of file descriptors without an ISO code: 946
Number of language numbers in program items: 6461
Number of language names in program items: 6444


In [7]:
# give the items an ID
items['ID'] = items['Program Number'] + '_' + items['Track Number'].astype(int).apply('{:0>3d}'.format)


This means the file descriptors have 800 more languages than we used for analysis. Why were the files dropped? We need to look at this.

Furthermore we have almost 1000 file descriptors without an ISO code. Can we recover an ISO code from the language number?

Also some records pointed to non-existent files -  and were dropped. How many of these were there? Did the file really not exists?

## 1. How many records had missing files?

In [8]:
def check_for_file(item_row):
    return os.path.isfile('/media/programs/' + item_row['Path'] + item_row['Filename'] )

file_descriptors['file exists'] = file_descriptors.apply(check_for_file, axis=1)
print(f'Files verified as existing for {sum(file_descriptors["file exists"])} out of {file_descriptors.shape[0]} records.')

Files verified as existing for 203879 out of 210704 records.


In [9]:
missing_files = file_descriptors[file_descriptors['file exists'] == False].copy()

How many languages are in these missing files?

In [10]:
print(f'Number of iso codes in descriptors with missing files: {len(missing_files["ISO"].value_counts())}')


Number of iso codes in descriptors with missing files: 1400


If we could recover these files how many languages would we recover?

In [11]:
existent_files = file_descriptors[file_descriptors['file exists']]
existing_set = set(existent_files['ISO'])
missing_set = set(missing_files['ISO'])
overlapping_set = existing_set.intersection(missing_set)
print(f'Number of ISO codes that overlap with existing data: {len(overlapping_set)}')
print(f'Number of ISO codes not in existing data: {len(missing_set)-len(overlapping_set)}')

Number of ISO codes that overlap with existing data: 778
Number of ISO codes not in existing data: 623


So they are very worthwhile recovering: 778 with more data for existing languages and 623 new languages.

Lets try and find the files.

Are the missing files located in vox-grn?

In [12]:

# create a dataframe using a generator
def gen_vox_grn():
  resp = requests.get('https://raw.githubusercontent.com/johno-source/vox-grn/main/data/vox-grn.json')
  vox_dict = json.loads(resp.text)
  for iso in vox_dict.keys():
    lang_df = pd.json_normalize(vox_dict[iso])
    lang_df['iso'] = iso
    yield lang_df

vox_df = pd.concat(gen_vox_grn())

In [13]:
# pull out the program ID
vox_df['program'] = vox_df['file'].str.extract('\./Audio_MP3/[0-9]{2}/([0-9]{5})')
vox_df['item'] = vox_df['file'].str.extract('\./Audio_MP3/[0-9]{2}/[0-9]{5}/.*?(0[0-9]{2})')

In [14]:
missing_files['prog'] = missing_files['Program'].str.extract('[AC]([0-9]{5})')

In [15]:
missing_files['found_prog'] = missing_files['prog'].isin(vox_df['program'])
found_files = missing_files[missing_files['found_prog']]
print(len(found_files))

6604


which means that all but 221 files can potentially be recovered.

Let's try and bring them back. See RecoverFilesFromVoxGRN.ipynb for this effort.



In [16]:
# the number of languages recovered
print(f'Number of iso codes in found files: {len(found_files["ISO"].value_counts())}')
found_set = set(found_files['ISO'])
overlapping_set = existing_set.intersection(found_set)
print(f'Number that overlap {len(overlapping_set)}')

Number of iso codes in found files: 1394
Number that overlap 772


So that gives over 600 more languages and it gives more data to almost 800 other languages.

## 2 Files without an ISO code
First identify the files that did not have an iso code. Do they have a language id?

In [17]:
missing_iso = file_descriptors[file_descriptors.ISO.isna()]
print(f'Number of files missing an iso code and a language number {sum(missing_iso["LanguageID"].isna())}')

Number of files missing an iso code and a language number 0


OK - Can we generate a map of language ID to ISO code?

In [18]:
lang_id_to_iso = {}
for file in file_descriptors.itertuples():
    if isinstance(file.ISO, str):
        if file.LanguageID in lang_id_to_iso:
            if file.ISO != lang_id_to_iso[file.LanguageID]:
                print(f'{file.LanguageID} is mapped to {file.ISO} and {lang_id_to_iso[file.LanguageID]}')
        else:
            lang_id_to_iso[file.LanguageID] = file.ISO

Now can we use this map to restore the ISO code for the files that have it missing?

In [19]:
recovered_iso = 0
unrecovered_iso = 0
unrecovered_lang_ids = set()
for file in missing_iso.itertuples():
    if file.LanguageID in lang_id_to_iso:
        recovered_iso += 1
    else:
        unrecovered_iso += 1
        unrecovered_lang_ids.add(file.LanguageID)

print(f'Number of files with ISO recovered: {recovered_iso}')
print(f'Number of files with ISO not recovered: {unrecovered_iso}')
print(f'Number of Language IDs with an unknown ISO code {len(unrecovered_lang_ids)}')

Number of files with ISO recovered: 0
Number of files with ISO not recovered: 946
Number of Language IDs with an unknown ISO code 69


From looking at the GRN database and the internet it would appear that these 69 languages are not ISO languages.

Do these programs exist in vox-grn?

In [20]:
missing_iso = missing_iso.copy()
missing_iso['prog'] = missing_iso['Program'].str.extract('[AC]([0-9]{5})')
missing_iso['in_vox_grn'] = missing_iso['prog'].isin(vox_df['program'])
found_iso = missing_iso[missing_iso['in_vox_grn']].copy()
print(f'Number of missing ISO programs in vox-grn {len(found_iso)}')

Number of missing ISO programs in vox-grn 11


And these all belong to one language which has an ISO code of nan - which has tripped up python.

### Conclusion
The files with missing ISO codes really do not have an ISO code. They should just be dropped from the study.

## Items dropped based on type.
Now the program items were filtered based on type. Lets look at their type to see what was excluded.

In [21]:
item_types = items['Item Type'].value_counts()
print(item_types)

Message                 212534
Song                     24371
Scripture                14013
Message & Song            6085
Instrumental              4233
Message/Instrumental      2940
Announcement               968
Bridge                     960
Chorus from                610
Testimony                  545
Scripture Stories          181
Song and Scripture          90
Undefined                   77
Sound Effect                63
Poem                        11
Name: Item Type, dtype: int64


and we kept 'Message', 'Scripture', 'Scripture Stories', 'Testimony' which is probably reasonable.

After we filtered out the items how many languages did we have?

In [22]:
def usable_types(item_row):
    unusable_items = ['Instrumental', 'Sound Effect', 'Announcement', 'Bridge']
    return item_row['Item Type'] not in unusable_items

items['usable'] = items.apply(usable_types, axis=1)
usable_items = items[items["usable"]].copy()

usable_items.drop(['usable'], inplace=True, axis=1)
print(f'There are {usable_items.shape[0]} usable items out of {items.shape[0]} total items.')

There are 261457 usable items out of 267681 total items.


In [23]:
print(f'Number of language numbers in usable items: {len(usable_items["Language Number"].value_counts())}')
print(f'Number of language names in usable items: {len(usable_items["Language Name"].value_counts())}')
print(f'Number of usable items: {len(usable_items)}')

Number of language numbers in usable items: 6461
Number of language names in usable items: 6444
Number of usable items: 261457


So we only lost about 100 languages by doing this filtering. 


Lets take a different tack. How many programs and items in Rob's code are not found in our data? Note that Rob's data did NOT accurately give the item numbers.

## Languages with few segments
Some languages had very few segments associated with them. Lets find out why. I have two theories to test:
1. The item times meant that data was missed
2. The vad failed to recognise voice in the language.

Lets investigate cjm, nyq, gaw, jeh, hac

In [24]:
# read in the seg_4_df file
seg_4_df = pd.read_csv("../../data/seg_4_df.csv")
print(f'The columns of 4 second segments are:\n{seg_4_df.columns}')


The columns of 4 second segments are:
Index(['file_name', 'Index', 'iso', 'program', 'location', 'year', 'path',
       'filename', 'item_no', 'title', 'start', 'end', 'seg_start', 'seg_stop',
       'seg'],
      dtype='object')


In [25]:
langs = seg_4_df['iso'].value_counts()

How many files did cjm have?

In [26]:
print(f'Number of cjm files: {sum(file_descriptors.ISO == "cjm")}')

Number of cjm files: 43


So how did 43 files finish up having only one segment? Lets listen to the files.

In [27]:
cjm_files = file_descriptors[file_descriptors.ISO == 'cjm'].copy()
print(cjm_files.Filename)

94                      C03180A.mp3
95                      C03180B.mp3
97831    A66147-01-Introduction.wav
97832          A66147-02-H�nh_1.wav
97833          A66147-03-H�nh_2.wav
97834          A66147-04-H�nh_3.wav
97835          A66147-05-H�nh_4.wav
97836          A66147-06-H�nh_5.wav
97837          A66147-07-H�nh_6.wav
97838          A66147-08-H�nh_7.wav
97839          A66147-09-H�nh_8.wav
97840          A66147-10-H�nh_9.wav
97841         A66147-11-H�nh_10.wav
97842         A66147-12-H�nh_11.wav
97843         A66147-13-H�nh_12.wav
97844         A66147-14-H�nh_13.wav
97845         A66147-15-H�nh_14.wav
97846         A66147-16-H�nh_15.wav
97847         A66147-17-H�nh_16.wav
97848         A66147-18-H�nh_17.wav
97849         A66147-19-H�nh_18.wav
97850         A66147-20-H�nh_19.wav
97851         A66147-21-H�nh_20.wav
97852         A66147-22-H�nh_21.wav
97853         A66147-23-H�nh_22.wav
97854         A66147-24-H�nh_23.wav
97855         A66147-25-H�nh_24.wav
97856         A66147-26-H�nh

So the files all look good. The language is quite abrupt. What about the item meta data?

In [28]:
cjm_items = orig_items[orig_items.program == 'A66147']
print(f'Number of cjm items: {len(cjm_items)}')

Number of cjm items: 1


Only one item -  where did the others go?

In [29]:
cjm_prog_items = items[items['Program Number'] == 'A66147']
print(f'Number of cjm items: {len(cjm_prog_items)}')


Number of cjm items: 41


Was the data in items_with_records?

In [30]:
items_with_records = pd.read_csv('../../data/items_with_records.csv')
print(f'Items with Records that have cjm language: {sum(items_with_records.ISO == "cjm")}')

Items with Records that have cjm language: 1


No. The records are lost before items_with_records is written.

Rerunning FormProgramTrackFrame.ipynb it can be seen that the file is lost because os.path.isfile cannot cope with the odd characters in the file name.

In [31]:
import os

def check_for_file(item_row):
    return os.path.isfile('/media/programs/' + item_row['Path'] + item_row['Filename'] )    

def check_for_foreign_file(item_row):
    # The issue is that some filenames contain the replacement character because the
    # filenames have unknonw foreign characters in them.
    path_to_file = '/media/programs/' + item_row['Path'] + item_row['Filename'].replace('ufffd', '*')
    print(f'Char at 60 {ord(path_to_file[60])}')
    return os.path.isfile(path_to_file)    

cjm_files['exists'] = cjm_files.apply(check_for_file, axis=1)
cjm_files['foreign exists'] = cjm_files.apply(check_for_foreign_file, axis=1)


Char at 60 65
Char at 60 66
Char at 60 110
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533
Char at 60 65533


In [32]:
import chardet
result = chardet.detect(cjm_files.iloc[3].Filename.encode())

encoding = result['encoding']
confidence = result['confidence']

print(f"Encoding: {encoding} with confidence {confidence}")

Encoding: ISO-8859-1 with confidence 0.73


In [33]:
files = os.listdir('/media/programs/Programs/66/66147/A66147/PM-1812')

In [34]:
result = chardet.detect(files[0].encode())

encoding = result['encoding']
confidence = result['confidence']

print(f"Encoding: {encoding} with confidence {confidence}")

Encoding: Windows-1254 with confidence 0.5343954060317209


    # Prevent this running. It causes an error.
    file = '/media/programs/' + cjm_files.iloc[3].Path + cjm_files.iloc[3].Filename
    file = file.encode('windows-1250')
    print(f'{file} exists is {os.path.exists(file)}')


In [35]:
print(f'{files[0]} exists is {os.path.exists(files[0])}')

A66147-15-Hnh_14.wav exists is False


In [36]:
bin_str = ' '.join(format(ord(byte), 'x') for byte in files[0])
print(bin_str)

41 36 36 31 34 37 2d 31 35 2d 48 8d 6e 68 5f 31 34 2e 77 61 76


In [37]:
# Define the byte sequence as a bytes object
byte_seq = b'\x41\x36\x36\x31\x34\x37\x2d\x31\x35\x2d\x48\x8d\x6e\x68\x5f\x31\x34\x2e\x77\x61\x76'

# Decode the byte sequence using the Windows-1250 encoding
filename = byte_seq.decode('windows-1250')

# Print the decoded filename
print(filename)

A66147-15-HŤnh_14.wav


    bin_str = ' '.join(format(ord(byte), 'x') for byte in file)
    print(bin_str)

Try doing a byte by byte comparison of the files string and the one gained from the meta data.

In [38]:
for meta_char, file_char in zip(cjm_files.iloc[16].Filename, files[0]):
    print(f'{format(ord(meta_char), "2x")} {format(ord(file_char), "x")} {file_char}')
                                

41 41 A
36 36 6
36 36 6
31 31 1
34 34 4
37 37 7
2d 2d -
31 31 1
35 35 5
2d 2d -
48 48 H
fffd 8d 
6e 6e n
68 68 h
5f 5f _
31 31 1
34 34 4
2e 2e .
77 77 w
61 61 a
76 76 v


So for the character in question: The meta data csv file contains the character sequence EF BF BD which is the UTF-8 code for the unicode replacement character which is inserted when an unknown character is found in a string. This character becomes FFFD when read into a dataframe, which is also the Unicode replacement character. The file contains 8D which is either a windows-1250 or 1251 encoding.

In [39]:
import glob

f = '/media/programs/' + cjm_files.iloc[3].Path + r'A66147-15-H*nh_14.wav'
#f = '/media/programs/' + cjm_files.iloc[3].Path + r'A66147*.wav'
print(f)
files = glob.glob(f)
for f in files:
    print(f'{f} exists {os.path.exists(f)}')

/media/programs/Programs/66/66147/A66147/PM-1812/A66147-15-H*nh_14.wav
/media/programs/Programs/66/66147/A66147/PM-1812/A66147-15-Hnh_14.wav exists True


In [40]:
import os
import glob

def check_for_file(item_row):
    return os.path.isfile('/media/programs/' + item_row['Path'] + item_row['Filename'] )    

def check_for_foreign_file(item_row):
    # The issue is that some filenames contain the replacement character because the
    # filenames have unknonw foreign characters in them.
    path_to_file = '/media/programs/' + item_row['Path'] + item_row['Filename'].replace('\ufffd', '*')
    print(f'Char at 60 {ord(path_to_file[60]):x}')
    files = glob.glob(path_to_file)
    return len(files) == 1    

cjm_files['exists'] = cjm_files.apply(check_for_file, axis=1)
cjm_files['foreign exists'] = cjm_files.apply(check_for_foreign_file, axis=1)


Char at 60 41
Char at 60 42
Char at 60 6e
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a
Char at 60 2a


OK - with this new knowledge how many files do we have missing from the original data?

In [41]:
def check_for_glob_file(item_row):
    path_to_file = '/media/programs/' + item_row['Path'] + item_row['Filename'].replace('\ufffd', '*')
    files = glob.glob(path_to_file)
    return len(files) == 1    

file_descriptors['file exists'] = file_descriptors.apply(check_for_glob_file, axis=1)
print(f'Files verified as existing for {sum(file_descriptors["file exists"])} out of {file_descriptors.shape[0]} records.')

Files verified as existing for 204436 out of 210704 records.


So this recovered another 600 or so files. Good. Now with the data from vox_grn how many do we have?

In [42]:
new_missing_files = file_descriptors[file_descriptors['file exists'] == False].copy()

In [43]:
new_missing_files['prog'] = new_missing_files['Program'].str.extract('[AC]([0-9]{5})')
new_missing_files['found_prog'] = new_missing_files['prog'].isin(vox_df['program'])
new_found_files = new_missing_files[new_missing_files['found_prog']]
print(f'{len(new_found_files)} found out of {len(new_missing_files)}')

6058 found out of 6268


So what is with the 210 missing files?

In [44]:
still_missing = new_missing_files[new_missing_files['found_prog'] == False]

Some(80) are copy_masters - which I do not have. 

Looking at some examples:

* A37731 - this exists on the disk but the sub dir is MM, not PM-0000 as in the meta data, and the files are mp3 and broken into items. vox_df does not have program 37731
* A19741 - only C19741 exists on the disk and that only contains gif files.
* A38090 - exists on the disk but the sub dir is PM-0000, not PM-2106 as in the meta data, and there is a wav file for every item.
* A38091 - exists on the disk but the sub dir is PM-0000, not PM-2107 as in the meta data, and there is a wav file for every item.
* A38101 - exists on the disk but the sub dir is PM-0000, not PM-2106 as in the neta data, and there is a wav file for every item.

In other words, these are genuine errors in the meta-data.

# Completeness of meta data
This section builds up a description of all the files on the disk and sees which ones are not known by the meta data.

In [45]:
def find_files(path):
    """
    Find all files in the directory tree rooted at `path`.
    """
    for root, _, files in os.walk(path):
        for file in files:
            if file.lower().endswith('.wav') or file.lower().endswith('.mp3'):
                yield root, file

files_in_dir = pd.DataFrame.from_records(find_files('/media/programs/Programs'), columns=['path', 'filename'])

So there are about 75000 more files on the disk than in the file meta data. Find out which ones.

In [46]:
files_in_dir['file_match'] = files_in_dir['filename'].isin(file_descriptors['Filename'])

In [47]:
files_not_in_meta = files_in_dir[files_in_dir['file_match'] == False]
print(len(files_not_in_meta))

63445


OK - so lets pull out the program and the track and see if they are in the item data.

In [48]:
f = files_not_in_meta.copy()
f['program'] = f['filename'].str.extract('([AC][0-9]{5})')


In [49]:
import re
def extract_track(s):
    tracks = re.search(r'[AC][0-9]{5}[AB]{,1}-([0-9]{1,3})', s)
    if tracks:
        return int(tracks.group(1))
    tracks = re.search(r'[AC]-{,1}[0-9]{5}([AB])', s)
    if tracks:
        return 1 if tracks.group(1) == 'A' else 2
    return 0

f['track'] = f['filename'].apply(extract_track)

In [50]:
f['ID'] = f['program'] + '_' + f['track'].astype(int).apply('{:0>3d}'.format)

In [51]:
f['has_item'] = f['ID'].isin(items['ID'])
print(f'{sum(f["has_item"])} files have an item.')

19295 files have an item.


So this raises some interesting questions:
1. If we discarded the file descriptors and only used the files found on the disk how many items would be usable compared to those when the file descriptors were used:

    - with just the original file descriptors
    - with the original plus vox-grn files?

2. How many duplicated files are on the disk?
3. What is the coverage like for files on the disk compared to those in grn-vox?

Using items would necessitate solving the problem of multiple items in the one file. This is only really a problem if the languages vary between the items. 

4. How many items belong to one file that has multiple languages?
5. Is this a problem for wav files as well as mp3 files?

## Duplicate Files on Disk.

Lets start by identifying duplicate files.

In [98]:
files_in_dir['program'] = files_in_dir['filename'].str.extract('([A-Za-z]?[0-9]{5})')
files_in_dir['track'] = files_in_dir['filename'].apply(extract_track)
files_in_dir['ID'] = files_in_dir['program'] + '_' + files_in_dir['track'].astype(int).apply('{:0>3d}'.format)
files_in_dir['duplicated'] = files_in_dir['ID'].duplicated(keep='first')
print(f'Number of duplicates: {sum(files_in_dir["duplicated"])}')

Number of duplicates: 77381


Now discard the duplicates

In [99]:
f_on_disk = files_in_dir[files_in_dir['duplicated'] == False].copy()

## Usable items using disk files

In [54]:
items_with_records = pd.read_csv("../../data/items_with_records_with_voxgrn_files.csv")
fd = pd.read_csv("../../data/records_with_voxgrn_files.csv")

usable_items['file on disk'] = usable_items['ID'].isin(f_on_disk['ID'])
print(f'The number of usable items with original metadata and voxgrn: {len(items_with_records)}')
print(f'The number of usable items with data from disk: {sum(usable_items["file on disk"])}')

The number of usable items with original metadata and voxgrn: 248682
The number of usable items with data from disk: 200812


Why the loss of 48000 items? Is it an A/C thing?

In [100]:
usable_items['_ID'] = usable_items['ID'].str.extract('[A-Za-z]?(.*)')
f_on_disk['_ID'] = f_on_disk['ID'].str.extract('[A-Za-z]?(.*)')
usable_items['file on disk'] = usable_items['_ID'].isin(f_on_disk['_ID'])
print(f'The number of usable items with original metadata and voxgrn: {len(items_with_records)}')
print(f'The number of usable items with data from disk: {sum(usable_items["file on disk"])}')

The number of usable items with original metadata and voxgrn: 248682
The number of usable items with data from disk: 205231


The remaining missing files are to do with non-conventional naming meaning that the track cannot be extracted from the file name.

Can we augment the existing files with data from the disk?

What items do not have data found on any disk?

In [56]:
usable_items['data in items with rec'] = usable_items['_ID'].isin(items_with_records['ID'])
print(f'The number of items with records is: {sum(usable_items["data in items with rec"])}')
usable_items['on disk but not in rec'] = usable_items['file on disk'] & ~usable_items['data in items with rec']
usable_items['on disk or in rec'] = usable_items['file on disk'] | usable_items['data in items with rec']
print(f'The number of items that could be added is: {sum(usable_items["on disk but not in rec"])}')
print(f'The total number of items is: {sum(usable_items["on disk or in rec"])}')

The number of items with records is: 248682
The number of items that could be added is: 2973
The total number of items is: 251655


In [101]:
usable_items_on_disk = usable_items[usable_items['on disk or in rec']].copy()
print(f'The total number of languages is: {len(set(usable_items_on_disk["Language Number"].to_list()))}')

The total number of languages is: 6076


Now the issue is that the files on the disk do not have an iso language associated with them. How many do not have an iso that we can infer?

In [102]:
def language_map(lang):
    return lang_id_to_iso[lang] if lang in lang_id_to_iso else '***'

usable_items['iso'] = usable_items['Language Number'].apply(language_map)

In [103]:
print(f'Number of items with unfound languages: {sum(usable_items["iso"] == "***")}')
usable_items_on_disk = usable_items[usable_items['on disk or in rec']].copy()
usable_items_with_rec = usable_items[usable_items['data in items with rec']].copy()
usable_items_file_on_disk = usable_items[usable_items['file on disk']].copy()

print(f'Number of items with files with unfound languages: {sum(usable_items_on_disk["iso"] == "***")}')
print(f'Number of items with rec with unfound languages: {sum(usable_items_with_rec["iso"] == "***")}')
print(f'Number of items with file on disk with unfound languages: {sum(usable_items_file_on_disk["iso"] == "***")}')

Number of items with unfound languages: 4445
Number of items with files with unfound languages: 1545
Number of items with rec with unfound languages: 1125
Number of items with file on disk with unfound languages: 1304


One last question: If I augment the items with records with those found just on the disk - how many items do I recover?

In [104]:
on_disk_recoverable = usable_items[usable_items["file on disk"] & ~usable_items["data in items with rec"]]
print(f'Usable items on disk but not in records: {len(on_disk_recoverable)}')
print(f'ISO language known for {sum(on_disk_recoverable["iso"] != "***")}')

Usable items on disk but not in records: 3112
ISO language known for 2692


OK - lets recover them.

In [105]:
print(usable_items.columns)
print(f_on_disk.columns)
print(fd.columns)
print(items_with_records.columns)

Index(['Program Number', 'Program Item Number', 'Tape Side', 'Track Number',
       'Original Recording Number', 'Original Item Number', 'Title',
       'Vernacular Item Title', 'Language Number', 'Language Name',
       'Item Start Time', 'Item Time', 'Finish Time', 'Original Time',
       'Script Number', 'Script Name', 'Picture Number', 'Item Type',
       'Comments', 'Entered By', 'Enter On Date', 'ID', 'file on disk', '_ID',
       'data in items with rec', 'on disk but not in rec', 'on disk or in rec',
       'iso'],
      dtype='object')
Index(['path', 'filename', 'file_match', 'program', 'track', 'ID',
       'duplicated', '_ID'],
      dtype='object')
Index(['Unnamed: 0', 'iso', 'language_name', 'track', 'location', 'year',
       'path', 'filename', 'length', 'program', 'ID'],
      dtype='object')
Index(['Unnamed: 0', 'iso', 'language_name', 'track', 'location', 'year',
       'path', 'filename', 'length', 'ID', 'item', 'title', 'start',
       'duration', 'end', 'type', 'pr

In [106]:
recover_candidates = on_disk_recoverable[on_disk_recoverable["iso"] != "***"].copy()
recover_candidates.drop(columns=['Tape Side', 'Original Recording Number', 'Original Item Number', 
                                 'Vernacular Item Title', 'Language Number', 'Script Number', 'Script Name', 'Picture Number', 'Entered By', 
                                 'Enter On Date', 'Comments', 'file on disk', 'data in items with rec',
                                 'on disk but not in rec', 'on disk or in rec', 'Original Time'], inplace=True)
recover_candidates.rename(columns={'Program Item Number': 'item', 'Title': 'title', 'Item Start Time': 'start', 'Track Number' : 'track',
        'Item Time': 'duration', 'Finish Time': 'end', 'Item Type': 'type', 'Program Number' : 'program', 'Language Name' : 'language_name'}, inplace=True)

print(recover_candidates.columns)

Index(['program', 'item', 'track', 'title', 'language_name', 'start',
       'duration', 'end', 'type', 'ID', '_ID', 'iso'],
      dtype='object')


We cannot recover location or year. Add in path a filename.

In [122]:
recover_candidates = recover_candidates.dropna(subset=['_ID'])   
candidates = pd.merge(recover_candidates, f_on_disk, on='_ID', how='left')
print(candidates.columns)

Index(['program_x', 'item', 'track_x', 'title', 'language_name', 'start',
       'duration', 'end', 'type', 'ID_x', '_ID', 'iso', 'path', 'filename',
       'file_match', 'program_y', 'track_y', 'ID_y', 'duplicated'],
      dtype='object')


Sanity check

In [123]:
print(f'Mismatched programs: {sum(candidates["program_x"] != candidates["program_y"])}')
print(f'Program x na: {sum(candidates["program_x"].isna())}')
print(f'Program y na: {sum(candidates["program_y"].isna())}')
match_err = candidates[candidates["program_x"] != candidates["program_y"]]


Mismatched programs: 1002
Program x na: 0
Program y na: 0


In [124]:
print(f'Mismatched tracks: {sum(candidates["track_x"] != candidates["track_y"])}')


Mismatched tracks: 0


In [125]:
candidates.drop(columns=['program_y', 'track_y', 'ID_x', 'ID_y', 'file_match', 'duplicated'], inplace=True)


In [126]:
print(candidates.columns)
print(items_with_records.columns)

Index(['program_x', 'item', 'track_x', 'title', 'language_name', 'start',
       'duration', 'end', 'type', '_ID', 'iso', 'path', 'filename'],
      dtype='object')
Index(['Unnamed: 0', 'iso', 'language_name', 'track', 'location', 'year',
       'path', 'filename', 'length', 'ID', 'item', 'title', 'start',
       'duration', 'end', 'type', 'program'],
      dtype='object')


In [127]:
candidates['program'] = candidates['program_x'].str.extract('[A-Za-z]?([0-9]{5})').astype(int)


In [128]:
candidates.rename(columns={ 'track_x' : 'track', '_ID' : 'ID'}, inplace=True)
candidates.drop(columns=['program_x'], inplace=True)

In [129]:
# make the path the same
candidates['new_path'] = candidates['path'].str.extract('/media/programs/(.*)')

In [130]:
candidates.drop(columns=['path'], inplace=True)
candidates.rename(columns={'new_path' : 'path'}, inplace=True)

In [131]:
augmented_items_with_records = pd.concat([items_with_records, candidates], ignore_index=True)

In [133]:
augmented_items_with_records.drop(columns=['Unnamed: 0'], inplace=True)

In [134]:
augmented_items_with_records.to_csv("../../data/items_with_records_all.csv")
print(f'The columns of items with records are:\n{augmented_items_with_records.columns}')

The columns of items with records are:
Index(['iso', 'language_name', 'track', 'location', 'year', 'path', 'filename',
       'length', 'ID', 'item', 'title', 'start', 'duration', 'end', 'type',
       'program'],
      dtype='object')
