In [1]:
import ujson
from glob import glob
import os.path as op

In [2]:
def split_index_path(path):
    """Derive the catalog name from the path of a catalog data file.

    The catalog name is taken to be the directory that directly contains
    the file, i.e. the second-to-last component of the normalised path.

    Parameters
    ----------
    path : str
        Path to a data file, e.g. ``.../catalogs/SD/data.dat``.

    Returns
    -------
    tuple
        ``(cat_name, path)``. ``path`` is returned exactly as it was passed
        in (it is *not* normalised), so it can be opened as-is.

    Raises
    ------
    IndexError
        If the normalised path consists of a single component and therefore
        has no parent directory to use as the catalog name.
    """
    # Normalise first so that './' segments or doubled separators do not
    # shift the position of the parent directory.
    components = op.normpath(path).split(op.sep)
    return components[-2], path

In [9]:
def read_catalog_db(data, split_char='\t'):
    """Parse the raw bytes of a catalog data file into an ``{id: name}`` index.

    ``data`` is cut into records at ``split_char``; every record is cut into
    fields at NUL bytes. Only records whose fourth field equals ``b'd'`` are
    kept. For those, the first field (UTF-8) becomes the key and the fifth
    field (UTF-8, with its first character dropped) becomes the value.

    Parameters
    ----------
    data : bytes or bytearray
        Complete content of the data file, read in binary mode.
    split_char : str or bytes, optional
        Record separator. A text string is converted to its single-byte
        (latin-1) form so that it can be used on binary data.

    Returns
    -------
    dict
        Mapping of decoded record id to decoded record value. If an id occurs
        more than once, the last occurrence wins.

    Raises
    ------
    UnicodeDecodeError
        If a kept key or value is not valid UTF-8.
    """
    field_separator = b'\x00'
    # Binary data can only be split on byte strings under Python 3; under
    # Python 2 a plain ``str`` already is one and is left untouched.
    if not isinstance(split_char, bytes):
        split_char = split_char.encode('latin-1')

    index = {}
    for record in data.split(split_char):
        fields = record.split(field_separator)
        # A record needs at least five fields to carry both the type marker
        # (field 4) and the value (field 5). Shorter chunks -- for instance
        # the empty one after a trailing separator -- cannot yield an entry.
        if len(fields) < 5 or fields[3] != b'd':
            continue
        # NOTE(review): the first character of the value is discarded;
        # presumably it is a format prefix -- confirm against the file spec.
        index[fields[0].decode('utf8')] = fields[4].decode('utf8')[1:]
    return index

In [12]:
def export_index_to_json(cat_name, dat_path, out_dir='/media/jakob/bigdata/index_data'):
    """Read one catalog data file and write its index as a JSON file.

    The output file is ``<out_dir>/<cat_name>_indicies.json`` and contains an
    object with the keys ``name`` (the catalog name) and ``index`` (the
    mapping returned by ``read_catalog_db``). The output path is printed.

    Parameters
    ----------
    cat_name : str
        Catalog name; also selects the record separator (see below).
    dat_path : str
        Path of the binary catalog data file to read.
    out_dir : str, optional
        Directory the JSON file is written to. Defaults to the location that
        was previously hard-coded.

    Returns
    -------
    None
    """
    json_path = op.join(out_dir, '{}_indicies.json'.format(cat_name))

    # Read everything up front so the input file is closed before parsing.
    with open(dat_path, 'rb') as f:
        data = bytearray(f.read())

    # Catalog 'S' is parsed with backspace (0x08) as record separator; all
    # other catalogs use the parser's default (tab).
    if cat_name == 'S':
        index = read_catalog_db(data, split_char='\x08')
    else:
        index = read_catalog_db(data)

    print(json_path)
    json_data = {'name': cat_name, 'index': index}
    # Serialise first and write bytes explicitly: depending on the Python
    # version ujson returns text or an already encoded byte string, and the
    # output file is opened in binary mode.
    payload = ujson.dumps(json_data, indent=4, ensure_ascii=False)
    if not isinstance(payload, bytes):
        payload = payload.encode('utf8')
    # Opening the file only now means a failed serialisation does not leave
    # an empty output file behind.
    with open(json_path, 'wb') as out:
        out.write(payload)

In [13]:
# Collect every catalog data file and key it by the name of its catalog
# directory. If a catalog directory holds several matching files, the one
# that comes last in the glob result is kept.
paths = glob('/media/jakob/bigdata/repository.org.20091214/catalogs/*/data*.dat')
data_files = dict(map(split_index_path, paths))
# Cell output: the catalog names that were found, in sorted order.
sorted(data_files)

['S', 'SB', 'SC', 'SD', 'SE']

In [14]:
# Export one JSON index file per catalog; each call prints its output path.
for cat_name, dat_path in data_files.items():
    export_index_to_json(cat_name, dat_path)

/media/jakob/bigdata/index_data/SC_indicies.json
/media/jakob/bigdata/index_data/SB_indicies.json
/media/jakob/bigdata/index_data/S_indicies.json
/media/jakob/bigdata/index_data/SE_indicies.json
/media/jakob/bigdata/index_data/SD_indicies.json


In [15]:
# Spot-check: print the exported JSON index of catalog 'SD' via the shell.
!cat /media/jakob/bigdata/index_data/SD_indicies.json

{
    "index":{
        "SD0150421":"Rundfunk-Sinfonieorchester <Leipzig>",
        "SD0160065":"Salzburger Bachchor",
        "SD0120200":"Orchestra Internazionale d'Italia",
        "SD0060525":"Gürzenich-Orchester <Köln>",
        "SD0080061":"Kammerorchester C. Ph. E. Bach <Berlin, Ost>",
        "SD0070055":"Hamburgische Staatsoper \/ Chor",
        "SD0140104":"Radio-Sinfonie-Orchester <Frankfurt, Main>",
        "SD0020221":"Bayerische Staatsoper <München> \/ Chor",
        "SD0150189":"Rundfunk im Amerikanischen Sektor Berlins \/ Kammerensemble",
        "SD0130364":"Philharmonisches Staatsorchester <Hamburg>",
        "SD0160819":"Studio for New Music",
        "SD0100002":"Ma'alot-Quintett",
        "SD0110250":"New Budapest Quartet",
        "SD0130570":"Prazský Komorní Sbor",
        "SD0040423":"Coro Gulbenkian <Lisboa>",
        "SD0020736":"British Broadcasting Corporation <London> \/ Symphony Orchestra",
        "SD0050419":"Ensemble <Köln>",
        