# Create a local dataset
The purpose of this notebook is to get started with locally generated datasets.

In [25]:
import pandas as pd
from datasets import load_dataset

In [26]:
# Load the file-description table from its CSV export.
items_csv_path = "/home/jovyan/work/GRN-Notebooks/Data/items_with_records.csv"
items = pd.read_csv(items_csv_path)


In [27]:
# Split the table by ISO language code so that a handful of records per
# language can be exported as a JSON file further down.
items_eng = items.loc[items['ISO'] == 'eng']
items_spa = items.loc[items['ISO'] == 'spa']
items_fra = items.loc[items['ISO'] == 'fra']

In [28]:
# Stack the first two rows of each language subset into one sample frame.
items_sample_ds = pd.concat(
    [subset.head(2) for subset in (items_eng, items_spa, items_fra)]
)

In [29]:
# Show which columns the sample frame carries before trimming it down.
print(items_sample_ds.columns)

Index(['Unnamed: 0', 'LanguageID', 'ISO', 'Language', 'Program', 'Track',
       'Recordist', 'Location', 'Year', 'Path', 'Filename', 'Size', 'Length',
       'ID', 'Program Item Number', 'Tape Side', 'Title', 'Item Start Time',
       'Item Time', 'Finish Time', 'Original Time', 'Item Type', 'Compound'],
      dtype='object')


In [30]:
# Remove the columns that are not wanted in the exported sample.
unwanted_columns = [
    'Unnamed: 0', 'LanguageID', 'Recordist', 'Location', 'Year', 'Program',
    'Track', 'Item Start Time', 'Item Time', 'Finish Time', 'Original Time',
    'Compound',
]
items_sample_ds = items_sample_ds.drop(columns=unwanted_columns)

In [31]:
def create_filename(row):
    """Build the audio file location for one row.

    The result is the fixed '/media/programs/' prefix followed by the row's
    'Path' and 'Filename' values, concatenated without any extra separator.
    """
    relative_dir = row['Path']
    base_name = row['Filename']
    return '/media/programs/' + relative_dir + base_name

# Compute the file location row by row and store it as the 'file' column.
items_sample_ds = items_sample_ds.assign(
    file=items_sample_ds.apply(create_filename, axis=1)
)

In [32]:
# 'Path' and 'Filename' are now folded into 'file', so they can go.
items_sample_ds = items_sample_ds.drop(columns=['Path', 'Filename'])

In [46]:
# Write the sample to the working directory as a JSON array with one object
# per row (orient='records').
items_sample_ds.to_json('./example_ds.json', orient='records')

In [49]:
# Read the exported file back with the `datasets` JSON loader, asking for the
# 'train' split directly.
sample_ds = load_dataset('json', data_files='./example_ds.json', split='train')

Using custom data configuration default-78652adcc619fa7f


Downloading and preparing dataset json/default to /home/jovyan/.cache/huggingface/datasets/json/default-78652adcc619fa7f/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/json/default-78652adcc619fa7f/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


In [50]:
# Check which feature names and dtypes the loader inferred from the JSON file.
print(sample_ds.features)

{'ISO': Value(dtype='string', id=None), 'Language': Value(dtype='string', id=None), 'Size': Value(dtype='int64', id=None), 'Length': Value(dtype='float64', id=None), 'ID': Value(dtype='string', id=None), 'Program Item Number': Value(dtype='int64', id=None), 'Tape Side': Value(dtype='string', id=None), 'Title': Value(dtype='string', id=None), 'Item Type': Value(dtype='string', id=None), 'file': Value(dtype='string', id=None)}


Now try the dataset out on a model.

In [51]:
import torch
import librosa
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor

def map_to_array(example, sampling_rate=16000, max_samples=64000):
    """Load the audio referenced by an example and attach its leading samples.

    Args:
        example: Mapping with a "file" entry holding the path of the audio
            file to load.
        sampling_rate: Value passed to ``librosa.load`` as ``sr``. Defaults to
            16000, the rate previously hard-coded here.
        max_samples: Number of leading samples to keep. Defaults to 64000,
            i.e. 4 seconds at the default sampling rate.

    Returns:
        The same ``example`` object with a "speech" entry holding at most
        ``max_samples`` samples of the loaded mono signal.
    """
    # librosa.load returns (signal, rate); the rate is the one requested above.
    speech, _ = librosa.load(example["file"], sr=sampling_rate, mono=True)
    # Clip so every example contributes a bounded amount of audio.
    example["speech"] = speech[:max_samples]
    return example

# Run map_to_array over every row so each example gains a "speech" entry.
sample_ds = sample_ds.map(map_to_array)


  0%|          | 0/6 [00:00<?, ?ex/s]

In [5]:

# The classifier and its feature extractor are both loaded from the same
# pretrained checkpoint.
checkpoint = "superb/hubert-large-superb-er"
model = HubertForSequenceClassification.from_pretrained(checkpoint)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)


In [21]:

# Turn the clipped waveforms into padded PyTorch tensors for the model;
# this computes attention masks and normalizes the waveform if needed.
waveforms = sample_ds["speech"]
inputs = feature_extractor(
    waveforms,
    sampling_rate=16000,
    padding=True,
    return_tensors="pt",
)


In [22]:

# Inference only: run the forward pass under torch.no_grad() so that no
# autograd graph is recorded for the logits.
with torch.no_grad():
    logits = model(**inputs).logits


In [23]:

# Pick the highest-scoring class per example, then translate the class ids
# into label names using the model's configuration.
predicted_ids = logits.argmax(dim=-1)
id2label = model.config.id2label
labels = [id2label[class_id] for class_id in predicted_ids.tolist()]

In [18]:
# Print the dataset summary (feature names and row count).
print( sample_ds )

Dataset({
    features: ['ISO', 'Language', 'Size', 'Length', 'ID', 'Program Item Number', 'Tape Side', 'Title', 'Item Type', 'file'],
    num_rows: 1
})


Experiment with the different JSON export formats.

In [38]:
import json

def show_json(orient, frame=None):
    """Pretty-print the JSON export of a DataFrame for a given ``orient``.

    Args:
        orient: Passed straight through to ``DataFrame.to_json``.
        frame: DataFrame to export. When omitted, the notebook-level
            ``items_sample_ds`` is used, as before.

    Returns:
        None. The indented JSON text is written to standard output.
    """
    if frame is None:
        # Reading a module-level name needs no `global` declaration.
        frame = items_sample_ds
    parsed = json.loads(frame.to_json(orient=orient))
    # ensure_ascii=False prints accented characters as themselves rather
    # than as \uXXXX escapes, which keeps non-English text readable.
    print(json.dumps(parsed, indent=4, ensure_ascii=False))

In [45]:
# 'records' is the orient used for the exported example_ds.json file.
show_json('records')

[
    {
        "ISO": "eng",
        "Language": "English: Aboriginal",
        "Size": 5365806,
        "Length": 37.259903,
        "ID": "A75295_001",
        "Program Item Number": 1,
        "Tape Side": " ",
        "Title": "Picture 1",
        "Item Type": "Message",
        "file": "/media/programs/Programs/75/75295/A75295/PM-1811/A75295-001.wav"
    },
    {
        "ISO": "eng",
        "Language": "English: Aboriginal",
        "Size": 3665474,
        "Length": 25.451528,
        "ID": "A75295_002",
        "Program Item Number": 2,
        "Tape Side": " ",
        "Title": "Picture 2",
        "Item Type": "Message",
        "file": "/media/programs/Programs/75/75295/A75295/PM-1811/A75295-002.wav"
    },
    {
        "ISO": "spa",
        "Language": "Spanish: Mexico",
        "Size": 219330154,
        "Length": 1523.0,
        "ID": "A66344_001",
        "Program Item Number": 1,
        "Tape Side": " ",
        "Title": "The Refugee's Journey",
        "Item Type":