In [103]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from pathlib import Path, PurePath
import json
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
import re

In [2]:
# path of note sound files
pth1 = Path('.') / 'data'
pth2 = Path('.') / '..' / 'note-spider' / 'data'
data_pth = pth2 if pth2.exists() else pth1
if not data_pth.exists():
    raise('Cannot find data')
data_pth

PosixPath('../note-spider/data')

In [3]:
file_pth_json = None
with open(data_pth / 'urlsAndPath.json') as f:
    text = f.read()
    file_pth_json = json.loads(text)
file_pth_json[:3]

[{'url': 'https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.Bb3B3.aiff',
  'path': '/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default'},
 {'url': 'https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C4B4.aiff',
  'path': '/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default'},
 {'url': 'https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C5B5.aiff',
  'path': '/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default'}]

In [4]:
file_pth_dict = { 'url': [o['url'] for o in file_pth_json], 'path': [o['path'] for o in file_pth_json] }
df = pd.DataFrame(file_pth_dict, columns=['url','path', 'local_path'], index=None)
df.head()

Unnamed: 0,url,path,local_path
0,https://theremin.music.uiowa.edu/sound files/M...,/Users/origami/Desktop/temp_projects/muse/pack...,
1,https://theremin.music.uiowa.edu/sound files/M...,/Users/origami/Desktop/temp_projects/muse/pack...,
2,https://theremin.music.uiowa.edu/sound files/M...,/Users/origami/Desktop/temp_projects/muse/pack...,
3,https://theremin.music.uiowa.edu/sound files/M...,/Users/origami/Desktop/temp_projects/muse/pack...,
4,https://theremin.music.uiowa.edu/sound files/M...,/Users/origami/Desktop/temp_projects/muse/pack...,


In [40]:
def get_file_path(row):
    url, path = row['url'], row['path']
    file_name = url.split('/')[-1]
    file_path = path + '/' + file_name
    return file_path

def get_relative_path(row):
    local_path = Path(row['local_path'])
    relative_path = local_path.relative_to(data_pth.resolve())
    return relative_path

In [41]:
df['local_path'] = df.apply(get_file_path, axis=1)
df['relative_path'] = df.apply(get_relative_path, axis=1)

In [33]:
def check_local_path_exists(row):
    local_path = Path(row['local_path'])
    if not local_path.exists():
        print(row)
    return local_path.exists()

### Extract path info
group: woodwinds
name: oboe
type: 
dynamics: pp, mf , ff

### 1
splash.bow.aif
20chinese.bow.aif

> No explicit meaning, skip processing

### 2
Oboe.pp.Bb3B3.aiff

> dynamics.note

20windgong.bow.stereo.aif

### 3

*Case 1*: Flute.vib.pp.B3B4.aiff

*Case 2*: Vibraphone.bow.Db6.stereo.aif


### 4
Viola.arco.sulC.pp.C3B3.aiff

Vibraphone.shortsustain.ff.F6.stereo.aif


### 5

Bass.arco.ff.sulE.E1.stereo.aif

Xylophone.rosewood.roll.ff.Ab7.stereo.aif

In [160]:
def get_path_info(row):
    relative_path = row['relative_path']
    splited = str(relative_path).split('/')
    year, group, name, subtype = splited[:-1]
    return splited[:-1]

def extend_col_year(row):
    return get_path_info(row)[0]

def extend_col_group(row):
    return get_path_info(row)[1]

def extend_col_name(row):
    return get_path_info(row)[2]

def extend_col_subtype(row):
    return get_path_info(row)[3]

In [161]:
df['year'] = df.apply(extend_col_year, axis=1)
df['group'] = df.apply(extend_col_group, axis=1)
df['instrument'] = df.apply(extend_col_name, axis=1)
df['subtype'] = df.apply(extend_col_subtype, axis=1)

#### Extract dynamics, notes

In [167]:
def find_dynamics(strs):
    result = []
    for s in strs:
        res = re.findall(r"^[pmf]{1,3}$",s)
        result.extend(res)
    return result

def find_note(strs):
    result = []
    for s in strs:
        res = re.findall(r"^[A-G]{1,}[A-Gb0-9]{1,}$",s)
        result.extend(res)
    return result
def extend_col_dynamics(row):
    file_path = Path(row['local_path'])
    file_name = file_path.name
    splited = file_name.split('.')
    dynamics = find_dynamics(splited)
    if len(dynamics) > 1:
        print('Wrong res', dynamics)
    if len(dynamics) == 0:
        return None
    return dynamics[0]

def extend_col_note(row):
    file_path = Path(row['local_path'])
    file_name = file_path.name
    splited = file_name.split('.')
    note = find_note(splited)
    if len(note) > 1:
        print('Wrong res', note)
    if len(note) == 0:
        return None
    return note[0]

def extend_col_notes(row):
    note = row['note']
    if note is None:
        return note
    return re.findall(r"[A-G]b?[0-9]", note)

In [170]:
df['dynamics'] = df.apply(extend_col_dynamics, axis=1)
df['note'] = df.apply(extend_col_note, axis=1)
df['notes'] = df.apply(extend_col_notes, axis=1)

In [171]:
df[['dynamics', 'note','notes', 'group', 'instrument','subtype']].head()

Unnamed: 0,dynamics,note,notes,group,instrument,subtype
0,pp,Bb3B3,"[Bb3, B3]",woodwinds,oboe,default
1,pp,C4B4,"[C4, B4]",woodwinds,oboe,default
2,pp,C5B5,"[C5, B5]",woodwinds,oboe,default
3,pp,C6Ab6,"[C6, Ab6]",woodwinds,oboe,default
4,mf,Bb3B3,"[Bb3, B3]",woodwinds,oboe,default


### Tags

In [176]:
def extend_col_tags(row):
    group,name, subtype = row['group'],row['instrument'],row['subtype']
    note, notes = row['note'], row['notes']
    alltags = [group, name]
    if subtype != 'default':
        alltags.append(subtype)
    if note is not None:
        alltags.append(note)
        alltags.extend(notes)
    return alltags
    

In [177]:
df['tags'] = df.apply(extend_col_tags, axis=1)

In [178]:
df.head()

Unnamed: 0,url,path,local_path,relative_path,dynamics,note,year,group,instrument,subtype,notes,tags
0,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.Bb3B3.aiff,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default/Oboe.pp.Bb3B3.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.Bb3B3.aiff,pp,Bb3B3,pre2012,woodwinds,oboe,default,"[Bb3, B3]","[woodwinds, oboe, Bb3B3, Bb3, B3]"
1,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C4B4.aiff,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default/Oboe.pp.C4B4.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C4B4.aiff,pp,C4B4,pre2012,woodwinds,oboe,default,"[C4, B4]","[woodwinds, oboe, C4B4, C4, B4]"
2,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C5B5.aiff,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default/Oboe.pp.C5B5.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C5B5.aiff,pp,C5B5,pre2012,woodwinds,oboe,default,"[C5, B5]","[woodwinds, oboe, C5B5, C5, B5]"
3,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C6Ab6.aiff,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default/Oboe.pp.C6Ab6.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C6Ab6.aiff,pp,C6Ab6,pre2012,woodwinds,oboe,default,"[C6, Ab6]","[woodwinds, oboe, C6Ab6, C6, Ab6]"
4,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.mf.Bb3B3.aiff,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default,/Users/origami/Desktop/temp_projects/muse/packages/note-spider/data/pre2012/woodwinds/oboe/default/Oboe.mf.Bb3B3.aiff,pre2012/woodwinds/oboe/default/Oboe.mf.Bb3B3.aiff,mf,Bb3B3,pre2012,woodwinds,oboe,default,"[Bb3, B3]","[woodwinds, oboe, Bb3B3, Bb3, B3]"


In [181]:
df2 = df.drop(['path','local_path'],axis=1)

In [182]:
df2.head()

Unnamed: 0,url,relative_path,dynamics,note,year,group,instrument,subtype,notes,tags
0,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.Bb3B3.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.Bb3B3.aiff,pp,Bb3B3,pre2012,woodwinds,oboe,default,"[Bb3, B3]","[woodwinds, oboe, Bb3B3, Bb3, B3]"
1,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C4B4.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C4B4.aiff,pp,C4B4,pre2012,woodwinds,oboe,default,"[C4, B4]","[woodwinds, oboe, C4B4, C4, B4]"
2,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C5B5.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C5B5.aiff,pp,C5B5,pre2012,woodwinds,oboe,default,"[C5, B5]","[woodwinds, oboe, C5B5, C5, B5]"
3,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.pp.C6Ab6.aiff,pre2012/woodwinds/oboe/default/Oboe.pp.C6Ab6.aiff,pp,C6Ab6,pre2012,woodwinds,oboe,default,"[C6, Ab6]","[woodwinds, oboe, C6Ab6, C6, Ab6]"
4,https://theremin.music.uiowa.edu/sound files/MIS/Woodwinds/oboe/Oboe.mf.Bb3B3.aiff,pre2012/woodwinds/oboe/default/Oboe.mf.Bb3B3.aiff,mf,Bb3B3,pre2012,woodwinds,oboe,default,"[Bb3, B3]","[woodwinds, oboe, Bb3B3, Bb3, B3]"


In [189]:
df2[df2['relative_path'].str.contains('html')]

KeyError: "None of [Float64Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n              ...\n              nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],\n             dtype='float64', length=3687)] are in the [columns]"

In [199]:
df2.dtypes

url              string
relative_path    object
dynamics         object
note             object
year             object
group            object
instrument       object
subtype          object
notes            object
tags             object
dtype: object

In [198]:
df2['url'] = df2['url'].astype('string')