In [1]:
import os
import h5py
import pytz
import numpy as np
import datetime
import pandas as pd

In [2]:
def get_time_values(timestamp, timezone='America/New_York'):
    
    dt = datetime.datetime.utcfromtimestamp(timestamp)
    dt = pytz.UTC.localize(dt)
    dt = dt.astimezone(pytz.timezone(timezone))
    hour_of_the_day = dt.hour
    day_of_the_week = dt.weekday()
    week_of_the_year = dt.isocalendar()[1]
    
    # Get the 2 hour group id
    # dt.hour = 0 and dt.hour = 1 gets mapped to hr_id = 0
    # dt.hour = 2 and dt.hour = 3 gets mapped to hr_id = 1
    #.
    #.
    # dt.hour = 22 and dt.hour = 23 gets mapped to hr_id = 11
    hr_id = map_2_hrs[str(hour_of_the_day)]
    
    # Get combination of 2 hour id, day of the week and week of the year to enable groupby later
    # This acts as a unique key for a 2-hr window
    day_id = str(hr_id)+'-'+str(day_of_the_week)+'-'+str(week_of_the_year)
    
    return dict(hour_of_the_day=hour_of_the_day,
                day_of_the_week=day_of_the_week,
                week_of_the_year=week_of_the_year,
                day_id=day_id,
                hr_id=hr_id)

In [3]:
def get_spl_frame_vector(spl_vector):
    
    # From list of 80 values in spl_vector, get 20 values for 20 embedding frames by averaging 4 values
    # Calculate the min and max of resulting 20 values to make calculation of min and max spl over 2 hr period easier
    spl_frames = [0.25*sum([spl_vector[i+k] for k in range(4)]) for i in spl_iterable]
    return dict({frame_keys[i]: value for (i, value) in enumerate(spl_frames)},
                max_frame=max(spl_frames),
                min_frame=min(spl_frames))

In [10]:
def pretty_print_test(map_2_hrs, frame_keys, df, final):
    print('Keys for 2 hr groups: ', map_2_hrs)
    print('Keys for spl/frame: ', frame_keys)
    print('Dataframe before aggregating min and max over spl values:')
    df.head()
    print('Dataframe after calculating the relative loudness per frame:')
    final.head()

In [5]:
feats_path = '/beegfs/work/sonyc/features/openl3/2017/sonycnode-b827ebefb215.sonyc_features_openl3.h5'
indices_path = '/beegfs/work/sonyc/indices/2017/'+ os.path.basename(feats_path).replace('features_openl3', 'recording_index')
indices = h5py.File(indices_path)
blob = h5py.File(feats_path)
 
# Get the timestamp from the feature file
ts = blob['openl3']['timestamp']

# Not used as of now
feats = blob['openl3']['openl3']

# Get the spl_vector from the indices file
spl_vecs = indices['recording_index']['spl_vector']

assert feats.shape[0] == ts.shape[0] == spl_vecs.shape[0]

In [6]:
# 3 mappings/list to make list comprehensions easier
map_2_hrs = {str(2*i+k): value for k in range(2) for (i, value) in enumerate(range(12))}
spl_iterable = [4*k for k in range(20)]
frame_keys = ['frame_'+ str(i) for i in range(20)] 

In [7]:
# Get the spl avg value of 4 consecutive values from spl_vector
spl_arr = np.apply_along_axis(get_spl_frame_vector, 1, spl_vecs)

# Apply get_time_values() to each element of the timestamp array 
dt_vectorize = np.vectorize(get_time_values)
t_arr = dt_vectorize(ts)

# Convert the dicts obtained above into dataframe and combine them to make aggregation easier
t_df = pd.DataFrame(list(t_arr))
spl_df = pd.DataFrame(list(spl_arr)) 
df = pd.concat([t_df, spl_df], axis=1)

In [8]:
# Get the min and max spl values over 2 hr window
res = df.groupby(['day_id']).agg({'min_frame': np.min, 'max_frame': np.max}).reset_index()
final = pd.merge(df, res, on='day_id', how='outer', suffixes=('_emb', '_2_hr'))

# Get relative loudness of each frame using the min_frame_2_hr and max_frame_2_hr calculated above
for key in frame_keys:
    final[key+'_rel_loudness'] = (final[key] - final['min_frame_2_hr'])/(final['max_frame_2_hr'] - final['min_frame_2_hr'])

In [11]:
pretty_print_test(map_2_hrs, frame_keys, df, final)

Keys for 2 hr groups:  {'0': 0, '2': 1, '4': 2, '6': 3, '8': 4, '10': 5, '12': 6, '14': 7, '16': 8, '18': 9, '20': 10, '22': 11, '1': 0, '3': 1, '5': 2, '7': 3, '9': 4, '11': 5, '13': 6, '15': 7, '17': 8, '19': 9, '21': 10, '23': 11}
Keys for spl/frame:  ['frame_0', 'frame_1', 'frame_2', 'frame_3', 'frame_4', 'frame_5', 'frame_6', 'frame_7', 'frame_8', 'frame_9', 'frame_10', 'frame_11', 'frame_12', 'frame_13', 'frame_14', 'frame_15', 'frame_16', 'frame_17', 'frame_18', 'frame_19']
Dataframe before aggregating min and max over spl values:
Dataframe after calculating the relative loudness per frame:


In [None]:
# How should we bin the rel_loudness values to find the probability distribution?