In [1]:
import os
import h5py
import pytz
import numpy as np
import datetime
import pandas as pd

In [2]:
def get_time_values(timestamp, timezone='America/New_York'):
    """Split a UNIX timestamp into the local-time fields used to group 2-hour windows.

    Parameters
    ----------
    timestamp : int or float
        Seconds since the UNIX epoch; interpreted as UTC.
    timezone : str, optional
        Timezone name handed to ``pytz.timezone``. The timestamp is converted
        to this zone before any calendar field is read.

    Returns
    -------
    dict
        hour_of_the_day  -- local hour (0-23)
        day_of_the_week  -- ``datetime.weekday()`` value (Monday == 0)
        week_of_the_year -- ISO week number (``isocalendar()[1]``)
        hr_id            -- index of the 2-hour bucket (0-11)
        day_id           -- '<hr_id>-<day_of_the_week>-<week_of_the_year>'
    """
    # Build the timezone-aware UTC datetime in one step. utcfromtimestamp()
    # returns a naive value and is deprecated since Python 3.12.
    dt_utc = datetime.datetime.fromtimestamp(timestamp, pytz.UTC)
    dt = dt_utc.astimezone(pytz.timezone(timezone))

    hour_of_the_day = dt.hour
    day_of_the_week = dt.weekday()
    week_of_the_year = dt.isocalendar()[1]

    # 2-hour bucket id: hours 0-1 -> 0, hours 2-3 -> 1, ..., hours 22-23 -> 11.
    # Computed directly so the function does not rely on a lookup table that
    # is only defined in a later notebook cell.
    hr_id = hour_of_the_day // 2

    # Combine the 2-hour id, weekday and ISO week into one key for groupby later.
    # NOTE(review): the key carries no year, so two timestamps with the same
    # bucket, weekday and ISO week number in different (ISO) years share a
    # day_id -- confirm this is acceptable for the data being processed.
    day_id = str(hr_id)+'-'+str(day_of_the_week)+'-'+str(week_of_the_year)

    return dict(hour_of_the_day=hour_of_the_day,
                day_of_the_week=day_of_the_week,
                week_of_the_year=week_of_the_year,
                day_id=day_id,
                hr_id=hr_id)

In [3]:
def get_spl_frame_vector(spl_vector, n_frames=20, values_per_frame=4):
    """Average consecutive SPL values into per-frame values and report their extremes.

    With the defaults, the first 80 values of ``spl_vector`` are reduced to
    20 frame values, each the mean of 4 consecutive entries.

    Parameters
    ----------
    spl_vector : sequence of numbers
        Must hold at least ``n_frames * values_per_frame`` entries; any
        entries beyond that are ignored.
    n_frames : int, optional
        Number of frame values to produce (default 20).
    values_per_frame : int, optional
        Number of consecutive entries averaged into one frame (default 4).

    Returns
    -------
    dict
        Keys 'frame_0' ... 'frame_<n_frames-1>' with the averaged values, plus
        'max_frame' and 'min_frame' holding the largest and smallest of them.
        The extremes are precomputed so that min/max over a longer period can
        be derived from these per-row values.

    Raises
    ------
    ValueError
        If ``n_frames`` or ``values_per_frame`` is smaller than 1, or if
        ``spl_vector`` is too short.
    """
    if n_frames < 1 or values_per_frame < 1:
        raise ValueError('n_frames and values_per_frame must both be >= 1')

    n_required = n_frames * values_per_frame
    if len(spl_vector) < n_required:
        raise ValueError('spl_vector needs at least %d values, got %d'
                         % (n_required, len(spl_vector)))

    # 1.0 / 4 is exactly 0.25, so the default arguments reproduce the
    # original 0.25 * sum(...) arithmetic.
    scale = 1.0 / values_per_frame
    spl_frames = [scale * sum(spl_vector[start + k] for k in range(values_per_frame))
                  for start in range(0, n_required, values_per_frame)]

    result = {'frame_' + str(i): value for (i, value) in enumerate(spl_frames)}
    result['max_frame'] = max(spl_frames)
    result['min_frame'] = min(spl_frames)
    return result

In [4]:
# Feature file to process. The matching recording-index file has the same
# basename with 'features_openl3' replaced by 'recording_index'.
feats_path = '/beegfs/work/sonyc/features/openl3/2017/sonycnode-b827ebefb215.sonyc_features_openl3.h5'
indices_path = '/beegfs/work/sonyc/indices/2017/'+ os.path.basename(feats_path).replace('features_openl3', 'recording_index')

# Open both files explicitly read-only: this notebook only reads from them,
# and h5py releases before 3.0 did not default to read-only mode.
indices = h5py.File(indices_path, 'r')
blob = h5py.File(feats_path, 'r')

# 3 mappings/lists to make list comprehensions easier
# map_2_hrs: hour of the day as a string ('0'..'23') -> 2-hour bucket id (0..11)
map_2_hrs = {str(hour): hour // 2 for hour in range(24)}
# spl_iterable: start offsets 0, 4, ..., 76 of each group of 4 values
spl_iterable = list(range(0, 80, 4))
# frame_keys: 'frame_0' ... 'frame_19'
frame_keys = ['frame_'+ str(i) for i in range(20)]

# Get the timestamp from the feature file
ts = blob['openl3']['timestamp']

# Not used as of now (only its length is checked below)
feats = blob['openl3']['openl3']

# Get the spl_vector from the indices file
spl_vecs = indices['recording_index']['spl_vector']

# Features, timestamps and SPL vectors must line up row by row
assert feats.shape[0] == ts.shape[0] == spl_vecs.shape[0]

In [5]:
# Apply get_spl_frame_vector() to every row of spl_vecs (axis 1); each row
# yields one dict of per-frame averages plus their min and max
spl_arr = np.apply_along_axis(get_spl_frame_vector, 1, spl_vecs)

# Apply get_time_values() to each element of the timestamp array.
# otypes=[object] states up front that each result is a Python object (a dict),
# so np.vectorize does not make an extra call on the first element just to
# infer the output type.
dt_vectorize = np.vectorize(get_time_values, otypes=[object])
t_arr = dt_vectorize(ts)

# Convert the dicts obtained above into dataframes and combine them side by side
# (one row per timestamp / spl vector) to make aggregation easier
t_df = pd.DataFrame(list(t_arr))
spl_df = pd.DataFrame(list(spl_arr))
df = pd.concat([t_df, spl_df], axis=1)

df.head()

Unnamed: 0,day_id,day_of_the_week,hour_of_the_day,hr_id,week_of_the_year,frame_0,frame_1,frame_10,frame_11,frame_12,...,frame_2,frame_3,frame_4,frame_5,frame_6,frame_7,frame_8,frame_9,max_frame,min_frame
0,8-2-42,2,16,8,42,62.7975,65.184999,64.592499,64.95,64.28,...,62.577499,63.16,63.644999,64.000001,63.775002,64.584999,65.7325,65.655001,66.112502,62.577499
1,8-2-42,2,16,8,42,65.992498,67.807503,67.66,67.122501,66.255001,...,68.367498,69.279999,68.76,69.180002,70.1,68.410002,68.370001,69.362499,70.1,65.717501
2,8-2-42,2,16,8,42,62.717501,62.965,64.887503,64.505001,64.107499,...,63.320001,63.590001,62.675,63.215,63.9525,65.049999,65.2425,64.752499,65.2425,62.675
3,8-2-42,2,16,8,42,62.15,66.0525,63.315,63.33,62.4575,...,62.562499,62.055,61.4925,61.679999,62.475,62.784999,62.5575,62.945,66.0525,61.4925
4,8-2-42,2,16,8,42,64.340001,64.8475,65.010001,64.690001,64.4775,...,65.592499,65.302498,64.9825,65.9475,66.2575,66.177502,65.549999,66.365,66.365,62.4625


In [7]:
# Get the min and max spl values over each 2 hr window (one row per day_id).
# The aggregations are named by string ('min'/'max'): recent pandas versions
# emit a FutureWarning when np.min/np.max are passed to agg().
res = df.groupby(['day_id']).agg({'min_frame': 'min', 'max_frame': 'max'}).reset_index()

# Attach the window-level values to every row. Columns that exist on both sides
# get a suffix: '_emb' for the per-row value, '_2_hr' for the window value.
final = pd.merge(df, res, on='day_id', how='outer', suffixes=('_emb', '_2_hr'))

# Get relative loudness of each frame using the min_frame_2_hr and max_frame_2_hr calculated above:
# (frame - window min) / (window max - window min).
# The window minimum and range do not depend on the frame, so compute them once.
window_min = final['min_frame_2_hr']
window_range = final['max_frame_2_hr'] - window_min
for key in frame_keys:
    final[key+'_rel_loudness'] = (final[key] - window_min)/window_range

In [8]:
# Preview the first 10 rows of the merged frame
final.head(10)

Unnamed: 0,day_id,day_of_the_week,hour_of_the_day,hr_id,week_of_the_year,frame_0,frame_1,frame_10,frame_11,frame_12,...,frame_10_rel_loudness,frame_11_rel_loudness,frame_12_rel_loudness,frame_13_rel_loudness,frame_14_rel_loudness,frame_15_rel_loudness,frame_16_rel_loudness,frame_17_rel_loudness,frame_18_rel_loudness,frame_19_rel_loudness
0,8-2-42,2,16,8,42,62.7975,65.184999,64.592499,64.95,64.28,...,0.253399,0.274765,0.234723,0.317197,0.34424,0.228597,0.182131,0.221126,0.203944,0.140146
1,8-2-42,2,16,8,42,65.992498,67.807503,67.66,67.122501,66.255001,...,0.436725,0.404602,0.352757,0.369491,0.363066,0.328253,0.445839,0.382041,0.320634,0.356343
2,8-2-42,2,16,8,42,62.717501,62.965,64.887503,64.505001,64.107499,...,0.27103,0.24817,0.224414,0.262961,0.245181,0.244434,0.188107,0.282235,0.236217,0.158972
3,8-2-42,2,16,8,42,62.15,66.0525,63.315,63.33,62.4575,...,0.177051,0.177947,0.125803,0.147766,0.151502,0.181981,0.210967,0.161363,0.074854,0.135515
4,8-2-42,2,16,8,42,64.340001,64.8475,65.010001,64.690001,64.4775,...,0.278351,0.259226,0.246526,0.179441,0.189751,0.150456,0.203646,0.126102,0.177798,0.173017
5,8-2-42,2,16,8,42,64.187499,63.73,63.4775,63.0275,63.624999,...,0.186762,0.159869,0.195577,0.232183,0.202151,0.258778,0.196623,0.150605,0.14179,0.179441
6,8-2-42,2,16,8,42,62.365,64.085,65.464998,68.907501,70.182501,...,0.305543,0.511281,0.58748,0.684895,0.511878,0.149111,0.137606,0.164201,0.128791,0.16973
7,8-2-42,2,17,8,42,70.392502,68.360003,62.65,62.3325,62.01,...,0.137308,0.118333,0.099059,0.131481,0.119378,0.110862,0.110414,0.163006,0.111012,0.094278
8,8-2-42,2,17,8,42,65.307499,67.474998,66.142502,64.6975,64.0325,...,0.346033,0.259674,0.219931,0.199761,0.214403,0.19991,0.158823,0.188256,0.13925,0.11893
9,8-2-42,2,17,8,42,65.685001,66.360001,65.800001,65.512499,65.254999,...,0.325564,0.308382,0.292993,0.304945,0.296429,0.342896,0.396832,0.314807,0.310773,0.328403


In [None]:
# TODO: How should we bin the rel_loudness values to estimate their probability distribution?