In [1]:
import os
import h5py
import pytz
import numpy as np
import datetime
import pandas as pd
from scipy.stats import rankdata

In [2]:
def get_time_values(timestamp, timezone='America/New_York', 
                    map_2_hrs={str(2*i+k): value for k in range(2) for (i, value) in enumerate(range(12))}):
    """Derive local-time calendar features and a 2-hour window key from a UNIX timestamp.

    Parameters
    ----------
    timestamp : float or int
        Seconds since the UNIX epoch (interpreted as UTC).
    timezone : str
        Name of the timezone (as understood by ``pytz.timezone``) into which
        the timestamp is converted before the features are extracted.
    map_2_hrs : dict
        Maps ``str(hour)`` to a 2-hour bucket id. The default maps hours
        0 and 1 to 0, hours 2 and 3 to 1, ..., hours 22 and 23 to 11.
        The default is built once at definition time and is only read here.

    Returns
    -------
    dict
        Keys ``hour_of_the_day``, ``day_of_the_week`` (``datetime.weekday()``),
        ``week_of_the_year`` (ISO week number), ``hr_id`` (2-hour bucket id)
        and ``day_id`` (the string ``"<hr_id>-<day_of_the_week>-<week_of_the_year>"``).

    Raises
    ------
    KeyError
        If ``str(hour)`` is missing from a caller-supplied ``map_2_hrs``.
    """
    # Build the timezone-aware UTC datetime in a single step.
    # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12 and returns
    # a naive datetime that then has to be localized by hand.
    dt = datetime.datetime.fromtimestamp(timestamp, tz=pytz.UTC)
    dt = dt.astimezone(pytz.timezone(timezone))

    hour_of_the_day = dt.hour
    day_of_the_week = dt.weekday()
    week_of_the_year = dt.isocalendar()[1]

    # Get the 2 hour group id
    # dt.hour = 0 and dt.hour = 1 gets mapped to hr_id = 0
    # dt.hour = 2 and dt.hour = 3 gets mapped to hr_id = 1
    # ...
    # dt.hour = 22 and dt.hour = 23 gets mapped to hr_id = 11
    hr_id = map_2_hrs[str(hour_of_the_day)]

    # Combination of 2-hour id, day of the week and week of the year, used as a
    # groupby key for a 2-hr window later on.
    # NOTE(review): the key does not contain the year, so it only identifies a
    # window uniquely when the data spans a single year.
    day_id = str(hr_id) + '-' + str(day_of_the_week) + '-' + str(week_of_the_year)

    return dict(hour_of_the_day=hour_of_the_day,
                day_of_the_week=day_of_the_week,
                week_of_the_year=week_of_the_year,
                day_id=day_id,
                hr_id=hr_id)

In [3]:
# Default parameter values are evaluated when the function definition is executed.
# This means that the expression is evaluated once, when the function is defined, 
# and that the same "pre-computed" value is used for each call.

def get_spl_frame_vector_old(spl_vector, spl_iterable=[4*k for k in range(20)],
                             frame_keys=['frame_'+ str(i) for i in range(20)]):
    """Average consecutive groups of 4 SPL values and label them per frame.

    For every start index in ``spl_iterable`` the 4 values
    ``spl_vector[start] .. spl_vector[start + 3]`` are averaged. With the
    defaults this turns 80 SPL values into 20 per-frame averages.

    Returns a dict mapping ``frame_keys[i]`` to the i-th average, plus
    ``max_frame`` and ``min_frame`` holding the largest and smallest average
    (kept so the min/max over a 2 hr period are easy to aggregate later).
    """
    averages = []
    for start in spl_iterable:
        window = [spl_vector[start + offset] for offset in range(4)]
        averages.append(0.25 * sum(window))

    labelled = {}
    for position, average in enumerate(averages):
        labelled[frame_keys[position]] = average

    labelled['max_frame'] = max(averages)
    labelled['min_frame'] = min(averages)
    return labelled

In [4]:
def rel_loudness_with_scaling(df, frame_keys=None):
    """Min-max scale each frame's SPL against its 2-hour window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``day_id`` column (the 2-hr window key), ``min_frame``
        and ``max_frame`` columns, and one column per entry of ``frame_keys``.
    frame_keys : list of str, optional
        Names of the per-frame SPL columns to scale. Defaults to
        ``'frame_0' .. 'frame_19'``. Previously this was read from an
        undefined global, which raised ``NameError`` on a fresh kernel.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged with the per-window extrema. The original
        ``min_frame``/``max_frame`` columns get the suffix ``_emb``, the
        per-window ones the suffix ``_2_hr``, and each frame key ``k`` gains a
        ``k + '_rel_loudness'`` column. A window whose min equals its max
        yields NaN/inf there, since the division is not guarded.
    """
    if frame_keys is None:
        frame_keys = ['frame_' + str(i) for i in range(20)]

    # Smallest min_frame and largest max_frame within every 2-hr window.
    res = df.groupby(['day_id']).agg({'min_frame': 'min', 'max_frame': 'max'}).reset_index()
    final = pd.merge(df, res, on='day_id', how='outer', suffixes=('_emb', '_2_hr'))

    # The window range is the same for every frame column, so compute it once.
    window_min = final['min_frame_2_hr']
    window_range = final['max_frame_2_hr'] - window_min

    # Get relative loudness of each frame using the per-window min and max
    for key in frame_keys:
        final[key + '_rel_loudness'] = (final[key] - window_min) / window_range

    return final

In [5]:
# As a string: spl_frames = ''.join(','.join(str(0.25*sum([spl_vector[i+k] for k in range(4)])) for i in spl_iterable))
def get_spl_frame_vector(spl_vector, spl_iterable=[4*k for k in range(20)]):
    """Average consecutive groups of 4 SPL values.

    For every start index in ``spl_iterable`` the 4 values beginning at that
    index are averaged. Returns ``{'spl_frames': [avg_0, avg_1, ...]}`` so the
    result can be turned into a single DataFrame column of lists.
    """
    averaged_frames = []
    for start in spl_iterable:
        window = [spl_vector[start + offset] for offset in range(4)]
        averaged_frames.append(0.25 * sum(window))
    return {'spl_frames': averaged_frames}

In [6]:
def test_prob_sum(d, elements, count):
    prob_sum = [d[i]/count.sum() for i in elements]
    print('Sum of probs: ', np.array(prob_sum).sum())

In [7]:
def get_spl_prob(row, decimal_place=2):
    """Empirical probability of each (rounded) SPL value within a row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['spl_frames']``, a 1-D sequence of SPL values.
    decimal_place : int
        Number of decimals the values are rounded to before counting.

    Returns
    -------
    list
        One entry per input value, in input order: the number of times that
        value's rounded form occurs divided by the total number of values.
    """
    spl_frames = np.around(np.array(row['spl_frames']), decimals=decimal_place)
    total_frames = len(spl_frames)

    # ``inverse`` maps every frame to the position of its value in the sorted
    # unique array, so ``counts[inverse]`` is the per-frame occurrence count.
    # This avoids a Python-level dict lookup per frame and, unlike a
    # float-keyed dict, does not raise KeyError when a frame is NaN.
    _, inverse, counts = np.unique(spl_frames, return_inverse=True, return_counts=True)

    return list(counts[inverse] / total_frames)

In [11]:
def pretty_print_test(df):
    """Print a quick sanity report for the merged result frame.

    Shows the first rows of ``df``, a separator, and the distinct values found
    in the ``prob_spl_2_hr`` entry of the first row.
    """
    print('Dataframe head:')
    preview = df.head()
    print(preview)
    print('---------------')
    print('Example of unique probability values in the probability distribution over 2 hr window:')
    first_row = df.iloc[0]
    distinct_probs = np.unique(first_row['prob_spl_2_hr'])
    print(distinct_probs)

In [9]:
def get_rel_loudness_probs(feats_path, indices_path):
    """Attach a per-2-hour-window SPL probability distribution to every recording.

    Parameters
    ----------
    feats_path : str
        Path to the HDF5 feature file; ``['openl3']['timestamp']`` and
        ``['openl3']['openl3']`` are read from it.
    indices_path : str
        Path to the HDF5 index file; ``['recording_index']['spl_vector']`` is
        read from it.

    Returns
    -------
    pandas.DataFrame
        One row per recording with the time features from ``get_time_values``,
        the recording's own averaged SPL frames (``spl_frames_emb_str``), the
        concatenated SPL frames of its whole 2-hr window
        (``spl_frames_2_hr_str``), the probability of each of those window
        frames (``prob_spl_2_hr``) and an ``index`` column left over from the
        ``reset_index()`` on the grouped frame.

    Raises
    ------
    AssertionError
        If the feature, timestamp and SPL arrays differ in their first dimension.
    """
    # Open both files read-only and close them deterministically; everything
    # that touches the HDF5-backed objects happens inside the ``with`` block so
    # nothing refers to a closed file afterwards.
    with h5py.File(indices_path, 'r') as indices, h5py.File(feats_path, 'r') as blob:
        # Get the timestamp from the feature file
        ts = blob['openl3']['timestamp']

        # Only used for the row-count sanity check below
        feats = blob['openl3']['openl3']

        # Get the spl_vector from the indices file
        spl_vecs = indices['recording_index']['spl_vector']

        assert feats.shape[0] == ts.shape[0] == spl_vecs.shape[0]

        # Get the spl avg value of 4 consecutive values from spl_vector
        # (one dict per row of spl_vecs)
        spl_arr = np.apply_along_axis(get_spl_frame_vector, 1, spl_vecs)

        # Apply get_time_values() to each element of the timestamp array
        dt_vectorize = np.vectorize(get_time_values)
        t_arr = dt_vectorize(ts)

    # Convert the dicts obtained above into dataframes and combine them to make aggregation easier
    t_df = pd.DataFrame(list(t_arr))
    spl_df = pd.DataFrame(list(spl_arr))
    df = pd.concat([t_df, spl_df], axis=1)

    # 'sum' over a column of lists concatenates them, giving every 2-hr window
    # the full list of its SPL frames. The trailing reset_index() adds an
    # 'index' column, kept here so the output columns stay unchanged.
    res = df.groupby(['day_id'], as_index=False).agg({'spl_frames': 'sum'}).reset_index()
    res['prob_spl_2_hr'] = res.apply(get_spl_prob, axis=1)
    final = pd.merge(df, res, on='day_id', how='outer', suffixes=('_emb_str', '_2_hr_str'))

    return final

In [12]:
# NOTE(review): absolute cluster paths are hard-coded; this cell only runs where
# /beegfs is mounted.
feats_path = '/beegfs/work/sonyc/features/openl3/2017/sonycnode-b827ebefb215.sonyc_features_openl3.h5'

# The index file shares the feature file's name, with 'features_openl3'
# swapped for 'recording_index', and lives in the indices directory.
indices_path = os.path.join(
    '/beegfs/work/sonyc/indices/2017/',
    os.path.basename(feats_path).replace('features_openl3', 'recording_index'),
)

final = get_rel_loudness_probs(feats_path, indices_path)
pretty_print_test(final)

Dataframe head:
   day_id  day_of_the_week  hour_of_the_day  hr_id  week_of_the_year  \
0  8-2-42                2               16      8                42   
1  8-2-42                2               16      8                42   
2  8-2-42                2               16      8                42   
3  8-2-42                2               16      8                42   
4  8-2-42                2               16      8                42   

                                  spl_frames_emb_str  index  \
0  [62.797499656677246, 65.18499946594238, 62.577...    720   
1  [65.99249839782715, 67.80750274658203, 68.3674...    720   
2  [62.71750068664551, 62.96500015258789, 63.3200...    720   
3  [62.14999961853027, 66.05249977111816, 62.5624...    720   
4  [64.3400011062622, 64.84749984741211, 65.59249...    720   

                                 spl_frames_2_hr_str  \
0  [62.797499656677246, 65.18499946594238, 62.577...   
1  [62.797499656677246, 65.18499946594238, 62.577...   
2  [