In [21]:
import librosa
import numpy as np
import math
import pandas as pd
import re
from IPython.display import Audio
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from pathlib import Path
import json

In [36]:
import pyarrow

In [2]:
def play_audio(signal, rate):
    return Audio(data=signal, rate=rate)

In [22]:
wavs = [path for path in Path('wavs/').rglob('*.wav')]
jsons = [path for path in Path('wavs/').rglob('*.json')]

In [23]:
wav_file, wav_sr = librosa.load(wavs[0], sr=librosa.core.get_samplerate(wavs[0]))

In [24]:
df = pd.read_json(jsons[0])
df['file'] = wavs[0].name
df['length'] = df['end'] - df['start']
df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))

In [25]:
data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)

In [26]:
print(f'{wav_file.shape[0]:,} samples with sampling rate {wav_sr}')
print(f'{df.shape[0]} transcript lines')
print(f"{data.shape[0]} viable segments")

63,186,817 samples with sampling rate 16000
150 transcript lines
37 viable segments


In [27]:
# play_audio(wav_file, wav_sr)

In [28]:
data.sample(n=4)

Unnamed: 0,start,end,speaker,speaker_role,text,file,length,start_idx,end_idx,word_count
14,1391.56,1419.96,Neil_Gorsuch,scotus_justice,Okay. Judge Kelly found that the decree would ...,142-orig.wav,28.4,22264960,22719360,66
4,510.88,588.4,Stephen_G_Breyer,scotus_justice,"Well, the part I don't understand, I mean, you...",142-orig.wav,77.52,8174080,9414400,191
13,1272.28,1286.96,Sonia_Sotomayor,scotus_justice,And did you ever quantify exactly how much wat...,142-orig.wav,14.68,20356480,20591360,45
3,431.92,465.4,Clarence_Thomas,scotus_justice,"Well, that's -- you know, that's interesting b...",142-orig.wav,33.48,6910720,7446400,83


In [47]:
def get_onset_values(audio, sr, row):
    start = row['start_idx']
    end = row['end_idx']   
    # More standard but slower because it has to load in file
    # Each time. Either speed or assume quality risk.
#     clip, sr = librosa.load(wavs[0],
#                             sr=librosa.core.get_samplerate(wavs[0]),
#                             offset=row['start'],
#                             duration=row['length']
#                            )
    # Faster but MIGHT (don't know how likely) cause data quality issues
    clip = audio[start:end]


    try:
        o_env = librosa.onset.onset_strength(clip, sr=sr, max_size=5)
        times = librosa.times_like(o_env, sr=sr)
        onset_times = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr, units="time")
        onset_dt = librosa.feature.delta(o_env)

        row['onset_count'] = onset_times.shape[0]
        row['onset_rate'] = row['onset_count']/row['length']
        
        onset_times_diff = np.diff(onset_times)
        row['onset_time_diff_mean'] = np.mean(onset_times_diff)
        row['onset_time_diff_var'] = np.var(onset_times_diff)
        
        row['onset_strength_mean'] = np.mean(o_env)
        row['onset_strength_seg_var'] = np.var(o_env)
        row['onset_strength_seg_std'] = np.std(o_env)
        row['onset_delta_abs_mean'] = np.mean(np.abs(onset_dt))
        row['onset_delta_var'] = np.var(onset_dt)

    except Exception as e:
        print(f"{row['file']} {row['start']}:{row['end']}, {e}")
        row['onset_count'] = 0
        row['onset_rate'] = 0
        row['onset_time_diff_mean'] = 0
        row['onset_time_diff_var'] = 0
        row['onset_strength_mean'] = 0
        row['onset_strength_seg_var'] = 0
        row['onset_strength_seg_std'] = 0
        row['onset_delta_abs_mean'] = 0
        row['onset_delta_var'] = 0

        
    return row

In [48]:
data = data.apply(lambda x: get_onset_values(wav_file, wav_sr, x), axis=1)

In [50]:
data.sample(n=2)

Unnamed: 0,start,end,speaker,speaker_role,text,file,length,start_idx,end_idx,word_count,onset_count,onset_rate,onset_time_diff_mean,onset_time_diff_var,onset_strength_mean,onset_strength_seg_var,onset_strength_seg_std,onset_delta_abs_mean,onset_delta_var
11,1366.59,1386.91,Elena_Kagan,scotus_justice,And when you say that all the e-mails are irre...,65-orig.wav,20.32,21865440,22190560,46,51,2.509843,0.37568,0.095388,0.577563,1.762889,1.327738,0.130294,0.052979
9,1089.01,1112.15,Sonia_Sotomayor,scotus_justice,"Counsel, that's the whole point, which is: So ...",65-orig.wav,23.14,17424160,17794400,47,71,3.06828,0.305829,0.083343,0.631623,2.024734,1.422931,0.134448,0.062116


In [51]:
%%time
all_data = []

for w in wavs:
    wav_file, wav_sr = librosa.load(w, sr=librosa.core.get_samplerate(w), )
    df = pd.read_json(os.path.join(w.parent,w.name.replace(".wav",".json")))
    df['file'] = w.name
    df['length'] = df['end'] - df['start']
    df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
    df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))
    
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data.apply(lambda x: get_onset_values(wav_file, wav_sr, x), axis=1)
    
    all_data.append(data)
    
all_data_df = pd.concat(all_data)
print("\n Record Info:")
print(all_data_df.shape)
all_data_df.to_parquet("data.pq", engine="fastparquet")



19-123.wav-1886.0:1885.0, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2239.4:2304.8, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2328.24:2375.52, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2442.04:2455.44, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2477.6:2506.76, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2534.56:2556.04, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2587.28:2712.64, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2758.6:2818.6, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2871.24:2933.08, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-2970.04:2995.28, can't extend empty axis 0 using modes other than 'constant' or 'empty'
20-18.wav-3062.84:3086.68,

In [54]:
check = pd.read_parquet("data.pq", engine="fastparquet")
check.sample(n=2)

Unnamed: 0_level_0,start,end,speaker,speaker_role,text,file,length,start_idx,end_idx,word_count,onset_count,onset_rate,onset_time_diff_mean,onset_time_diff_var,onset_strength_mean,onset_strength_seg_var,onset_strength_seg_std,onset_delta_abs_mean,onset_delta_var
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
21,1988.32,2016.92,John_G_Roberts_Jr,scotus_justice,"Counsel, let's say there's a -- a retired guy ...",19-368.wav,28.6,31813120,32270720,90,84,2.937063,0.338892,0.103339,0.63822,1.55829,1.248315,0.126122,0.042974
0,132.975,165.95,John_G_Roberts_Jr,scotus_justice,"Mr. Fisher, in Musacchio versus United States,...",19-783.wav,32.975,2127600,2655200,75,102,3.093252,0.317465,0.093451,0.876915,2.018828,1.420855,0.139838,0.050157


In [53]:
print("Bad Records:",check.loc[check['onset_count'] == 0].shape[0])

Bad Records: 52


In [None]:
# Could change to median absolute deviation to make no assumptions on distribution (n = 3 for some judges)
#     recording_stats = data.groupby('speaker').agg({"onset_rate": ["mean","std"], "onset_strength_mean": ["mean","std"]})
#     exploded_recording_stats = []
#     for r in recording_stats.iterrows():
#         info = {
#             "speaker": r[0],
#             "rec_onset_rate_mean": r[1]['onset_rate']['mean'],
#             "rec_onset_rate_std": r[1]['onset_rate']['std'],
#             "rec_onset_str_mean": r[1]['onset_strength_mean']['mean'],
#             "rec_onset_str_std": r[1]['onset_strength_mean']['std'],
#         }
#         exploded_recording_stats.append(info)
#     recording_stats = pd.DataFrame(exploded_recording_stats)

#     data = data.merge(recording_stats, on="speaker")
#     data['rec_onset_rate_dev'] = (data['onset_rate'] - data['rec_onset_rate_mean'])/data['rec_onset_rate_std']
#     data['rec_onset_str_dev'] = (data['onset_strength_mean'] - data['rec_onset_str_mean'])/data['rec_onset_str_std']

In [None]:
# data['rec_onset_rate_dev'] = (data['onset_rate'] - data['rec_onset_rate_mean'])/data['rec_onset_rate_std']
# data['rec_onset_str_dev'] = (data['onset_strength_mean'] - data['rec_onset_str_mean'])/data['rec_onset_str_std']

In [None]:
# Number of onsets
# Onset Rate
# Time difference between onsets (np.diff) --> mean, variance

# For across-audio file segments --> stick within one year of case dates
# Project down to 32 mels instead of default 144 mels