In [None]:
import glob
import json
import math
import os
import re
import warnings
from pathlib import Path

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Audio

In [None]:
def play_audio(signal, rate):
    """Wrap raw samples in an IPython ``Audio`` widget.

    Parameters
    ----------
    signal
        Audio samples, forwarded unchanged as ``Audio(data=...)``.
    rate
        Sampling rate, forwarded unchanged as ``Audio(rate=...)``.

    Returns
    -------
    The ``Audio`` object; when it is the last expression of a cell the
    notebook renders it as a player.
    """
    player = Audio(data=signal, rate=rate)
    return player

In [None]:
# Root folder that is searched recursively for recordings (*.wav) and
# transcripts (*.json). Change this one constant to point at another copy
# of the data set.
DATA_DIR = Path('C:/Users/yagne/Downloads/wavs/wavs/')

# rglob yields entries in filesystem order, which is not guaranteed to be
# stable; sorting keeps wavs[0] / jsons[0] the same file on every run.
wavs = sorted(DATA_DIR.rglob('*.wav'))
jsons = sorted(DATA_DIR.rglob('*.json'))

In [None]:
# Load the first recording as a sample array plus its sampling rate.
# sr=None tells librosa to keep the file's native sampling rate, so there is
# no need to probe the file a second time with get_samplerate().
wav_file, wav_sr = librosa.load(wavs[0], sr=None)

In [None]:
# Read the transcript that belongs to wavs[0]: same path with a .json suffix
# (the same pairing rule the batch loop uses). Picking jsons[0] instead is only
# correct when both listings happen to line up one-to-one.
df = pd.read_json(wavs[0].with_suffix('.json'))
df['file'] = wavs[0].name
# Segment duration in the transcript's time unit.
df['length'] = df['end'] - df['start']
# Convert segment boundaries to sample offsets into wav_file; floor/ceil so the
# slice never cuts inside the annotated span.
df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
# Rough word count: number of single-space-separated tokens.
df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))

In [None]:
# Keep only segments of at least 40 words whose speaker role is a justice.
data = (
    df[df['word_count'].ge(40) & df['speaker_role'].eq('scotus_justice')]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Quick sanity summary: size of the loaded audio, number of transcript rows,
# and how many rows survived the word-count / speaker-role filter.
print(f'{wav_file.shape[0]:,} samples with sampling rate {wav_sr}')
print(f'{df.shape[0]} transcript lines')
print(f"{data.shape[0]} viable segments")

In [None]:
# Render a player for the whole loaded recording (not just one segment).
play_audio(wav_file, wav_sr)

In [None]:
# Peek at up to four viable segments. Capping n avoids a ValueError when fewer
# than four rows passed the filter, and the fixed seed makes the displayed rows
# reproducible across runs.
data.sample(n=min(4, len(data)), random_state=0)

In [None]:
# Take the first viable segment as a plain dict (column name -> value) to
# prototype the onset features on a single example.
row = dict(data.iloc[0])

In [None]:
# Cut the example segment out of the already-loaded recording.
start = row['start_idx']
end = row['end_idx']
clip = wav_file[start:end]

# The audio must be passed as `y=`: onset_strength takes keyword-only arguments
# in librosa >= 0.10, and the keyword form also works on older releases.
o_env = librosa.onset.onset_strength(y=clip, sr=wav_sr, max_size=5)
times = librosa.times_like(o_env, sr=wav_sr)
# units="time" makes onset_detect return onset positions as times rather than
# frame indices, despite the variable name.
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=wav_sr, units="time")
# Frame-to-frame change of the onset envelope.
onset_dt = librosa.feature.delta(o_env)
# Gaps between consecutive detected onsets.
onset_diff = np.diff(onset_frames)

In [None]:
# Gaps between consecutive detected onsets in the example clip. onset_frames
# holds onset times because onset_detect was called with units="time".
fig, ax = plt.subplots()
ax.plot(np.diff(onset_frames))
ax.set(
    title='Inter-onset intervals (example segment)',
    xlabel='Onset index',
    ylabel='Time to next onset (s)',
);

In [None]:
# Display the example segment's fields.
row

In [None]:
def get_onset_values(audio, sr, row):
    """Compute onset-based features for one transcript segment.

    The segment is sliced out of the already-loaded recording rather than
    re-read from disk. Re-loading with
    ``librosa.load(path, sr=..., offset=row['start'], duration=row['length'])``
    is the more standard route but reopens the file for every segment; slicing
    is faster, with a possible (unquantified) data-quality risk.

    Parameters
    ----------
    audio : array-like
        Samples of the full recording.
    sr : number
        Sampling rate of ``audio``.
    row : mapping (e.g. a pandas Series or a dict)
        Must provide ``start_idx`` and ``end_idx`` (sample offsets into
        ``audio``) and ``length`` (the denominator of the onset rate;
        presumably the segment duration in seconds).

    Returns
    -------
    The same ``row`` object, updated in place with ``onset_count``,
    ``onset_rate``, ``onset_strength_mean``, ``onset_strength_seg_var``,
    ``onset_strength_seg_std``, ``onset_delta_abs_mean`` and
    ``onset_delta_var``.

    Notes
    -----
    If feature extraction fails, a ``RuntimeWarning`` is emitted and every
    feature is set to 0, so one bad segment does not abort a batch run.
    Only ``Exception`` is caught, so the kernel can still be interrupted.
    """
    feature_names = (
        'onset_count',
        'onset_rate',
        'onset_strength_mean',
        'onset_strength_seg_var',
        'onset_strength_seg_std',
        'onset_delta_abs_mean',
        'onset_delta_var',
    )

    start = row['start_idx']
    end = row['end_idx']
    clip = audio[start:end]

    try:
        # `y` has to be passed by keyword: onset_strength is keyword-only in
        # librosa >= 0.10. A positional call raises TypeError there, which the
        # previous bare `except:` silently turned into all-zero features.
        o_env = librosa.onset.onset_strength(y=clip, sr=sr, max_size=5)
        onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
        onset_dt = librosa.feature.delta(o_env)

        onset_count = onset_frames.shape[0]
        features = {
            'onset_count': onset_count,
            'onset_rate': onset_count / row['length'],
            'onset_strength_mean': np.mean(o_env),
            'onset_strength_seg_var': np.var(o_env),
            'onset_strength_seg_std': np.std(o_env),
            'onset_delta_abs_mean': np.mean(np.abs(onset_dt)),
            'onset_delta_var': np.var(onset_dt),
        }
    except Exception as exc:
        # Best effort: keep the row, but make the failure visible.
        warnings.warn(
            f"onset features could not be computed for samples {start}:{end} "
            f"({exc!r}); filling with zeros",
            RuntimeWarning,
        )
        features = dict.fromkeys(feature_names, 0)

    for name in feature_names:
        row[name] = features[name]
    return row

In [None]:
# Add the onset features to every viable segment of the loaded recording.
data = data.apply(
    lambda segment: get_onset_values(audio=wav_file, sr=wav_sr, row=segment),
    axis='columns',
)

In [None]:
# Per-speaker mean / standard deviation of the onset rate and mean onset
# strength within this recording.
# Could change to median absolute deviation to make no assumptions on distribution (n = 3 for some judges)
# Named aggregation yields flat, already-renamed columns, so no row-by-row
# flattening of MultiIndex columns is needed.
recording_stats = (
    data.groupby('speaker')
    .agg(
        rec_onset_rate_mean=('onset_rate', 'mean'),
        rec_onset_rate_std=('onset_rate', 'std'),
        rec_onset_str_mean=('onset_strength_mean', 'mean'),
        rec_onset_str_std=('onset_strength_mean', 'std'),
    )
    .reset_index()
)

# Drop stats columns left behind by an earlier run of this cell; otherwise a
# re-run would merge them again and produce `_x` / `_y` suffixed duplicates.
stat_columns = recording_stats.columns.drop('speaker')
data = data.drop(columns=stat_columns, errors='ignore').merge(recording_stats, on="speaker")

In [None]:
# List the columns now present after merging the per-speaker statistics.
data.columns

In [None]:
# Standardise each segment against its speaker's statistics in this recording:
# (value - speaker mean) / speaker std.
data['rec_onset_rate_dev'] = (
    data['onset_rate']
    .sub(data['rec_onset_rate_mean'])
    .div(data['rec_onset_rate_std'])
)
data['rec_onset_str_dev'] = (
    data['onset_strength_mean']
    .sub(data['rec_onset_str_mean'])
    .div(data['rec_onset_str_std'])
)

In [None]:
# Build the onset-feature table for every recording: one DataFrame per file,
# collected in all_data.
all_data = []

for w in wavs:
    # sr=None keeps the file's native sampling rate without probing the file
    # a second time.
    wav_file, wav_sr = librosa.load(w, sr=None)
    # The transcript sits next to the audio under the same name with a .json
    # suffix. with_suffix only swaps the extension, so it also copes with
    # upper-case ".WAV" and with names that contain ".wav" elsewhere.
    df = pd.read_json(w.with_suffix(".json"))
    df['file'] = w.name
    df['length'] = df['end'] - df['start']
    # Segment boundaries as sample offsets into wav_file.
    df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
    df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))

    # Same viability filter as in the single-file exploration.
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data.apply(lambda x: get_onset_values(wav_file, wav_sr, x), axis=1)

    # Per-recording speaker normalisation (the rec_onset_*_mean / _std / _dev
    # columns computed in the single-file cells) is not applied in this loop.

    all_data.append(data)

In [None]:
# Stack the per-file tables. Each of them was re-indexed from 0, so without
# ignore_index=True the combined frame would carry duplicate index labels.
all_data_df = pd.concat(all_data, ignore_index=True)

In [None]:
# Persist the combined feature table to "data.pq" (relative to the current
# working directory).
all_data_df.to_parquet("data.pq")

In [None]:
# Read the file back to verify that the table round-trips through parquet.
check = pd.read_parquet("data.pq")

In [None]:
# Diagnostic: rows where feature extraction fell back to zeros.
# check.loc[check['onset_delta_var'] == 0]
# Peek at up to four rows. Capping n avoids a ValueError on a table with fewer
# than four rows, and the fixed seed keeps the displayed rows reproducible.
check.sample(n=min(4, len(check)), random_state=0)

In [None]:
# Number of onsets
# Onset Rate
# Time difference between onsets (np.diff) --> mean, variance

# For across-audio file segments --> stick within one year of case dates
# Project down to 32 mels instead of librosa's default of 128 mels