# Process
1. Set arguments
2. Read data
3. Filtering
4. Feature Extraction
5. BERT

In [1]:
import librosa
import numpy as np
import pandas as pd
import os
from pathlib import Path

## Arguments

In [2]:
min_words = 40

## Data

In [3]:
wavs = [path for path in Path('../../wavs/').rglob('*.wav')]
jsons = [path for path in Path('../../wavs/').rglob('*.json')]

In [4]:
wavs[0], jsons[0]

(WindowsPath('../../wavs/142-orig.wav'),
 WindowsPath('../../wavs/142-orig.json'))

In [5]:
len(wavs), len(jsons)

(56, 57)

In [6]:
wav_file, wav_sr = librosa.load(wavs[0], sr=librosa.core.get_samplerate(wavs[0]))

df = pd.read_json(jsons[0])
df['file'] = wavs[0].name
df['line'] = df.index
df['duration'] = df['end'] - df['start']
df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))

## Filtering
- Remove segments that do not belong to judges
- Remove segments less than 40 words

In [7]:
data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
print(f'{df.shape[0]} transcript lines')
print(f"{data.shape[0]} viable segments")

150 transcript lines
37 viable segments


## Score (BERT) Preparation

In [19]:
%%time
all_data = []
for i,w in enumerate(wavs):
    if i % 10 == 0:
        print(i)
    df = pd.read_json(os.path.join(w.parent,w.name.replace(".wav",".json")))
    df['file'] = w.name
    df['line'] = df.index
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data[['file','line','speaker','start','end','text']]

    all_data.append(data)
    
bert_metadata = pd.concat(all_data)
print("\n Record Info:")
print(bert_metadata.shape)

0
10
20
30
40
50

 Record Info:
(2545, 6)
Wall time: 328 ms


In [21]:
bert_metadata.to_json('bert_metadata.json', orient='records')
bert_metadata.sample(n=4)

Unnamed: 0,file,line,speaker,start,end,text
15,20-297.wav,87,Brett_M_Kavanaugh,1627.99,1645.185,To pick up on Justice Alito and also Justice G...
51,20-334.wav,206,Elena_Kagan,3339.54,3382.06,"Okay. So -- but then, if you turn to 1924, 192..."
11,19-8709.wav,56,Neil_Gorsuch,1369.4,1393.01,"Counsel, good morning. I -- I'd like to unders..."
12,19-357.wav,115,Sonia_Sotomayor,1957.92,1979.4,-- just the refusal to turn over something tha...


## Features
- Pitch
- Onset

### Pitch

In [None]:
def get_features(audio, sr, row):
    start = row['start_idx']
    end = row['end_idx']   
    clip = audio[start:end]
    
    try:
        f0, _, _ = librosa.pyin(clip, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))

        f0_log = np.log(f0)
        row['pitch_log_diff_variance'] = np.nanvar(np.diff(f0_log))
        row['pitch_log_mean'] = np.nanmean(f0_log)
        row['pitch_log_stdev'] = np.nanstd(f0_log)
        row['pitch_2pct'], row['pitch_25pct'], row['pitch_50pct'], row['pitch_75pct'], row['pitch_98pct'] = np.nanquantile(f0, [.02, .25, .5, .75, .98])
    except Exception as e:
        print(f"Pitch: {row['file']} {row['line']}, {e}")
        row['pitch_log_diff_variance'] = np.nan
        row['pitch_log_mean'] = np.nan
        row['pitch_log_stdev'] = np.nan
        row['pitch_2pct'], row['pitch_25pct'], row['pitch_50pct'], row['pitch_75pct'], row['pitch_98pct'] = np.nan, np.nan, np.nan, np.nan, np.nan
        
    try:
        o_env = librosa.onset.onset_strength(clip, sr=sr, max_size=5)
        times = librosa.times_like(o_env, sr=sr)
        onset_times = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr, units="time")
        onset_dt = librosa.feature.delta(o_env)

        onset_count = onset_times.shape[0]
        onset_rate = onset_count/row['duration']
        row['onset_count'] = onset_count
        row['onset_rate'] = onset_rate

        onset_times_diff = np.diff(onset_times)
        onset_time_diff_mean = np.mean(onset_times_diff)
        onset_time_diff_var = np.var(onset_times_diff)
        row['onset_time_diff_mean'] = onset_time_diff_mean
        row['onset_time_diff_var'] = onset_time_diff_var
    except Exception as e:
        print(f"Onset: {row['file']} {row['line']}, {e}")
        row['onset_count'] = np.nan
        row['onset_rate'] = np.nan
        row['onset_time_diff_mean'] = np.nan
        row['onset_time_diff_var'] = np.nan
        
    return row

In [None]:
%%time
all_data = []
for i,w in enumerate(wavs):
    if i % 10 == 0:
        print(i)
    wav_file, wav_sr = librosa.load(w, sr=librosa.core.get_samplerate(w), )
    df = pd.read_json(os.path.join(w.parent,w.name.replace(".wav",".json")))
    df['file'] = w.name
    df['line'] = df.index
    df['duration'] = df['end'] - df['start']
    df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
    df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))
    df['word_rate'] = df['word_count']/df['duration']
    
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data.apply(lambda x: get_features(wav_file, wav_sr, x), axis=1)
    all_data.append(data)
    
all_data_df = pd.concat(all_data)
print("\n Record Info:")
print(all_data_df.shape)

In [None]:
all_data_df.to_json('all_data.json', orient="records")

# Join with BERT

In [None]:
full_data = all_data_df.merge(bert_info[['file','line','BERT-GS_Scores']], on=['file','line'])

In [None]:
full_data.to_csv('full_data.csv', index=False)