# Process
1. Set arguments
2. Read data
3. Filtering
4. Feature Extraction
5. BERT

In [1]:
import librosa
import numpy as np
import pandas as pd
import os
from pathlib import Path

## Arguments

In [2]:
min_words = 40

## Data

In [24]:
wavs = [path for path in Path('./wavs/').rglob('*.wav')]
jsons = [path for path in Path('./wavs/').rglob('*.json')]

In [25]:
wavs[0], jsons[0]

(WindowsPath('wavs/142-orig.wav'), WindowsPath('wavs/142-orig.json'))

In [26]:
len(wavs), len(jsons)

(56, 57)

In [27]:
wav_file, wav_sr = librosa.load(wavs[0], sr=librosa.core.get_samplerate(wavs[0]))

df = pd.read_json(jsons[0])
df['file'] = wavs[0].name
df['line'] = df.index
df['duration'] = df['end'] - df['start']
df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))

## Filtering
- Remove segments that do not belong to judges
- Remove segments less than 40 words

In [28]:
data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
print(f'{df.shape[0]} transcript lines')
print(f"{data.shape[0]} viable segments")

150 transcript lines
37 viable segments


## Score (BERT) Preparation

In [11]:
%%time
all_data = []
for i,w in enumerate(wavs):
    if i % 10 == 0:
        print(i)
    df = pd.read_json(os.path.join(w.parent,w.name.replace(".wav",".json")))
    df['file'] = w.name
    df['line'] = df.index
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data[['file','line','speaker','start','end','text']]

    all_data.append(data)
    
bert_metadata = pd.concat(all_data)
print("\n Record Info:")
print(bert_metadata.shape)

0
10
20
30
40
50

 Record Info:
(2545, 6)
Wall time: 486 ms


In [12]:
bert_metadata.to_json('bert_metadata.json', orient='records')
bert_metadata.sample(n=4)

Unnamed: 0,file,line,speaker,start,end,text
32,19-508.wav,135,Amy_Coney_Barrett,3536.345,3565.09,"Counsel, the -- the damages award here or the ..."
61,19-968.wav,300,Amy_Coney_Barrett,5183.56,5237.2,So what is the effect of your argument on the ...
18,19-422.wav,77,Neil_Gorsuch,1436.0,1470.24,"-- I -- I understand that point, but Congress ..."
48,20-107.wav,244,Amy_Coney_Barrett,3867.26,3897.38,"But, General, my -- Penn Central is deliberate..."


## Features
- Pitch
- Onset

### Pitch

In [42]:
def get_features(audio, sr, row):
    start = row['start_idx']
    end = row['end_idx']   
    clip = audio[start:end]
    
    try:
        #f0, _, _ = librosa.pyin(clip, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
        f0, _, _ = librosa.pyin(clip, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C5'))

        f0_log = np.log2(f0)
        row['pitch_log_diff_variance'] = np.nanvar(np.diff(f0_log))
        row['pitch_log_mean'] = np.nanmean(f0_log)
        row['pitch_log_stdev'] = np.nanstd(f0_log)
        row['pitch_2pct'], row['pitch_25pct'], row['pitch_50pct'], row['pitch_75pct'], row['pitch_98pct'] = np.nanquantile(f0_log, [.02, .25, .5, .75, .98])
    except Exception as e:
        print(f"Pitch: {row['file']} {row['line']}, {e}")
        row['pitch_log_diff_variance'] = np.nan
        row['pitch_log_mean'] = np.nan
        row['pitch_log_stdev'] = np.nan
        row['pitch_2pct'], row['pitch_25pct'], row['pitch_50pct'], row['pitch_75pct'], row['pitch_98pct'] = np.nan, np.nan, np.nan, np.nan, np.nan
        
    try:
        o_env = librosa.onset.onset_strength(clip, sr=sr, max_size=5)
        # times = librosa.times_like(o_env, sr=sr)
        onset_times = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr, units="time")
        # onset_dt = librosa.feature.delta(o_env)

        onset_count = onset_times.shape[0]
        onset_rate = onset_count/row['duration']
        row['onset_count'] = onset_count
        row['onset_rate'] = onset_rate

        onset_histogram = np.histogram(o_env, bins=50, density=True)
        onset_data = onset_histogram[0]
        entropy = -(onset_data*np.log1p(np.abs(onset_data))).sum()
        row['onset_power_entropy'] = entropy

        onset_times_diff = np.diff(onset_times)
        onset_time_diff_mean = np.mean(onset_times_diff)
        onset_time_diff_var = np.var(onset_times_diff)
        row['onset_time_diff_mean'] = onset_time_diff_mean
        row['onset_time_diff_var'] = onset_time_diff_var

        
        onset_time_histogram = np.histogram(onset_times_diff, bins=50, density=True)
        onset_time_data = onset_time_histogram[0]
        entropy = -(onset_time_data*np.log1p(np.abs(onset_time_data))).sum()
        row['onset_time_entropy'] = entropy



    except Exception as e:
        print(f"Onset: {row['file']} {row['line']}, {e}")
        row['onset_count'] = np.nan
        row['onset_rate'] = np.nan
        
        row['onset_time_diff_mean'] = np.nan
        row['onset_time_diff_var'] = np.nan

        row['onset_power_entropy'] = np.nan
        row['onset_time_entropy'] = np.nan

    return row

In [43]:
row = dict(data.iloc[1])
get_features(wav_file, wav_sr, row)

{'start': 227.24,
 'end': 242.92,
 'speaker': 'John_G_Roberts_Jr',
 'speaker_role': 'scotus_justice',
 'text': 'Well, the -- the Special Master concluded that Georgia -- that Georgia would be required to allow huge amounts of water to flow into the bay to really allow recovery of the oyster fishery and that that would not be -- be equitable. What is your response to that?',
 'file': '142-orig.wav',
 'line': 4,
 'duration': 15.679999999999978,
 'start_idx': 3635840,
 'end_idx': 3886720,
 'word_count': 49,
 'word_rate': 3.1250000000000044,
 'onset_count': 62,
 'onset_rate': 3.954081632653067,
 'onset_power_entropy': -2.296225170870145,
 'onset_time_diff_mean': 0.24760655737704917,
 'onset_time_diff_var': 0.04646279602257457,
 'onset_time_entropy': -69.41020848678615}

In [45]:
%%time
all_data = []
for i,w in enumerate(wavs):
    if i % 10 == 0:
        print(i)
    wav_file, wav_sr = librosa.load(w, sr=librosa.core.get_samplerate(w), )
    df = pd.read_json(os.path.join(w.parent,w.name.replace(".wav",".json")))
    df['file'] = w.name
    df['line'] = df.index
    df['duration'] = df['end'] - df['start']
    df['start_idx'] = np.floor(df['start']*wav_sr).astype(int)
    df['end_idx'] = np.ceil(df['end']*wav_sr).astype(int)
    df['word_count'] = df['text'].apply(lambda x: len(x.split(" ")))
    df['word_rate'] = df['word_count']/df['duration']
    
    data = df.loc[(df['word_count'] >= 40) & (df['speaker_role'] == 'scotus_justice')].copy().reset_index(drop=True)
    data = data.apply(lambda x: get_features(wav_file, wav_sr, x), axis=1)
    all_data.append(data)
all_data_df = pd.concat(all_data)
print("\n Record Info:")
print(all_data_df.shape)

0




Onset: 19-123.wav 105, can't extend empty axis 0 using modes other than 'constant' or 'empty'
10
20
30
40
Onset: 20-18.wav 124, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 126, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 133, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 135, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 137, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 141, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 145, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 149, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 151, can't extend empty axis 0 using modes other than 'constant' or 'empty'
Onset: 20-18.wav 161, can't extend empty axis 0 using mod

In [46]:
all_data_df.to_json('all_data.json', orient="records")

# Join with BERT

In [None]:
bert_info = pd.read_csv("/path/to/bert")
full_data = all_data_df.merge(bert_info[['file','line','BERT-GS_Scores']], on=['file','line'])

In [None]:
full_data.to_csv('full_data.csv', index=False)

# Get the speaker-level info

In [67]:
import pandas as pd

# import the full data
data = pd.read_csv('full_data.csv')

# features
features = ['duration', 'onset_count', 'onset_rate', 'onset_time_diff_mean', 'onset_time_diff_var',
       'pitch_25pct', 'pitch_2pct', 'pitch_50pct', 'pitch_75pct',
       'pitch_98pct', 'pitch_log_diff_variance', 'pitch_log_mean',
       'pitch_log_stdev', 'speaker', 'word_count', 'word_rate', 'BERT-GS_Scores']

# aggregate by speaker
justice_level = data[features].groupby(['speaker']).agg(['mean', 'std'])

# fix the multi-index from aggregating
justice_level.columns = ["_".join(x) for x in justice_level.columns.ravel()]
full_data_and_speaker_level = data.merge(justice_level, how = 'left', on = 'speaker')

# out csv
full_data_and_speaker_level.to_csv("full_data_and_speaker_level.csv")

  justice_level.columns = ["_".join(x) for x in justice_level.columns.ravel()]
