Skip to content

Commit

Permalink
More evals
Browse files Browse the repository at this point in the history
  • Loading branch information
jrgillick committed Mar 16, 2020
1 parent e6cabe9 commit 534fe34
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 84 deletions.
117 changes: 73 additions & 44 deletions scripts/Evaluation/analyze_results.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,84 @@
import pandas as pd, numpy as np, os, sys
from tqdm import tqdm

def resample(data, indices):
    """Return the items of ``data`` selected by ``indices``, in the given order.

    Every entry of ``indices`` is used as a subscript (``data[i]``), so an
    index that appears several times contributes its item several times.
    The result is always a new list.
    """
    return [data[position] for position in indices]

# Takes four sequences of sampled metric values (accuracy, precision, recall, F1)
# and returns one labelled percentile summary string per metric.
def get_confidence_intervals(accs, precs, recs, f1s):
    """Summarise four metric samples as labelled percentile strings.

    For each sequence the 2.5th, 50th and 97.5th percentiles are computed
    with ``np.percentile`` (the bounds of a 95% interval plus the median) and
    rendered with ``%f``, separated by single spaces.

    Returns:
        A 4-tuple of strings, in the order accuracy, precision, recall, F1,
        each of the form ``"<Label>: <low> <median> <high>"``.
    """
    cut_points = [2.5, 50, 97.5]

    def describe(label, values):
        rendered = ["%f" % (point) for point in np.percentile(values, cut_points)]
        return "%s: %s" % (label, ' '.join(rendered))

    return (
        describe("Accuracy", accs),
        describe("Precision", precs),
        describe("Recall", recs),
        describe("F1", f1s),
    )

def get_metrics(tp_times,fp_times,tn_times,fn_times):
    """Compute time-weighted accuracy, precision, recall and F1.

    Each argument is an iterable of per-item durations (true-positive,
    false-positive, true-negative and false-negative time). The durations
    are summed per category and the metrics are derived from the totals.

    A ratio whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as 0.0 instead of raising
    ``ZeroDivisionError`` or producing NaN, so that a degenerate bootstrap
    resample cannot abort the run or turn the percentile summary into NaN.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is scored as 0.0 (same convention for all metrics).
        return numerator / denominator if denominator else 0.0

    tp_time = sum(tp_times)
    fp_time = sum(fp_times)
    tn_time = sum(tn_times)
    fn_time = sum(fn_times)

    accuracy = _ratio(tp_time + tn_time, tp_time + fp_time + tn_time + fn_time)
    precision = _ratio(tp_time, tp_time + fp_time)
    recall = _ratio(tp_time, tp_time + fn_time)
    f1 = _ratio(2 * (precision * recall), precision + recall)
    return accuracy, precision, recall, f1

def bootstrap_metrics(tp_times,fp_times,tn_times,fn_times,n_samples=1000):
    """Bootstrap percentile summaries for accuracy, precision, recall and F1.

    On each of ``n_samples`` rounds, ``len(tp_times)`` row positions are drawn
    with replacement via ``np.random.choice``; the same positions are applied
    to all four inputs so each drawn row keeps its tp/fp/tn/fn values
    together. ``get_metrics`` is evaluated on every resample and the
    collected values are passed to ``get_confidence_intervals``.

    Returns:
        Whatever ``get_confidence_intervals`` returns for the collected
        accuracies, precisions, recalls and F1 scores.
    """
    n_rows = len(tp_times)
    row_positions = list(range(0, n_rows))
    columns = (tp_times, fp_times, tn_times, fn_times)

    per_round = []
    for _ in tqdm(range(n_samples)):
        drawn = np.random.choice(row_positions, n_rows)
        resampled = [resample(column, drawn) for column in columns]
        per_round.append(get_metrics(*resampled))

    # Transpose the per-round tuples into one list per metric.
    accuracies = [scores[0] for scores in per_round]
    precisions = [scores[1] for scores in per_round]
    recalls = [scores[2] for scores in per_round]
    f1s = [scores[3] for scores in per_round]

    return get_confidence_intervals(accuracies, precisions, recalls, f1s)

# Baseline on AudioSet
# Point estimates: read the per-row result CSV, total each of the tp/fp/tn/fn
# time columns, and derive accuracy, precision, recall and F1 from the totals.
# NOTE(review): this is a diff rendering without +/- markers; this section repeats
# the arithmetic of get_metrics() and is presumably the pre-change version of the
# script -- confirm against the repository before relying on it.
baseline_audioset_results = pd.read_csv('baseline_audioset_results.csv')
tp_time = sum(baseline_audioset_results.tp_time)
fp_time = sum(baseline_audioset_results.fp_time)
tn_time = sum(baseline_audioset_results.tn_time)
fn_time = sum(baseline_audioset_results.fn_time)
accuracy = (tp_time + tn_time) / (tp_time + fp_time + tn_time + fn_time)
precision = tp_time / (tp_time + fp_time)
recall = tp_time / (tp_time + fn_time)
f1 = 2*(precision*recall)/(precision+recall)
print("Baseline results on Audioset:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"f1: {f1}")

# Two empty lines of output to separate report sections.
print();print()
# Baseline on Audio Set
# Bootstrap version: reload the same CSV and print, for each metric, the
# 2.5th/50th/97.5th percentiles over 1000 resamples of the rows.
print("Baseline results on Audio Set...")
baseline_audioset_results = pd.read_csv('baseline_audioset_results.csv')

intervals = bootstrap_metrics(
baseline_audioset_results.tp_time, baseline_audioset_results.fp_time,
baseline_audioset_results.tn_time, baseline_audioset_results.fn_time,n_samples=1000)

# One summary string per metric (accuracy, precision, recall, F1).
for interval in intervals:
print(interval)



# Two empty lines of output to separate report sections.
print();print()
# Baseline on SWB Validation Set
# Point estimates from the totals of the tp/fp/tn/fn time columns.
# NOTE(review): this is a diff rendering without +/- markers; the explicit
# arithmetic below repeats get_metrics() and is presumably the pre-change
# version of this section -- confirm against the repository.
print("Baseline results on SWB Validation Set...")
baseline_swv_val_results = pd.read_csv('baseline_switchboard_val_results.csv')
tp_time = sum(baseline_swv_val_results.tp_time)
fp_time = sum(baseline_swv_val_results.fp_time)
tn_time = sum(baseline_swv_val_results.tn_time)
fn_time = sum(baseline_swv_val_results.fn_time)
accuracy = (tp_time + tn_time) / (tp_time + fp_time + tn_time + fn_time)
precision = tp_time / (tp_time + fp_time)
recall = tp_time / (tp_time + fn_time)
f1 = 2*(precision*recall)/(precision+recall)
print("Baseline results on SWB Validation Set:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"f1: {f1}")

print();print()
# Bootstrap percentile summaries (1000 resamples of the rows) for the validation set.
intervals = bootstrap_metrics(
baseline_swv_val_results.tp_time, baseline_swv_val_results.fp_time,
baseline_swv_val_results.tn_time, baseline_swv_val_results.fn_time,n_samples=1000)

# One summary string per metric (accuracy, precision, recall, F1).
for interval in intervals:
print(interval)


# NOTE(review): as rendered here, everything between the next two triple-quote
# lines is a string literal, i.e. the SWB test-set point-estimate section is
# not executed. The quote lines may instead be removed lines of the diff -- confirm.
"""

print();print()
# Baseline on SWB Test Set
print("Baseline results on SWB Test Set...")
baseline_swv_test_results = pd.read_csv('baseline_switchboard_test_results.csv')
tp_time = sum(baseline_swv_test_results.tp_time)
fp_time = sum(baseline_swv_test_results.fp_time)
tn_time = sum(baseline_swv_test_results.tn_time)
fn_time = sum(baseline_swv_test_results.fn_time)
accuracy = (tp_time + tn_time) / (tp_time + fp_time + tn_time + fn_time)
precision = tp_time / (tp_time + fp_time)
recall = tp_time / (tp_time + fn_time)
f1 = 2*(precision*recall)/(precision+recall)
print("Baseline results on SWB Test Set:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"f1: {f1}")
"""

# Bootstrap percentile summaries (1000 resamples of the rows) for the test set.
# NOTE(review): in this rendering baseline_swv_test_results is assigned only inside
# the triple-quoted text above; if that text really is disabled, this call raises
# NameError -- confirm that the CSV is loaded before this point in the actual file.
intervals = bootstrap_metrics(
baseline_swv_test_results.tp_time, baseline_swv_test_results.fp_time,
baseline_swv_test_results.tn_time, baseline_swv_test_results.fn_time,n_samples=1000)

# One summary string per metric (accuracy, precision, recall, F1).
for interval in intervals:
print(interval)
92 changes: 58 additions & 34 deletions scripts/Evaluation/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def get_audio_file_length(path):
f.close()
return l

def get_laughter_times_from_annotation_line(line, min_gap=0.5):
def get_laughter_times_from_annotation_line(line, min_gap=0.5, avoid_edges=False, edge_gap=0.5):
laughter_segments = []
if float(line['End']) > 0: laughter_segments.append([float(line['Start']), float(line['End'])])
for i in range(1,5):
Expand All @@ -24,8 +24,31 @@ def get_laughter_times_from_annotation_line(line, min_gap=0.5):
# Expand time windows to account for minimum gap (window effect)
laughter_segments = [[np.maximum(0, segment[0]-min_gap),np.minimum(line['audio_length'],segment[1]+min_gap)] for segment in laughter_segments]

# Merge any overlapping annotations
# Merge any overlapping annotations and then convert back to list from tuple
laughter_segments = dataset_utils.combine_overlapping_regions(laughter_segments, [])
laughter_segments = [list(s) for s in laughter_segments]
# Slightly fairer to compare w/ Switchboard if we only take windows for which we see the whole 1 second instead of zero-padding
# To do this, trim the audio and annotations by 0.5 seconds at start and finish

trimmed_segments = []
if avoid_edges:
for segment in laughter_segments:
start, end = segment
# Case when the whole segment is within the edge - skip the segment
if (start < edge_gap and end < edge_gap) or (start > line['audio_length']-edge_gap and end > line['audio_length']-edge_gap):
continue
# Case when part of the segment is within the edge - modify the segment
if (start < edge_gap and end > edge_gap):
segment[0] = edge_gap
if (end > line['audio_length']-edge_gap and start < line['audio_length']-edge_gap):
try:
segment[1] = line['audio_length']-edge_gap
except:
import pdb; pdb.set_trace()
# Otherwise keep the segment unchanged
trimmed_segments.append(segment)

laughter_segments = trimmed_segments

# Convert to hash
laughter_segments = [{'start': segment[0], 'end': segment[1]} for segment in laughter_segments]
Expand All @@ -41,36 +64,26 @@ def get_laughter_times_from_annotation_line(line, min_gap=0.5):
segment[0] = extra_beginning_time
laughter_segments = [{'start': segment[0], 'end': segment[1]}]
return laughter_segments
"""
def get_laughter_times_from_annotation_line(line, min_gap=0.5):
laughter_segments = []
if float(line['End']) > 0: laughter_segments.append([float(line['Start']), float(line['End'])])
for i in range(1,5):
if not np.isnan(line[f'Start.{i}']): laughter_segments.append([float(line[f'Start.{i}']), float(line[f'End.{i}'])])
# Combine annotations if they have less than min_gap seconds between events (because of windowing in the model)
# Expand time windows to account for minimum gap (window effect)
laughter_segments = [[np.maximum(0, segment[0]-min_gap),np.minimum(line['audio_length'],segment[1]+min_gap)] for segment in laughter_segments]
# Merge any overlapping annotations, then convert to hash
laughter_segments = dataset_utils.combine_overlapping_regions(laughter_segments, [])
laughter_segments = [{'start': segment[0], 'end': segment[1]} for segment in laughter_segments]
return laughter_segments
"""

# Get all the segments in the audio file that are NOT laughter, using the segments that are laughter and the file length
# Input is array of hashes like [ {'start': 1.107, 'end': 1.858}, {'start': 2.237, 'end': 2.705}]]
def get_non_laughter_times(laughter_segments, file_length):
def get_non_laughter_times(laughter_segments, file_length, avoid_edges=False, edge_gap=0.5):
non_laughter_segments = []

non_laughter_start = 0.0
if avoid_edges:
non_laughter_start=edge_gap
else:
non_laughter_start = 0.0
for segment in laughter_segments:
non_laughter_end = segment['start']
if non_laughter_end > non_laughter_start:
non_laughter_segments.append({'start': non_laughter_start, 'end': non_laughter_end})
non_laughter_start = segment['end']

non_laughter_end = file_length

if avoid_edges:
non_laughter_end=file_length-edge_gap
else:
non_laughter_end = file_length

if non_laughter_end > non_laughter_start:
non_laughter_segments.append({'start': non_laughter_start, 'end': non_laughter_end})
Expand Down Expand Up @@ -110,16 +123,15 @@ def overlap_amount(start1, end1, start2, end2):

def get_baseline_results_per_annotation_index(model, annotations_df,
baseline_laugh_segmenter, i, min_gap=0.375,
threshold=0.5, use_filter=True, min_length=0.1):
threshold=0.5, use_filter=True, min_length=0.1,
avoid_edges=False, edge_gap=0.5):
audio_file = annotations_df.audio_path.iloc[i]

extra_beginning_time = annotations_df.iloc[i].extra_beginning_time if 'extra_beginning_time' in list(annotations_df.columns) else None
extra_end_time = annotations_df.iloc[i].extra_end_time if 'extra_end_time' in list(annotations_df.columns) else None
line = dict(annotations_df.iloc[i])

#true_laughter_times = get_laughter_times_from_annotation_line(line, min_gap=min_gap)
#true_non_laughter_times = get_non_laughter_times(true_laughter_times, annotations_df.iloc[i].audio_length)

# Switchboard
if extra_beginning_time is not None and extra_end_time is not None:
true_laughter_times = get_laughter_times_from_annotation_line(line, min_gap=min_gap)
absolute_start_time = line['Start'] - extra_beginning_time
Expand All @@ -130,13 +142,15 @@ def get_baseline_results_per_annotation_index(model, annotations_df,
model, input_path=audio_file, threshold=threshold, use_filter=use_filter, min_length=min_length,
audio_start=absolute_start_time, audio_length=audio_length)
predicted_non_laughter_times = get_non_laughter_times(predicted_laughter_times, audio_length)
# Audioset
else:
audio_length = annotations_df.iloc[i].audio_length
true_laughter_times = get_laughter_times_from_annotation_line(line, min_gap=min_gap)
true_non_laughter_times = get_non_laughter_times(true_laughter_times, audio_length)
true_laughter_times = get_laughter_times_from_annotation_line(line, min_gap=min_gap, avoid_edges=avoid_edges)
true_non_laughter_times = get_non_laughter_times(true_laughter_times, audio_length, avoid_edges=avoid_edges)
predicted_laughter_times = baseline_laugh_segmenter.segment_laugh_with_model(
model, input_path=audio_file, threshold=threshold, use_filter=use_filter, min_length=min_length)
predicted_non_laughter_times = get_non_laughter_times(predicted_laughter_times, audio_length)
model, input_path=audio_file, threshold=threshold, use_filter=use_filter, min_length=min_length,
avoid_edges=False, edge_gap=edge_gap)
predicted_non_laughter_times = get_non_laughter_times(predicted_laughter_times, audio_length, avoid_edges=avoid_edges)

total_laughter_time = sum_overlap_amount(true_laughter_times,true_laughter_times)
total_non_laughter_time = sum_overlap_amount(true_non_laughter_times,true_non_laughter_times)
Expand All @@ -151,7 +165,10 @@ def get_baseline_results_per_annotation_index(model, annotations_df,
try:
assert(np.abs(total_laughter_time - (true_positive_time + false_negative_time)) < 0.1)
assert(np.abs(total_non_laughter_time - (true_negative_time + false_positive_time)) < 0.1)
assert(np.abs(total_time - audio_length) < 0.1)
if avoid_edges:
assert(np.abs(total_time - (audio_length - 2*edge_gap)) < 0.1)
else:
assert(np.abs(total_time - audio_length) < 0.1)
assert(np.abs(total_time - (total_laughter_time + total_non_laughter_time)) < 0.1)
except:
import pdb; pdb.set_trace()
Expand All @@ -163,7 +180,7 @@ def get_baseline_results_per_annotation_index(model, annotations_df,

return h

def get_annotation_stats(annotations_df, display=True, min_gap=0.5):
def get_annotation_stats(annotations_df, display=True, min_gap=0.5, avoid_edges=False, edge_gap=0.5):
laughter_lengths = []
non_laughter_lengths = []
total_lengths = []
Expand All @@ -174,6 +191,7 @@ def get_annotation_stats(annotations_df, display=True, min_gap=0.5):
extra_end_time = annotations_df.iloc[i].extra_end_time if 'extra_end_time' in list(annotations_df.columns) else None
line = dict(annotations_df.iloc[i])

#Switchboard
if extra_beginning_time is not None and extra_end_time is not None:
times = get_laughter_times_from_annotation_line(line, min_gap=min_gap)
laughter_count += len(times)
Expand All @@ -186,11 +204,17 @@ def get_annotation_stats(annotations_df, display=True, min_gap=0.5):
laughter_lengths.append(laughter_length)
non_laughter_lengths.append(non_laughter_length)
total_lengths.append(total_length)
#Audioset
else:
audio_length = annotations_df.iloc[i].audio_length
times = get_laughter_times_from_annotation_line(line, min_gap=min_gap)
times = get_laughter_times_from_annotation_line(line, min_gap=min_gap,avoid_edges=avoid_edges)
laughter_count += len(times)
total_length = annotations_df.iloc[i].audio_length

if avoid_edges:
total_length = annotations_df.iloc[i].audio_length - 2*edge_gap
else:
total_length = annotations_df.iloc[i].audio_length

laughter_length = sum_overlap_amount(times, times)
non_laughter_length = total_length - laughter_length
laughter_lengths.append(laughter_length)
Expand Down
3 changes: 2 additions & 1 deletion scripts/Evaluation/evaluate_baseline_on_audioset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tqdm import tqdm

MIN_GAP = 0
avoid_edges=True

path_to_baseline_code = '/mnt/data0/jrgillick/projects/laughter-detection-2018/laughter-detection/'
baseline_model_path = path_to_baseline_code + '/models/model.h5'
Expand All @@ -19,7 +20,7 @@
for i in tqdm(range(len(annotations_df))):
h = get_baseline_results_per_annotation_index(
baseline_model, annotations_df, baseline_laugh_segmenter, i, min_gap=MIN_GAP,
use_filter=True,min_length=0.1)
use_filter=True,min_length=0.1,avoid_edges=avoid_edges)
all_results.append(h)

results_df = pd.DataFrame(all_results)
Expand Down
11 changes: 6 additions & 5 deletions scripts/Evaluation/evaluate_baseline_on_switchboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,25 +107,26 @@ def make_switchboard_dataframe(transcription_files_A, transcription_files_B, aud
val_results_df = pd.DataFrame(val_results)
val_results_df.to_csv("baseline_switchboard_val_results.csv",index=None)

"""


print("Setting up SWB Test data")
# Get results on SWB Test Set
switchboard_test_annotations_df = make_switchboard_dataframe(
test_transcription_files_A, test_transcription_files_B, test_audio_files)
test_transcription_files_A, test_transcription_files_B, test_audio_files, min_gap=MIN_GAP)

print("Switchboard Test Set Annotations Stats:")
test_swb_minutes, test_swb_laughter_minutes, test_swb_non_laughter_minutes, test_laughter_fraction, test_laughter_count = get_annotation_stats(switchboard_test_annotations_df, display=True)
test_swb_minutes, test_swb_laughter_minutes, test_swb_non_laughter_minutes, test_laughter_fraction, test_laughter_count = get_annotation_stats(switchboard_test_annotations_df, display=True, min_gap=MIN_GAP)

print("\nPredicting on Switchboard Test Set...")
test_results = []
for i in tqdm(range(len(switchboard_test_annotations_df))):
h = get_baseline_results_per_annotation_index(
baseline_model,switchboard_test_annotations_df, baseline_laugh_segmenter, i)
baseline_model, switchboard_test_annotations_df, baseline_laugh_segmenter, i, min_gap=MIN_GAP,
threshold=0.5,use_filter=False, min_length=0.)
test_results.append(h)

test_results_df = pd.DataFrame(test_results)
test_results_df.to_csv("baseline_switchboard_test_results.csv",index=None)
"""



0 comments on commit 534fe34

Please sign in to comment.