In [1]:
import json
from pprint import pp

In [2]:
def string_to_float(time):
    """Convert a ``"left:right"`` string into a single float.

    The part before the colon is kept as the whole number and the part after
    it is treated as sixtieths, e.g. ``"8:04"`` becomes ``8 + 4/60``.

    Args:
        time: string made of exactly two integers separated by ``":"``.

    Returns:
        ``left + right / 60`` as a float.

    Raises:
        ValueError: if the string does not split into exactly two integers.
        AssertionError: if the part after the colon is outside 0..59.
    """
    whole, sixtieths = (int(part) for part in time.split(":"))
    assert 0 <= sixtieths <= 59, f"{time} is in invalid format"
    fraction = sixtieths / 60
    return whole + fraction


def float_to_string(time):
    """Format a float as a zero-padded ``"left:right"`` string.

    The integer part of ``time`` becomes the left field and the fractional
    part, expressed in sixtieths and truncated, becomes the right field
    (``8.5`` -> ``"08:30"``).

    Args:
        time: number to format (int or float). Negative values are rendered
            with a single leading ``"-"``.

    Returns:
        The formatted string; both fields are padded to at least two digits.
    """
    # Work in whole sixtieths. Rounding to 6 decimals first removes binary
    # floating-point noise (e.g. 483.99999999999994 for 8 + 4/60) so that
    # truncation does not drop a unit from values that are exact on paper.
    total_sixtieths = int(round(abs(time) * 60, 6))
    left, right = divmod(total_sixtieths, 60)
    # Only show a sign when something non-zero is actually displayed.
    sign = "-" if time < 0 and total_sixtieths else ""
    return f"{sign}{left:02d}:{right:02d}"


# Quick sanity checks of both converters on hand-picked samples.
for sample_text in ("8:04", "12:3"):
    print(string_to_float(sample_text))

for sample_value in (8.0666, 1234.94321):
    print(float_to_string(sample_value))

8.066666666666666
12.05
08:03
1234:56


In [3]:
# Length of the source recording, entered by hand; string_to_float turns
# "8:04" into 8 + 4/60, which is reported below as minutes.
original_duration = string_to_float("8:04")

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open("output.json", encoding="utf-8") as f:
    data = json.load(f)

diarization = data["output"]["diarization"]
print("Conversation fragments count:", len(diarization))
print("Confidence scores count:", len(data["output"]["confidence"]["score"]))
print()

# Total of every fragment's (end - start); reported below as seconds.
total_seconds = sum(fragment["end"] - fragment["start"] for fragment in diarization)

print("Conversation duration")
print("\tin seconds:", total_seconds)
print("\tin minutes:", total_seconds / 60)
print("Original duration")
print("\tin seconds:", original_duration * 60)
print("\tin minutes:", original_duration)

Conversation fragments count: 139
Confidence scores count: 24224

Conversation duration
	in seconds: 273.51999999999975
	in minutes: 4.558666666666663
Original duration
	in seconds: 484.0
	in minutes: 8.066666666666666


In [4]:
def merge_consecutive_fragments(fragments):
    """Collapse runs of back-to-back fragments by the same speaker into one.

    Args:
        fragments: iterable of dicts with "speaker", "start" and "end" keys,
            visited in the order given.

    Returns:
        A new list of new dicts, one per run of identical consecutive
        speakers. Each keeps the first fragment's fields and the run's
        latest "end". The input dicts are never modified.
    """
    result = []
    for fragment in fragments:
        if result and result[-1]["speaker"] == fragment["speaker"]:
            # Same speaker continues: extend the current run. max() keeps the
            # run from shrinking if a fragment ends before its predecessor.
            result[-1]["end"] = max(result[-1]["end"], fragment["end"])
        else:
            # Copy so that extending "end" above never mutates the loaded
            # data (which would skew any later pass over the raw fragments).
            result.append(dict(fragment))
    return result


merged = merge_consecutive_fragments(data["output"]["diarization"])

print("Merged conversation fragments:", len(merged))
pp(merged, indent=2)

Merged conversation fragments: 88
[ {'speaker': 'SPEAKER_01', 'start': 0.245, 'end': 3.265},
  {'speaker': 'SPEAKER_00', 'start': 3.425, 'end': 5.705},
  {'speaker': 'SPEAKER_01', 'start': 6.485, 'end': 7.305},
  {'speaker': 'SPEAKER_00', 'start': 7.305, 'end': 12.645},
  {'speaker': 'SPEAKER_01', 'start': 12.645, 'end': 13.965},
  {'speaker': 'SPEAKER_00', 'start': 13.965, 'end': 17.185},
  {'speaker': 'SPEAKER_01', 'start': 18.125, 'end': 18.665},
  {'speaker': 'SPEAKER_00', 'start': 19.385, 'end': 28.025},
  {'speaker': 'SPEAKER_01', 'start': 30.225, 'end': 32.965},
  {'speaker': 'SPEAKER_00', 'start': 34.145, 'end': 34.745},
  {'speaker': 'SPEAKER_01', 'start': 36.665, 'end': 51.465},
  {'speaker': 'SPEAKER_00', 'start': 51.685, 'end': 60.445},
  {'speaker': 'SPEAKER_01', 'start': 61.165, 'end': 62.925},
  {'speaker': 'SPEAKER_00', 'start': 63.705, 'end': 64.865},
  {'speaker': 'SPEAKER_01', 'start': 65.865, 'end': 66.325},
  {'speaker': 'SPEAKER_00', 'start': 66.905, 'end': 71.805

In [5]:
import statistics

# Length (end - start) of every merged fragment.
spans = [fragment["end"] - fragment["start"] for fragment in merged]

# The arithmetic mean is reported twice: once computed by hand and once
# through the statistics module.
average_span = sum(spans) / len(spans)
print("Avg fragment duration:", average_span, "seconds")
print("Mean:", statistics.mean(spans))

Avg fragment duration: 3.468636363636361 seconds
Mean: 3.468636363636361


In [None]:
from pydub import AudioSegment
from pydub.utils import which

# Locate ffmpeg and fail fast with a clear message if it is missing;
# assigning None as the converter would only surface later as an obscure
# error while decoding or exporting.
ffmpeg_path = which("ffmpeg")
if ffmpeg_path is None:
    raise RuntimeError("ffmpeg executable not found on PATH; it is required by pydub")
AudioSegment.converter = ffmpeg_path

# Load the full recording once; every segment is sliced from it.
audio = AudioSegment.from_file("test.m4a")

# (start, end) pairs taken from the merged speaker fragments.
timestamps = [(fragment["start"], fragment["end"]) for fragment in merged]

# Export each fragment as its own file, numbered from 1.
for segment_number, (start, end) in enumerate(timestamps, start=1):
    # Fragment bounds are scaled by 1000 before slicing (seconds to
    # milliseconds, going by the original *_ms variable names).
    start_ms = start * 1000
    end_ms = end * 1000

    segment = audio[start_ms:end_ms]

    output_file = f"output_segment_{segment_number}.mp3"
    segment.export(output_file, format="mp3")
    print(f"Exported {output_file}")

0 (0.245, 3.265)
Exported output_segment_1.mp3
1 (3.425, 5.705)
Exported output_segment_2.mp3
2 (6.485, 7.305)
Exported output_segment_3.mp3
3 (7.305, 12.645)
Exported output_segment_4.mp3
4 (12.645, 13.965)
Exported output_segment_5.mp3
5 (13.965, 17.185)
Exported output_segment_6.mp3
6 (18.125, 18.665)
Exported output_segment_7.mp3
7 (19.385, 28.025)
Exported output_segment_8.mp3
8 (30.225, 32.965)
Exported output_segment_9.mp3
9 (34.145, 34.745)
Exported output_segment_10.mp3
10 (36.665, 51.465)
Exported output_segment_11.mp3
11 (51.685, 60.445)
Exported output_segment_12.mp3
12 (61.165, 62.925)
Exported output_segment_13.mp3
13 (63.705, 64.865)
Exported output_segment_14.mp3
14 (65.865, 66.325)
Exported output_segment_15.mp3
15 (66.905, 71.805)
Exported output_segment_16.mp3
16 (73.125, 75.185)
Exported output_segment_17.mp3
17 (76.465, 81.465)
Exported output_segment_18.mp3
18 (83.825, 89.505)
Exported output_segment_19.mp3
19 (90.445, 99.505)
Exported output_segment_20.mp3
20 (99