## Time-based grouping to address bug identified in commit `5a8ff30c40691ccc1ed5691add2a3512e64975bf`

In [7]:
import pandas as pd 
import numpy as np 
from pathlib import Path 
from VoiceClassifier.read import ASSReader, ConsecutiveGrouper

# Folder holding the subtitle/audio inputs, located under the kernel's working directory.
datadir = Path(Path.cwd(), 'data')
# Stays None until the audio-loading cell replaces it with an AudioClip.
clip = None

In [2]:
# Open the .ass subtitle file and preview the first parsed rows.
# 44100 matches the ratio Start_samples / Start_seconds in the displayed table.
rdr = ASSReader(datadir / 'ichinose_tamaki_taidan.ass')
rdr.parse_subs(44100, skiprows=2).head(5)

Unnamed: 0,Start,End,Speaker,Start_seconds,Start_samples,End_seconds,End_samples
3,0:04:38.58,0:04:40.92,Tamaki,278.58,12285378,280.92,12388572
4,0:04:40.92,0:04:42.81,Tamaki,280.92,12388572,282.81,12471921
5,0:04:42.81,0:04:43.95,Tamaki,282.81,12471921,283.95,12522195
6,0:04:43.95,0:04:45.10,Ichinose,283.95,12522195,285.1,12572910
7,0:04:45.10,0:04:49.15,Tamaki,285.1,12572910,289.15,12751514


In [3]:
# Preview the reader's speaker-indexed frame (first five rows).
rdr.df_ts.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910
Ichinose,12,0:04:57.55,0:05:04.21,297.55,13121955,304.21,13415661
Ichinose,13,0:05:04.21,0:05:08.92,304.21,13415661,308.92,13623372
Ichinose,14,0:05:08.92,0:05:11.49,308.92,13623372,311.49,13736709
Ichinose,15,0:05:11.49,0:05:14.01,311.49,13736709,314.01,13847841


```python
df_ = self.select_speaker(df, speaker)
inds = df_.index.get_level_values(level=1)
mask, mask_1L = self.get_consecutive_mask(inds)
grouped = self.aggregate_consecutive(df_, inds, mask, mask_1L, speaker)
return self.concat_non_consecutive(df_, grouped, mask_1L)
```

In [4]:
CG = ConsecutiveGrouper()

# One sub-frame per speaker, keyed by speaker name.
sp_dict: dict[str, pd.DataFrame] = {
    name: CG.select_speaker(rdr.df_ts, name)
    for name in ['Ichinose', 'Tamaki']
}

# The walkthrough below works on Ichinose's lines only.
df_ = sp_dict['Ichinose']
df_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910
Ichinose,12,0:04:57.55,0:05:04.21,297.55,13121955,304.21,13415661
Ichinose,13,0:05:04.21,0:05:08.92,304.21,13415661,308.92,13623372
Ichinose,14,0:05:08.92,0:05:11.49,308.92,13623372,311.49,13736709
Ichinose,15,0:05:11.49,0:05:14.01,311.49,13736709,314.01,13847841
Ichinose,17,0:05:16.30,0:05:20.70,316.3,13948830,320.7,14142870
Ichinose,23,0:05:34.18,0:05:38.47,334.18,14737338,338.47,14926527
Ichinose,28,0:05:47.32,0:05:51.23,347.32,15316812,351.23,15489243
Ichinose,29,0:05:51.23,0:05:54.58,351.23,15489243,354.58,15636978
Ichinose,41,0:06:24.92,0:06:29.31,384.92,16974972,389.31,17168571


In [39]:
# Last five of the selected speaker's lines.
df_.tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,1110,1:01:34.70,1:01:37.20,3694.7,162936270,3697.2,163046520
Ichinose,1111,1:01:37.20,1:01:41.60,3697.2,163046520,3701.6,163240560
Ichinose,1114,1:01:47.75,1:01:50.60,3707.75,163511775,3710.6,163637460
Ichinose,1116,1:01:59.50,1:02:02.40,3719.5,164029950,3722.4,164157840
Ichinose,1118,1:02:04.70,1:02:05.90,3724.7,164259270,3725.9,164312190


In [40]:
# End_i - Start_{i+1} for each pair of neighbouring rows: zero means the next line
# starts exactly when the previous one ends; a negative value is a gap between them.
_prev_end = df_['End_seconds'].values[:-1]
_next_start = df_['Start_seconds'].values[1:]
consec_diffs: np.ndarray = _prev_end - _next_start

pd.Series(consec_diffs).head(5)

0   -12.45
1     0.00
2     0.00
3     0.00
4    -2.29
dtype: float64

In [41]:
# True where a row starts exactly at the end of the row before it.
mask = consec_diffs == 0

# Side-by-side view of the raw difference and the resulting flag.
x_ = pd.concat(
    [pd.Series(consec_diffs), pd.Series(mask)],
    axis=1,
    keys=['End_i - Start_{i+1}', '== 0'],
)

# Label each difference with the subtitle index of the *later* row of the pair.
x_.index = df_.index.get_level_values(1)[1:]
x_.head(5)

Unnamed: 0,End_i - Start_{i+1},== 0
12,-12.45,False
13,0.0,True
14,0.0,True
15,0.0,True
17,-2.29,False


In [42]:
# Same comparison table, last five rows.
x_.tail(5)

Unnamed: 0,End_i - Start_{i+1},== 0
1110,-3.95,False
1111,0.0,True
1114,-6.15,False
1116,-8.9,False
1118,-2.3,False


In [8]:
# Second index level of the speaker frame: the per-line subtitle indices.
ass_inds = df_.index.get_level_values(level=1)
ass_inds[0:10]

Int64Index([6, 12, 13, 14, 15, 17, 23, 28, 29, 41], dtype='int64')

In [134]:
def _split_consecutive(indices, is_consec):
    """Partition ``indices`` into runs of back-to-back rows and standalone rows.

    Parameters
    ----------
    indices : sequence of row labels, in time order.
    is_consec : boolean sequence with one element fewer than ``indices``;
        ``is_consec[i]`` is True when row ``i`` ends exactly where row ``i + 1`` starts.

    Returns
    -------
    (groups, ungrouped) : ``groups`` is a list of label lists (each run has at least
        two members); ``ungrouped`` is a flat list of labels that belong to no run.
    """
    groups: list[list[int]] = []
    ungrouped: list[int] = []
    current_group: list[int] = []

    for i, ind in enumerate(indices):
        # The last row has no successor, so it can never open or extend a run forward.
        links_to_next = i < len(is_consec) and bool(is_consec[i])
        if links_to_next:
            current_group.append(ind)
        elif current_group:
            # This row is the tail of the run opened by the previous row(s).
            # Checking the open run (rather than is_consec[i - 1]) avoids the
            # negative-index wrap-around on the very first row.
            current_group.append(ind)
            groups.append(current_group)
            current_group = []
        else:
            ungrouped.append(ind)

    return groups, ungrouped


isConsec = consec_diffs == 0

# The final row is handled inside the helper, so a run that extends to the end of
# the frame is closed as its own group instead of being merged into the previous one.
groups, ungrouped = _split_consecutive(ass_inds, isConsec)

df_groups = pd.Series(groups)
df_groups.head()

0    [12, 13, 14, 15]
1            [28, 29]
2            [41, 42]
3        [52, 53, 54]
4        [56, 57, 58]
dtype: object

In [136]:
# Key each group by the subtitle index of its first line, then inspect the tail end
# of the recording (labels 978 through 1118, inclusive).
df_groups.index = [members[0] for members in groups]
df_groups.loc[slice(978, 1118)]

978       [978, 979, 980, 981, 982, 983, 984, 985]
987                                     [987, 988]
990                           [990, 991, 992, 993]
1004                            [1004, 1005, 1006]
1017                                  [1017, 1018]
1025                                  [1025, 1026]
1033                            [1033, 1034, 1035]
1038    [1038, 1039, 1040, 1041, 1042, 1043, 1044]
1051                            [1051, 1052, 1053]
1055                            [1055, 1056, 1057]
1058          [1058, 1059, 1060, 1061, 1062, 1063]
1074                                  [1074, 1075]
1081                                  [1081, 1082]
1090                                  [1090, 1091]
1098                                  [1098, 1099]
1104                                  [1104, 1105]
1107                                  [1107, 1108]
1110                                  [1110, 1111]
dtype: object

In [137]:
# No index may be both inside a group and in the ungrouped list.
assert np.intersect1d(
    np.array([member for group in groups for member in group]),
    np.array(ungrouped),
).size == 0

In [139]:
# Every index of the original frame must show up in a group or in `ungrouped`.
_grouped_members = [member for group in groups for member in group]
_idx = pd.Index(np.union1d(_grouped_members, ungrouped)).unique().sort_values()
_expected_idx = df_.index.get_level_values(1).unique().sort_values()
assert (_idx == _expected_idx).all()

In [140]:
# Verify each group is contiguous in time: inside a group, every line must end
# exactly when the following line starts. Any offending group is printed together
# with its per-pair comparison.

df_sec = df_.droplevel(0)[['Start_seconds', 'End_seconds']]

for g in groups:
    prev_ends = df_sec.loc[g[:-1], 'End_seconds'].values
    next_starts = df_sec.loc[g[1:], 'Start_seconds'].values
    match = prev_ends == next_starts
    if not all(match):
        print(g, match, sep='\n')

In [160]:
# First few indices that ended up in no group.
ungrouped[0:5]

[6, 17, 23, 47, 66]

In [141]:
from typing import Any 

# Collapse every consecutive group into a single row: all columns are taken from the
# group's first line, except the End* columns, which are taken from its last line.
end_cols = [c for c in df_.columns if 'End' in c]

# droplevel already returns a new frame, so no explicit copy is needed.
df_2 = df_.droplevel(0)

df_dict: dict[int, dict[str, Any]] = {}
for g in groups:
    merged_row = df_2.loc[g[0], :].to_dict()
    last_row = df_2.loc[g[-1], :]

    for col in end_cols:
        merged_row[col] = last_row.at[col]

    # The merged row keeps the subtitle index of the group's first line.
    df_dict[g[0]] = merged_row

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the merged rows with the ungrouped ones.
df_merge = pd.concat(
    [
        pd.DataFrame.from_dict(df_dict, orient='index'),
        df_2.loc[ungrouped, :],
    ]
).sort_index()

In [142]:
# Preview the merged frame; index 12 should now span 0:04:57.55 to 0:05:14.01.
df_merge.head(5)

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910
12,0:04:57.55,0:05:14.01,297.55,13121955,314.01,13847841
17,0:05:16.30,0:05:20.70,316.3,13948830,320.7,14142870
23,0:05:34.18,0:05:38.47,334.18,14737338,338.47,14926527
28,0:05:47.32,0:05:54.58,347.32,15316812,354.58,15636978


In [143]:
# One output row per group plus one per ungrouped line.
expected_len = len(groups) + len(ungrouped)
try:
    assert expected_len == df_merge.shape[0]
except AssertionError:
    # Dump the frame summary first so the mismatch can be diagnosed from the output.
    df_merge.info()
    raise AssertionError(f"Expected {expected_len} rows, got {df_merge.shape[0]}")

## Random sample and write audio clips to verify

In [8]:
import audiofile
from VoiceClassifier.read import AudioClip

# Load the audio only once; re-running this cell reuses the clip already in memory.
clip = AudioClip(datadir / "ichinose_tamaki_taidan.m4a") if clip is None else clip

In [147]:
# Fixed seed so the same five rows (and therefore the same clip files) are drawn on
# every run; re-run the cell to refresh the displayed table.
samples = df_merge.sample(5, random_state=0)
samples

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
645,0:36:28.85,0:36:33.25,2188.85,96528285,2193.25,96722325
938,0:53:11.80,0:53:16.30,3191.8,140758380,3196.3,140956830
763,0:43:09.60,0:43:11.35,2589.6,114201360,2591.35,114278535
744,0:42:15.45,0:42:21.55,2535.45,111813344,2541.55,112082355
713,0:40:19.60,0:40:27.35,2419.6,106704360,2427.35,107046135


In [148]:
outdir = datadir / 'test'
# Create the output folder up front so writing does not depend on it already existing.
outdir.mkdir(parents=True, exist_ok=True)

# Cut one audio clip per sampled row and save it for a manual listening check.
for s in samples.index:
    start, end = samples.loc[s, ['Start_samples', 'End_samples']]
    audio_sample = clip.clip(start, end, unit='samples')

    outp = outdir / f"Ichinose_{s}.wav"
    # Clips written by an earlier run are left untouched.
    if not outp.is_file():
        audiofile.write(outp, audio_sample, clip.rate)
        print(f"Saved file to {outp}")

Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Ichinose_645.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Ichinose_938.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Ichinose_763.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Ichinose_744.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Ichinose_713.wav


## Filter out rows of 1 s or less in duration

In [150]:
# Length of each merged segment in seconds (adds a column to df_merge in place).
_duration_s = df_merge['End_seconds'] - df_merge['Start_seconds']
df_merge['Duration'] = _duration_s
df_merge['Duration'].describe()

count    343.000000
mean       5.204577
std        4.308233
min        0.600000
25%        2.495000
50%        3.950000
75%        6.380000
max       28.350000
Name: Duration, dtype: float64

In [154]:
# Keep only segments strictly longer than one second.
df_merge1s = df_merge.loc[df_merge['Duration'] > 1]
df_merge1s.head(5)

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples,Duration
6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910,1.15
12,0:04:57.55,0:05:14.01,297.55,13121955,314.01,13847841,16.46
17,0:05:16.30,0:05:20.70,316.3,13948830,320.7,14142870,4.4
23,0:05:34.18,0:05:38.47,334.18,14737338,338.47,14926527,4.29
28,0:05:47.32,0:05:54.58,347.32,15316812,354.58,15636978,7.26


In [156]:
# Duration statistics after the filter; the minimum should now exceed 1 s.
df_merge1s.loc[:, 'Duration'].describe()

count    331.000000
mean       5.361692
std        4.304379
min        1.050000
25%        2.625000
50%        4.150000
75%        6.400000
max       28.350000
Name: Duration, dtype: float64

## Repeat for Tamaki's lines

In [3]:
# Run the library's aggregate on Ichinose's lines; the first rows can be compared
# with the df_merge preview produced by the manual walkthrough.
CG = ConsecutiveGrouper()
CG.aggregate(df=rdr.df_ts, speaker='Ichinose') if False else CG.aggregate(rdr.df_ts, 'Ichinose')

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
6,0:04:43.95,0:04:45.10,283.95,12522195,285.10,12572910
12,0:04:57.55,0:05:14.01,297.55,13121955,314.01,13847841
17,0:05:16.30,0:05:20.70,316.30,13948830,320.70,14142870
23,0:05:34.18,0:05:38.47,334.18,14737338,338.47,14926527
28,0:05:47.32,0:05:54.58,347.32,15316812,354.58,15636978
...,...,...,...,...,...,...
1107,1:01:26.30,1:01:30.75,3686.30,162565830,3690.75,162762075
1110,1:01:34.70,1:01:41.60,3694.70,162936270,3701.60,163240560
1114,1:01:47.75,1:01:50.60,3707.75,163511775,3710.60,163637460
1116,1:01:59.50,1:02:02.40,3719.50,164029950,3722.40,164157840


In [6]:
# Aggregate Tamaki's lines with the library implementation.
tamaki_agg = CG.aggregate(rdr.df_ts, 'Tamaki')
# Fixed seed so the same five rows (and clip files) are drawn on every run.
tamaki_samples = tamaki_agg.sample(5, random_state=0)

In [9]:
outdir = datadir / 'test'
# Create the output folder up front so writing does not depend on it already existing.
outdir.mkdir(parents=True, exist_ok=True)

# Cut one audio clip per sampled row and save it for a manual listening check.
for s in tamaki_samples.index:
    start, end = tamaki_samples.loc[s, ['Start_samples', 'End_samples']]
    audio_sample = clip.clip(start, end, unit='samples')

    outp = outdir / f"Tamaki_{s}.wav"
    # Clips written by an earlier run are left untouched.
    if not outp.is_file():
        audiofile.write(outp, audio_sample, clip.rate)
        print(f"Saved file to {outp}")

Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Tamaki_133.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Tamaki_535.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Tamaki_90.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Tamaki_796.wav
Saved file to c:\Users\delbe\Documents\CodeRepositories\Miscellaneous\ClipSpeechSegmenter\data\test\Tamaki_203.wav
