In [1]:
import audiofile 
import pandas as pd 
from pathlib import Path 

DATADIR = Path.cwd() / 'data/'

## Test .ass file reader

In [2]:
from VoiceClassifier.read import ASSReader

rdr = ASSReader(DATADIR / "ichinose_tamaki_taidan.ass")

### Checking individual functions

Check data loading

In [3]:
df = rdr.get_raw_dialog(skiprows=2)
df.head()

Unnamed: 0,0
3,"Dialogue: 0,0:04:38.58,0:04:40.92,Tamaki,,0,0,..."
4,"Dialogue: 0,0:04:40.92,0:04:42.81,Tamaki,,0,0,..."
5,"Dialogue: 0,0:04:42.81,0:04:43.95,Tamaki,,0,0,..."
6,"Dialogue: 0,0:04:43.95,0:04:45.10,Ichinose,,0,..."
7,"Dialogue: 0,0:04:45.10,0:04:49.15,Tamaki,,0,0,..."


Check initial parsing of lines

In [4]:
df2 = rdr.parse_dialog(df)
df2.head()

Unnamed: 0,Start,End,Speaker
3,0:04:38.58,0:04:40.92,Tamaki
4,0:04:40.92,0:04:42.81,Tamaki
5,0:04:42.81,0:04:43.95,Tamaki
6,0:04:43.95,0:04:45.10,Ichinose
7,0:04:45.10,0:04:49.15,Tamaki


Check that timestamp conversion to seconds is working

In [5]:
df2.iloc[:5,0].apply(lambda t: rdr.ts2secs(t))

3    278.58
4    280.92
5    282.81
6    283.95
7    285.10
Name: Start, dtype: float64

In [6]:
pd.concat(
    [df2.iloc[-5:, 0], df2.iloc[-5:,0].apply(lambda t: rdr.ts2secs(t))],
    axis=1    
)

Unnamed: 0,Start,Start.1
1114,1:01:47.75,3707.75
1115,1:01:50.60,3710.6
1116,1:01:59.50,3719.5
1117,1:02:03.30,3723.3
1118,1:02:04.70,3724.7


Check conversion of time into samples and seconds

In [7]:
df3 = rdr.get_sub_times(df2, 44100, rdr.ts2secs)
df3.head()

Unnamed: 0,Start,End,Speaker,Start_seconds,Start_samples,End_seconds,End_samples
3,0:04:38.58,0:04:40.92,Tamaki,278.58,12285378,280.92,12388572
4,0:04:40.92,0:04:42.81,Tamaki,280.92,12388572,282.81,12471921
5,0:04:42.81,0:04:43.95,Tamaki,282.81,12471921,283.95,12522195
6,0:04:43.95,0:04:45.10,Ichinose,283.95,12522195,285.1,12572910
7,0:04:45.10,0:04:49.15,Tamaki,285.1,12572910,289.15,12751514


### Using main functions

In [8]:
rdr.parse_subs(44100, skiprows=2).head()

Unnamed: 0,Start,End,Speaker,Start_seconds,Start_samples,End_seconds,End_samples
3,0:04:38.58,0:04:40.92,Tamaki,278.58,12285378,280.92,12388572
4,0:04:40.92,0:04:42.81,Tamaki,280.92,12388572,282.81,12471921
5,0:04:42.81,0:04:43.95,Tamaki,282.81,12471921,283.95,12522195
6,0:04:43.95,0:04:45.10,Ichinose,283.95,12522195,285.1,12572910
7,0:04:45.10,0:04:49.15,Tamaki,285.1,12572910,289.15,12751514


In [9]:
rdr.df_ts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910
Ichinose,12,0:04:57.55,0:05:04.21,297.55,13121955,304.21,13415661
Ichinose,13,0:05:04.21,0:05:08.92,304.21,13415661,308.92,13623372
Ichinose,14,0:05:08.92,0:05:11.49,308.92,13623372,311.49,13736709
Ichinose,15,0:05:11.49,0:05:14.01,311.49,13736709,314.01,13847841


## Test `ConsecutiveGrouper`

In [10]:
from VoiceClassifier.read import ConsecutiveGrouper

CG = ConsecutiveGrouper()

In [11]:
df_ = CG.select_speaker(rdr.df_ts, 'Ichinose')
df_.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,6,0:04:43.95,0:04:45.10,283.95,12522195,285.1,12572910
Ichinose,12,0:04:57.55,0:05:04.21,297.55,13121955,304.21,13415661
Ichinose,13,0:05:04.21,0:05:08.92,304.21,13415661,308.92,13623372
Ichinose,14,0:05:08.92,0:05:11.49,308.92,13623372,311.49,13736709
Ichinose,15,0:05:11.49,0:05:14.01,311.49,13736709,314.01,13847841


In [12]:
df_ = CG.select_speaker(rdr.df_ts, 'Tamaki')
df_.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Tamaki,3,0:04:38.58,0:04:40.92,278.58,12285378,280.92,12388572
Tamaki,4,0:04:40.92,0:04:42.81,280.92,12388572,282.81,12471921
Tamaki,5,0:04:42.81,0:04:43.95,282.81,12471921,283.95,12522195
Tamaki,7,0:04:45.10,0:04:49.15,285.1,12572910,289.15,12751514
Tamaki,8,0:04:49.15,0:04:51.18,289.15,12751514,291.18,12841038


In [13]:
inds = df_.index.get_level_values(level=1)
mask, mask_1L = CG.get_consecutive_mask(inds)

In [14]:
grouped = CG.aggregate_consecutive(df_, inds, mask, mask_1L, 'Tamaki')
grouped.head()

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
3,0:04:38.58,0:04:57.55,278.58,12285378,297.55,13121955
18,0:05:20.70,0:05:34.18,320.7,14142870,334.18,14737338
24,0:05:38.47,0:05:43.84,338.47,14926527,343.84,15163344
30,0:05:54.58,0:06:24.92,354.58,15636978,384.92,16974972
43,0:06:32.75,0:06:46.92,392.75,17320275,406.92,17945172


In [19]:
import numpy as np 

# Rebuild the "consecutive" mask by hand: an entry is True when a subtitle
# index is exactly one greater than the index before it.
mask = ((inds[1:] - inds[:-1]) == 1)
# The comparison above is one element shorter than `inds`; prepend a value for
# the first index (which has no predecessor) by repeating the first result.
mask = np.insert(mask, 0, mask[0] == True)
# NOTE(review): this rebinds `mask`, which was first assigned from
# `CG.get_consecutive_mask(inds)`; cells run after this one see this version.
# Confirm that is intended.
mask[:5], inds[:5]

(array([ True,  True,  True, False,  True]),
 Int64Index([3, 4, 5, 7, 8], dtype='int64'))

## Bug and solution

So the masks are constructed correctly. However, we still need a way to tell separate runs of `True`s apart from one another. 
Currently, we do this by comparing line indices in the subtitle file.

However, this is problematic because there may be multiple sequences of `True`s that have no `False`s between them, e.g.

|Index in Original File|Speaker|
|---|---|
|0|A|
|1|A|
|2|B|
|3|A|
|4|A|
|5|B|
|6|A|
|7|A|

And so on. When we separate by speaker, the sequence of indices for speaker A will look like: `[0, 1, 3, 4, 6, 7]`. These split into three pairs of consecutive indices: `(0,1)`, `(3,4)`, and `(6, 7)`, since there is only one entry from B between each pair. Thus, when we construct the mask above, we will get a list of six `True`s, with no `False` marking where one pair ends and the next begins. 

The solution is to use actual times rather than indices. 

In [16]:
next(CG.get_true_groups(mask, mask_1L))

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [17]:
CG.concat_non_consecutive(df_, grouped, mask_1L)

Unnamed: 0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
3,0:04:38.58,1:01:47.75,278.58,12285378,3707.75,163511775
16,0:05:14.01,0:05:16.75,314.01,13847841,316.75,13968675
27,0:05:43.84,0:05:47.32,343.84,15163344,347.32,15316812
55,0:07:16.30,0:07:19.72,436.30,19240830,439.72,19391652
88,0:09:06.07,0:09:09.50,546.07,24081687,549.50,24232950
...,...,...,...,...,...,...
1100,1:01:01.15,1:01:04.80,3661.15,161456715,3664.80,161617680
1106,1:01:21.05,1:01:26.30,3681.05,162334305,3686.30,162565830
1109,1:01:30.75,1:01:34.70,3690.75,162762075,3694.70,162936270
1115,1:01:50.60,1:01:59.50,3710.60,163637460,3719.50,164029950


### Compute durations after grouping

In [None]:
from datetime import datetime, timedelta
import re 

# Timestamps in the subtitle file look like "H:MM:SS.cc", e.g. "0:04:38.58".
# Raw string so the `\d` escapes reach the regex engine untouched; the dot is
# escaped so that it only matches a literal decimal point.
ts_pat = re.compile(r"(\d+):(\d{2}):(\d{2})\.(\d{2})")
def str2time(ts: str, pat=ts_pat) -> timedelta:
    """Convert an ``H:MM:SS.cc`` timestamp string into a ``timedelta``.

    Parameters
    ----------
    ts : str
        Text containing a timestamp such as ``"0:04:38.58"``. The first match
        of `pat` in the string is used.
    pat : compiled regular expression, optional
        Pattern with four capture groups, in order: hours, minutes, seconds
        and hundredths of a second.

    Returns
    -------
    datetime.timedelta
        The duration represented by the timestamp.

    Raises
    ------
    IndexError
        If `pat` does not match anywhere in `ts`.
    """
    h, mm, ss, cs = map(int, pat.findall(ts)[0])
    # The fractional field has two digits, i.e. hundredths of a second.
    # `timedelta` has no centisecond argument, so scale to milliseconds;
    # passing the raw value as milliseconds would turn ".58" into 0.058 s.
    return timedelta(hours=h, minutes=mm, seconds=ss, milliseconds=10 * cs)

def pd_str2time(df: pd.Series) -> pd.Series:
    """Apply `str2time` to every element of a Series of timestamp strings.

    Parameters
    ----------
    df : pd.Series
        Series whose values are timestamp strings accepted by `str2time`.

    Returns
    -------
    pd.Series
        Series with the same index, holding the value `str2time` returned for
        each element.
    """
    converted = df.apply(str2time)
    return converted

def compute_duration(df_ts: pd.DataFrame) -> pd.DataFrame:
    """Compute each row's duration two ways so the results can be compared.

    Parameters
    ----------
    df_ts : pd.DataFrame
        Frame with the columns ``Start``, ``End`` (timestamp strings) and
        ``Start_seconds``, ``End_seconds`` (numeric).

    Returns
    -------
    pd.DataFrame
        Two columns: ``'_seconds'`` (``End_seconds - Start_seconds``) and
        ``End-Start`` (difference of the parsed ``End`` and ``Start`` strings,
        in seconds).
    """
    # Parse the string columns, keeping the Start-then-End order.
    parsed = {col: pd_str2time(df_ts[col]) for col in ('Start', 'End')}

    from_seconds = df_ts['End_seconds'] - df_ts['Start_seconds']
    from_strings = (parsed['End'] - parsed['Start']).dt.total_seconds()

    # Both Series are unnamed, so `concat` labels the columns 0 and 1.
    side_by_side = pd.concat([from_seconds, from_strings], axis=1)
    return side_by_side.rename({0:"'_seconds'", 1:'End-Start'}, axis=1)

compute_duration(grouped).head()

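Note: the `timedelta` route gives wrong answers if the two-digit fractional field of the timestamp (hundredths of a second, e.g. the `.58` in `0:04:38.58`) is passed as `milliseconds` unchanged; it has to be multiplied by 10 first.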

In [None]:
(inds.sort_values() == inds).sum() == inds.shape[0]

## Write grouped audio clips

In [None]:
from VoiceClassifier.read import AudioClip

clip = AudioClip(DATADIR / "ichinose_tamaki_taidan.m4a")

In [None]:
df_samples = grouped.loc[:, ['Start_samples', 'End_samples']]
df_samples.head()

Unnamed: 0,Start_samples,End_samples
12,13121955,13847841
28,15316812,15636978
41,16974972,17320275
52,18658710,19240830
56,19391652,20145321


In [None]:
# print(clip.shape, str(timedelta(seconds=clip.shape[1] / clip.rate)), sep="\t\t")

In [None]:
outdir = DATADIR / 'test'
def write_clip(ind: int, outdir: Path=outdir, df_samples: pd.DataFrame=df_samples, source: AudioClip=clip) -> list:
    """Cut one row's sample range out of `source` and write it to a WAV file.

    Parameters
    ----------
    ind : int
        Positional row number in `df_samples` (used with ``.iloc``).
    outdir : Path
        Directory the file is written to; created if it does not exist.
    df_samples : pd.DataFrame
        Frame whose rows unpack into a ``(start, end)`` pair of sample offsets.
    source : AudioClip
        Audio the range is cut from; its ``rate`` is used as the sampling rate
        of the written file.

    Returns
    -------
    The value returned by ``source.clip(start, end, unit='samples')``.

    Notes
    -----
    The file is named ``"<row label>_<start>-<end>.wav"``. The default values
    of `outdir`, `df_samples` and `source` are bound when this cell is run, so
    re-run the cell after changing any of those notebook variables.
    """
    start, end = df_samples.iloc[ind, :]
    c = source.clip(start, end, unit='samples')
    # Create the target directory up front so the write does not fail on a
    # fresh checkout.
    outdir.mkdir(parents=True, exist_ok=True)
    audiofile.write(
        outdir / f"{df_samples.index[ind]}_{start}-{end}.wav",
        c,
        # Use the rate of the clip actually being cut, not the notebook-level
        # `clip`, so a different `source` is written at its own rate.
        source.rate
    )
    return c 

In [None]:
# Write the first row of `df_samples` to disk and keep the returned samples
# for comparison in the following cell.
ichinose_12to15 = write_clip(0)
# NOTE(review): `write_clip` names its output "<row label>_<start>-<end>.wav",
# so the file read here is not the one written on the line above; presumably
# "ichinose_12-15.wav" is a reference clip that already exists in `outdir`.
# Confirm before relying on this check.
arr, _ = audiofile.read(outdir / "ichinose_12-15.wav")

In [None]:
import numpy as np 
dA = ichinose_12to15 - arr
f"Sum={np.sum(dA):.2e}  Mean={np.mean(dA):.2e}"

## Get audio samples

In [40]:
ichinose = pd.read_csv(DATADIR / "Ichinose_agg-subs.csv", index_col=0)
tamaki = pd.read_csv(DATADIR / "Tamaki_agg-subs.csv", index_col=0)
both_ts = pd.concat([ichinose, tamaki], axis=0, keys=['Ichinose', 'Tamaki'])
both_ts

Unnamed: 0,Unnamed: 1,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Ichinose,1,0:04:35.10,0:04:40.10,275.10,12131910,280.10,12352410
Ichinose,6,0:04:43.95,0:04:45.10,283.95,12522195,285.10,12572910
Ichinose,12,0:04:57.55,0:05:14.01,297.55,13121955,314.01,13847841
Ichinose,17,0:05:16.30,0:05:20.70,316.30,13948830,320.70,14142870
Ichinose,23,0:05:34.18,0:05:38.47,334.18,14737338,338.47,14926527
...,...,...,...,...,...,...,...
Tamaki,1106,1:01:21.05,1:01:26.30,3681.05,162334305,3686.30,162565830
Tamaki,1109,1:01:30.75,1:01:34.70,3690.75,162762075,3694.70,162936270
Tamaki,1112,1:01:41.60,1:01:47.75,3701.60,163240560,3707.75,163511775
Tamaki,1115,1:01:50.60,1:01:59.50,3710.60,163637460,3719.50,164029950


In [35]:
both_ts = pd.read_csv(
    DATADIR / "ichinose_tamaki_taidan_timestamps.csv", 
    index_col=[0,1]
).sort_index()

both_ts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Start,End,Start_seconds,Start_samples,End_seconds,End_samples
Speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ichinose,3,0:00:08.85,0:00:10.00,8.85,390285,10.0,441000
Ichinose,9,0:00:22.45,0:00:29.11,22.45,990045,29.11,1283751
Ichinose,10,0:00:29.11,0:00:33.82,29.11,1283751,33.82,1491462
Ichinose,11,0:00:33.82,0:00:36.39,33.82,1491462,36.39,1604799
Ichinose,12,0:00:36.39,0:00:38.91,36.39,1604799,38.91,1715930


In [None]:
def get_random_sample(df: pd.DataFrame, n_samples=5, outdir=None, source: AudioClip=clip, random_state=None) -> None:
    """Write a random selection of clips per speaker to WAV files.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by a two-level MultiIndex whose first level is named
        ``Speaker``, with ``Start_samples`` and ``End_samples`` columns.
    n_samples : int, optional
        Number of rows to draw per speaker. If a speaker has fewer rows, all
        of that speaker's rows are used.
    outdir : Path, optional
        Output directory; defaults to ``DATADIR / 'test'``. Created if missing.
    source : AudioClip
        Audio the clips are cut from. The default is bound when this cell is
        run.
    random_state : optional
        Passed to ``DataFrame.sample`` for every speaker; set it to make the
        selection reproducible. ``None`` keeps the draw unseeded.

    Returns
    -------
    None
        Files are written as ``"<speaker>_<second index level>.wav"``; a file
        that already exists is left untouched.
    """
    if outdir is None:
        outdir = DATADIR / 'test'
    outdir.mkdir(parents=True, exist_ok=True)

    speakers = df.index.get_level_values('Speaker').unique()

    for speaker in speakers:
        rows = df.loc(axis=0)[speaker, :]
        # `sample` raises if asked for more rows than exist, so cap the size.
        ts = rows.\
            sample(n=min(n_samples, len(rows)), random_state=random_state).\
            sort_index()

        print(ts)
        ts = ts.loc[:, ['Start_samples', 'End_samples']]

        for i in range(len(ts)):
            outp = outdir / f"{speaker}_{ts.index[i][1]}.wav"

            # Check before clipping so existing files cost no audio work.
            if outp.is_file():
                continue

            start, end = ts.iloc[i, :]
            sample = source.clip(start, end, unit='samples')
            audiofile.write(outp, sample, source.rate)

        

get_random_sample(both_ts, n_samples=2)