In [2]:
import audiofile
import numpy as np 
import pandas as pd 
from pathlib import Path 
import librosa.feature as LF 

datadir = Path.cwd() / 'data'

In [3]:
from VoiceClassifier.common import AudioClip

# File-name stem shared by the audio file and the subtitle tables of this recording.
prefix = r"ichinose_tamaki_taidan"
# Open <datadir>/<prefix>.m4a through the project's AudioClip class.
clip = AudioClip(datadir / f"{prefix}.m4a")

In [4]:
# Sample-wise difference between the two channels (rows 0 and 1 of clip.data).
# The sum and mean displayed below summarise how far apart the channels are overall.
dChannel = (clip.data[0,:] - clip.data[1,:])
np.sum(dChannel), np.mean(dChannel)

(19.769806, 1.135981e-07)

In [24]:
# Segments of all speakers in chronological order.
# NOTE(review): df_ts (and its 'Duration (s)' column shown in the output) is only
# created in the "Process subtitles" cells further down, so this cell fails on a
# fresh top-to-bottom run — consider moving it below those cells.
df_ts.sort_values('Start_samples')

Unnamed: 0,Speaker,ASS_index,Start_samples,End_samples,Duration (s)
343,1,3,12285378,12522195,5.37
0,0,6,12522195,12572910,1.15
344,1,7,12572910,13121955,12.45
1,0,12,13121955,13847841,16.46
345,1,16,13847841,13968675,2.74
...,...,...,...,...,...
340,0,1114,163511775,163637460,2.85
647,1,1115,163637460,164029950,8.90
341,0,1116,164029950,164157840,2.90
648,1,1117,164197530,164259270,1.40


In [27]:
# Length in samples of the earliest segment: 12522195 is the start of the
# second-earliest segment in the sorted table above, which is also where the
# earliest one ends.
12522195-df_ts['Start_samples'].min()

236817

In [28]:
# Cut out the earliest segment (start 12285378, length 236817 samples — both values
# come from the cells above) and write it to a WAV file, presumably so the
# timestamps can be spot-checked by listening.
test = clip.clip(12285378, 236817+12285378)
audiofile.write("./test.wav", test, clip.rate)

## Process subtitles 

1. Merge all speakers' data into a single dataframe
2. Remove clips that are 1 s or shorter in duration 
3. Descriptive statistics for all speakers
4. Find silent clips

In [38]:
def load_data(prefix: str) -> tuple[pd.DataFrame, dict[str, int]]:
    """Load the merged subtitle timestamps for ``prefix`` and integer-encode the speakers.

    Reads ``<datadir>/<prefix>_merge-subs.csv`` with its first two columns as a
    two-level index, names those levels ``Speaker`` and ``ASS_index``, turns them
    back into ordinary columns and keeps only the four columns listed below.

    Parameters
    ----------
    prefix : str
        File-name stem of the recording; the CSV is looked up in the
        module-level ``datadir``.

    Returns
    -------
    df_ts : pd.DataFrame
        Columns ``Speaker`` (integer code), ``ASS_index``, ``Start_samples``
        and ``End_samples``.
    encoding : dict[str, int]
        Mapping from each speaker label to its integer code, numbered in order
        of first appearance in the table.
    """
    df_ts = (
        pd.read_csv(datadir / f"{prefix}_merge-subs.csv", index_col=[0, 1])
        .rename_axis(index=["Speaker", "ASS_index"])
        .reset_index()
        .loc[:, ['Speaker', 'ASS_index', 'Start_samples', 'End_samples']]
    )

    encoding: dict[str, int] = {
        name: code for code, name in enumerate(df_ts['Speaker'].unique())
    }
    # Use map rather than replace: replace() on an object column only yields an
    # integer column through the silent downcast that pandas 2.2 deprecates,
    # whereas map() builds the result dtype directly from the mapped values.
    df_ts['Speaker'] = df_ts['Speaker'].map(encoding)
    return df_ts, encoding

# Load the timestamp table for the current recording.
df_ts, encoding = load_data(prefix)

# Show the speaker -> code mapping first, then the table itself.
print(encoding)
df_ts

{'Ichinose': 0, 'Tamaki': 1}


Unnamed: 0,Speaker,ASS_index,Start_samples,End_samples
0,0,6,12522195,12572910
1,0,12,13121955,13847841
2,0,17,13948830,14142870
3,0,23,14737338,14926527
4,0,28,15316812,15636978
...,...,...,...,...
644,1,1106,162334305,162565830
645,1,1109,162762075,162936270
646,1,1112,163240560,163511775
647,1,1115,163637460,164029950


In [39]:
# Share of segments per speaker code (fractions of the total, not raw counts).
df_ts['Speaker'].value_counts(normalize=True)

0    0.528505
1    0.471495
Name: Speaker, dtype: float64

In [42]:
# Segment length in seconds: span in samples divided by the clip's sample rate.
df_ts['Duration (s)'] = (df_ts['End_samples'] - df_ts['Start_samples']) / clip.rate 
# Keep only segments strictly longer than 1 s and renumber the rows from 0.
df_ts = df_ts.loc[df_ts['Duration (s)'] > 1, :].reset_index(drop=True)
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Speaker        632 non-null    int64  
 1   ASS_index      632 non-null    int64  
 2   Start_samples  632 non-null    int64  
 3   End_samples    632 non-null    int64  
 4   Duration (s)   632 non-null    float64
dtypes: float64(1), int64(4)
memory usage: 24.8 KB


In [43]:
# Per-speaker summary statistics of the segment durations.
(
    df_ts[['Speaker', 'Duration (s)']]
    .groupby('Speaker')
    .agg(['mean', 'std', 'max', 'min'])
)

Unnamed: 0_level_0,Duration (s),Duration (s),Duration (s),Duration (s)
Unnamed: 0_level_1,mean,std,max,min
Speaker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5.361692,4.30438,28.35,1.049977
1,5.205415,5.197915,36.15,1.049977


In [44]:
# Persist the filtered segment table next to the source data.
df_ts.to_parquet(datadir / f"{prefix}_merge-subs_processed.parquet")

In [45]:
# Read back the parquet file written by the previous cell to confirm the round
# trip (the earlier read targeted a .feather file that this notebook never writes).
pd.read_parquet(datadir / f"{prefix}_merge-subs_processed.parquet")

Unnamed: 0,Speaker,ASS_index,Start_samples,End_samples,Duration (s)
0,0,6,12522195,12572910,1.15
1,0,12,13121955,13847841,16.46
2,0,17,13948830,14142870,4.40
3,0,23,14737338,14926527,4.29
4,0,28,15316812,15636978,7.26
...,...,...,...,...,...
627,1,1106,162334305,162565830,5.25
628,1,1109,162762075,162936270,3.95
629,1,1112,163240560,163511775,6.15
630,1,1115,163637460,164029950,8.90


## Test feature extraction

In [5]:
def get_ts(ind: int, df=None) -> tuple[int, int]:
    """Return the ``(Start_samples, End_samples)`` pair of the segment ``ind``.

    Parameters
    ----------
    ind : int
        Value to look up in the ``ASS_index`` column.
    df : pandas.DataFrame, optional
        Table with ``ASS_index``, ``Start_samples`` and ``End_samples``
        columns. When omitted, the module-level ``df_ts`` is used.

    Returns
    -------
    tuple
        Start and end sample of the first row whose ``ASS_index`` equals ``ind``.

    Raises
    ------
    IndexError
        If no row has ``ASS_index == ind``.
    """
    if df is None:
        # Resolve the default at call time so that re-assigning df_ts in a later
        # cell is picked up instead of a frame captured when this cell ran.
        df = df_ts
    rows = df.loc[df.ASS_index == ind, ['Start_samples', 'End_samples']]
    if rows.empty:
        raise IndexError(f"no row with ASS_index == {ind}")
    start, end = rows.values[0]
    return start, end

def double_mean(X: np.ndarray) -> np.ndarray:
    """Average ``X`` over its first axis, then over axis 1 of that result.

    Axis 1 of the intermediate result is axis 2 of ``X``, so ``X`` needs at
    least three dimensions; for a 3-D input only the middle axis survives.
    """
    first_axis_avg = np.mean(X, axis=0)
    return np.mean(first_axis_avg, axis=1)

In [6]:
# Cut out the segment with ASS_index 12 to use as the sample for the feature checks below.
test = clip.clip(*get_ts(12))
test.shape

(2, 725886)

In [15]:
# Two ways of collapsing the MFCCs of the clip into one vector:
# c1 averages the channels first and then the MFCC output over axis 1,
# c2 computes the MFCCs on the multi-channel signal and averages afterwards.
mono = np.mean(test, axis=0)
mfcc_mono = LF.mfcc(y=mono, sr=clip.rate, n_mfcc=40)
c1 = np.mean(mfcc_mono, axis=1)

mfcc_multi = LF.mfcc(y=test, sr=clip.rate, n_mfcc=40)
c2 = double_mean(mfcc_multi.T)

# Only the shapes are compared here, not the values.
assert c1.shape == c2.shape

In [65]:
# Length of the averaged MFCC feature vector (one entry per requested coefficient, n_mfcc=40).
double_mean(LF.mfcc(y=test, sr=clip.rate, n_mfcc=40).T).shape

(40,)

In [56]:
# `import librosa.feature as LF` binds only the name LF, so the top-level package
# has to be imported explicitly before librosa.stft can be called.
import librosa

# Magnitude of the short-time Fourier transform of the test clip.
stft = np.abs(librosa.stft(test))
stft.shape

(2, 1025, 1418)

In [70]:
# Length of the averaged chroma feature vector, computed from the magnitude spectrogram above.
double_mean(LF.chroma_stft(S=stft, sr=clip.rate).T).shape

(12,)

In [67]:
# Length of the averaged mel-spectrogram feature vector.
double_mean(LF.melspectrogram(y=test, sr=clip.rate).T).shape

(128,)

In [69]:
# Length of the averaged spectral-contrast feature vector.
double_mean(LF.spectral_contrast(y=test, sr=clip.rate).T).shape

(7,)

In [72]:
from librosa.effects import harmonic 
# Length of the averaged tonnetz feature vector, computed on the harmonic part of the clip.
double_mean(LF.tonnetz(y=harmonic(test), sr=clip.rate).T).shape

(6,)

In [29]:
# NOTE(review): ./test.feather is not written by any cell visible here — presumably
# it comes from a separate feature-extraction step; confirm its origin.
pd.read_feather("./test.feather")

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column184,Column185,Column186,Column187,Column188,Column189,Column190,Column191,Column192,Column193
0,-336.255360,112.006787,-24.189994,30.174576,-11.558035,9.367257,-15.855213,1.638036e+01,-9.819607e+00,-11.241823,...,20.965327,19.835727,25.959488,38.879971,-0.010536,0.056836,0.031384,-0.023985,0.008968,-0.010219
1,-336.255360,112.006787,-24.189994,30.174576,-11.558035,9.367257,-15.855213,1.638036e+01,-9.819607e+00,-11.241823,...,20.965327,19.835727,25.959488,38.879971,-0.010536,0.056836,0.031384,-0.023985,0.008968,-0.010219
2,-29.676573,-27.919357,0.201476,0.003898,0.000182,0.000648,0.007141,4.912706e-04,2.273150e-06,-0.031090,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,-0.478521,-12.067139,0.054067,0.004781,0.000449,0.000084,0.003688,1.365639e-04,1.452152e-08,-0.021875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,-13.533205,7.519932,0.057429,0.024778,0.005826,0.000052,0.002244,1.476291e-04,1.410149e-09,-346.852783,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.028894,-25.112263,0.013253,0.001073,0.000016,0.002990,0.000108,5.413733e-09,1.936615e+01,-29.700314,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
596,-9.623293,-16.937656,0.002506,0.000625,0.000096,0.004845,0.000232,1.511773e-06,4.286074e+01,23.854755,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
597,-22.272326,-0.066827,0.019499,0.000027,0.000181,0.035168,0.001723,4.599518e-06,3.501115e+01,17.085579,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
598,-17.844868,0.132179,0.020918,0.000056,0.000073,0.041434,0.006630,1.693200e-06,4.660112e-02,-15.453876,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
