First thing: check that this is running on the right computer:

In [1]:
import socket
# Refuse to run anywhere but the intended machine, and say which host was
# found so a failure is self-explanatory.
_expected_host = "lcs098116"
_actual_host = socket.gethostname()
assert _actual_host == _expected_host, (
    f"This notebook must run on {_expected_host!r}, "
    f"but the current host is {_actual_host!r}"
)

In [2]:
from pathlib import Path
# Root directory under which all corpus directories are looked up.
base_path = Path("/media/storage/jim/speech-syn/Corpora")

Some imports for later:

In [3]:
from pydub import AudioSegment
import numpy as np
import datetime
import pandas as pd

In [4]:
# Index at which the next row of the summary table will be written.
df_cnt = 0
# Empty summary table; rows are added further down, one per entry in `totals`.
df = pd.DataFrame(
    columns=[
        "name",
        "seconds",
        "hms",
    ]
)

In [5]:
def sum_path_by_type(path, pattern):
    """Add up the durations of the audio files in ``path`` matching ``pattern``.

    Every match of ``path.glob(pattern)`` is opened with
    ``AudioSegment.from_file`` and its ``duration_seconds`` is collected.

    Returns the ``np.sum`` of the collected durations (in seconds).
    """
    durations = [
        AudioSegment.from_file(str(audio_file)).duration_seconds
        for audio_file in path.glob(pattern)
    ]
    return np.sum(durations)

In [6]:
# Directory that holds the wav collections and the RC* sub-directories used below.
CO = base_path.joinpath("ga_CO", "pmg")

In [43]:
# Collect the "ogg" sub-directory of every RC* entry under CO:
#   paths          -> list of those ogg directories
#   relevant_paths -> str(ogg directory) -> name of the RC* entry it belongs to
paths = []
relevant_paths = {}
for rc_dir in CO.glob("RC[A-Z]*"):
    ogg_dir = rc_dir / "ogg"
    if not ogg_dir.is_dir():
        continue
    paths.append(ogg_dir)
    relevant_paths[str(ogg_dir)] = rc_dir.name

In [11]:
# The wav directories (below CO) that the ogg directories are compared against.
all_paths = [
    CO.joinpath(*parts)
    for parts in (
        ("AllOriginalWavFiles",),
        ("AllTrimmedWavFiles44kHz",),
        ("RC_ALL", "wav"),
        ("RC_ALL_141118", "wav44"),
        ("RC_ALL_141118", "wav"),
    )
]

In [34]:
# Short labels for the wav directories, in the same order as `all_paths`;
# they are zipped with `all_paths` and used as table column headers.
short_paths = [
    "AllOriginalWavFiles",
    "AllTrimmedWavFiles44kHz",
    "RC_ALL/wav",
    "RC_ALL_141118/wav44",
    "RC_ALL_141118/wav",
]

In [37]:
# Map str(wav directory) -> its short label.
dir_to_short = dict(zip(map(str, all_paths), short_paths))

In [39]:
def get_wave_stems(dir):
    """Return the stems (names without the extension) of the ``*.wav`` files directly in ``dir``."""
    return [wav_file.stem for wav_file in dir.glob("*.wav")]

In [40]:
# str(wav directory) -> list of wav file stems found in it.
stems = {}
for wav_dir in all_paths:
    stems[str(wav_dir)] = get_wave_stems(wav_dir)

In [None]:
def get_ogg_stems(dir):
    """Return the stems (names without the extension) of the ``*.ogg`` files directly in ``dir``."""
    return [ogg_file.stem for ogg_file in dir.glob("*.ogg")]

def is_in(a, b):
    """Return True when every element of ``a`` also occurs in ``b``."""
    return set(a) <= set(b)

# One row per ogg directory: the RC* entry name followed by one boolean per
# wav directory saying whether all ogg stems also exist there as wav stems.
comb_data = []
for ogg_dir in paths:
    ogg_stems = get_ogg_stems(ogg_dir)
    covered_by = {}
    for wav_dir, wav_stems in stems.items():
        covered_by[dir_to_short[wav_dir]] = is_in(ogg_stems, wav_stems)
    comb_data.append([relevant_paths[str(ogg_dir)], *covered_by.values()])

The table below shows which of the individual `ogg` directories are completely covered (= `True`) by the `wav` file directories.

In [47]:
import pandas as pd
# Left as the last expression of the cell so the notebook renders the table.
pd.DataFrame(data=comb_data, columns=["Path", *short_paths])

Unnamed: 0,Path,AllOriginalWavFiles,AllTrimmedWavFiles44kHz,RC_ALL/wav,RC_ALL_141118/wav44,RC_ALL_141118/wav
0,RCNamedEntities02,True,True,True,True,True
1,RCPiarsachBoithre,True,True,True,True,True
2,RCPiarsachDeargadaol,True,True,True,True,True
3,RCPiarsachMhathair,True,True,True,True,True
4,RCPiarsachIosagan,True,True,True,True,True
5,RCPiarsachBairbre,True,True,True,True,True
6,RCNamedEntities01,True,True,True,True,True
7,RCPiarsachBrid,True,True,True,True,True
8,RCPiarsachEoghainin,False,False,False,False,False
9,RCNuachtRTEMay,True,True,True,True,True


In [17]:
# Total audio duration (seconds) per directory directly under base_path.
totals = {}
# Directories under base_path that are left out of the totals.
skipdirs = ["CloIarChonnacht", "no-subtitles-kids", "lm-data", "scripts", "unsorted-march", "coislife"]
for subdir in base_path.glob('*'):
    # Use the full directory name. `.stem` drops everything after the last
    # dot, so a directory like "foo.bar" would be keyed and skip-checked as
    # "foo", and two such directories could overwrite each other's total.
    dirname = subdir.name
    if not subdir.is_dir() or dirname in skipdirs:
        continue
    # RnaG is summed over its mp3 files; every other directory over wav files.
    pattern = "*.mp3" if dirname == "RnaG" else "*.wav"
    totals[dirname] = sum_path_by_type(subdir, pattern)

In [18]:
# Same keys as `totals`, with each duration rendered as str(timedelta).
totals_hms = {
    name: str(datetime.timedelta(seconds=secs))
    for name, secs in totals.items()
}

In [19]:
# The loop variable is deliberately not called `dir`: at notebook top level
# that name would replace the builtin `dir()` for the rest of the session.
for subdir_name, subdir_hms in totals_hms.items():
    print(f"{subdir_name}: {totals[subdir_name]} seconds ({subdir_hms})")

dinotrain: 14779.008687500002 seconds (4:06:19.008688)
wallaceandgromit: 1753.088 seconds (0:29:13.088000)
olivia: 25482.027499999997 seconds (7:04:42.027500)
catahata: 38876.886375 seconds (10:47:56.886375)
gearoidnagaisce: 30558.4431875 seconds (8:29:18.443188)
astroblast: 23867.09375 seconds (6:37:47.093750)
saolfaoishraid: 17351.061875 seconds (4:49:11.061875)
amhrannamara: 5370.0906875 seconds (1:29:30.090688)
RnaG: 6864.117551020408 seconds (1:54:24.117551)
lurgan2k17: 700.565375 seconds (0:11:40.565375)
qpootle: 1616.085375 seconds (0:26:56.085375)
niko: 4669.0986875 seconds (1:17:49.098687)
dora: 9678.5066875 seconds (2:41:18.506687)
rosnarun: 42406.0379375 seconds (11:46:46.037937)
wac: 13111.8935 seconds (3:38:31.893500)
garfield: 54822.1880625 seconds (15:13:42.188062)
harveybeaks: 21050.4538125 seconds (5:50:50.453812)
whizsachistin: 24511.5950625 seconds (6:48:31.595063)
toirbeir: 1471.104 seconds (0:24:31.104000)
spongebob: 3656.4054375 seconds (1:00:56.405438)
bealoideas

In [21]:
# Build the summary table in one step from `totals` / `totals_hms` instead of
# growing it row by row with a manual counter. Re-running this cell now gives
# the same table again rather than appending duplicate rows.
df = pd.DataFrame(
    {
        "name": [str(scdir) for scdir in totals_hms],
        "seconds": [totals[scdir] for scdir in totals_hms],
        "hms": list(totals_hms.values()),
    },
    columns=["name", "seconds", "hms"],
)
# Keep the row counter consistent with the table.
df_cnt = len(df)

In [22]:
# Write the summary table to a CSV file; the relative path is resolved
# against the current working directory.
df.to_csv(path_or_buf="unaligned.csv")

In [23]:
from IPython.display import display, HTML
# Render the summary table as an HTML table in the cell output.
summary_html = df.to_html()
display(HTML(summary_html))

Unnamed: 0,name,seconds,hms
0,dinotrain,14779.008688,4:06:19.008688
1,wallaceandgromit,1753.088,0:29:13.088000
2,olivia,25482.0275,7:04:42.027500
3,catahata,38876.886375,10:47:56.886375
4,gearoidnagaisce,30558.443188,8:29:18.443188
5,astroblast,23867.09375,6:37:47.093750
6,saolfaoishraid,17351.061875,4:49:11.061875
7,amhrannamara,5370.090688,1:29:30.090688
8,RnaG,6864.117551,1:54:24.117551
9,lurgan2k17,700.565375,0:11:40.565375


In [24]:
# Grand total over the "seconds" column, also shown as a timedelta string.
total_all = np.sum(df["seconds"])
total_span = datetime.timedelta(seconds=total_all)
total_all_hms = str(total_span)
print(f'Total: {total_all} ({total_all_hms})')

Total: 344071.44223852037 (3 days, 23:34:31.442239)
