First thing: check that this is running on the right computer:

In [1]:
import socket
assert socket.gethostname() == "lcs098116"

In [2]:
from pathlib import Path
base_path = Path('/media/storage/jim/speech-syn/Corpora')

Some imports for later:

In [3]:
from pydub import AudioSegment
import numpy as np
import datetime
import pandas as pd

In [4]:
df = pd.DataFrame(columns=["name", "seconds", "hms"])
df_cnt = 0

In [5]:
def sum_path_by_type(path, pattern):
    acc = []
    for audio in path.glob(pattern):
        audio = AudioSegment.from_file(str(audio))
        acc.append(audio.duration_seconds)
    return np.sum(acc)

In [6]:
CO = base_path / "ga_CO"/ "pmg"

In [43]:
paths = []
relevant_paths = {}
for entry in CO.glob("RC[A-Z]*"):
    subpath = entry / "ogg"
    if subpath.is_dir():
        relevant_paths[str(subpath)] = entry.name
        paths.append(subpath)

In [11]:
all_paths = [CO / "AllOriginalWavFiles",
             CO / "AllTrimmedWavFiles44kHz",
             CO / "RC_ALL" / "wav",
             CO / "RC_ALL_141118" / "wav44",
             CO / "RC_ALL_141118" / "wav"]

In [34]:
short_paths =  ["AllOriginalWavFiles",
                "AllTrimmedWavFiles44kHz",
                "RC_ALL/wav",
                "RC_ALL_141118/wav44",
                "RC_ALL_141118/wav"]

In [37]:
dir_to_short = {str(k): v for (k, v) in zip(all_paths, short_paths)}

In [39]:
def get_wave_stems(dir):
    ret = []
    for f in dir.glob("*.wav"):
        ret.append(f.stem)
    return ret

In [40]:
stems = {str(d): get_wave_stems(d) for d in all_paths}

In [None]:
def get_ogg_stems(dir):
    ret = []
    for f in dir.glob("*.ogg"):
        ret.append(f.stem)
    return ret

def is_in(a, b):
    return set(a).issubset(b)

comb_data = []
for pth in paths:
    pth_stems = get_ogg_stems(pth)
    pth_bools = {dir_to_short[k]: is_in(pth_stems, v) for (k, v) in stems.items()}
    _dir_piece = [relevant_paths[str(pth)]]
    _just_bools = [v for (k, v) in pth_bools.items()]
    _joined = _dir_piece + _just_bools
    comb_data.append(_joined)

The table below shows which of the individual `ogg` directories are completely covered (= `True`) by the `wav` file directories.

In [47]:
import pandas as pd
pd.DataFrame(comb_data, columns=["Path"] + short_paths)

Unnamed: 0,Path,AllOriginalWavFiles,AllTrimmedWavFiles44kHz,RC_ALL/wav,RC_ALL_141118/wav44,RC_ALL_141118/wav
0,RCNamedEntities02,True,True,True,True,True
1,RCPiarsachBoithre,True,True,True,True,True
2,RCPiarsachDeargadaol,True,True,True,True,True
3,RCPiarsachMhathair,True,True,True,True,True
4,RCPiarsachIosagan,True,True,True,True,True
5,RCPiarsachBairbre,True,True,True,True,True
6,RCNamedEntities01,True,True,True,True,True
7,RCPiarsachBrid,True,True,True,True,True
8,RCPiarsachEoghainin,False,False,False,False,False
9,RCNuachtRTEMay,True,True,True,True,True


In [52]:
totals = {}
for subdir in paths:
    dirname = relevant_paths[str(subdir)]
    totals[dirname] = sum_path_by_type(subdir, "*.ogg")

In [53]:
totals_hms = {a: str(datetime.timedelta(seconds=b)) for (a, b) in totals.items()}

In [54]:
for dir in totals_hms.keys():
    print(f"{dir}: {totals[dir]} seconds ({totals_hms[dir]})")

RCNamedEntities02: 771.496 seconds (0:12:51.496000)
RCPiarsachBoithre: 1564.104 seconds (0:26:04.104000)
RCPiarsachDeargadaol: 530.072 seconds (0:08:50.072000)
RCPiarsachMhathair: 725.456 seconds (0:12:05.456000)
RCPiarsachIosagan: 1134.2559999999999 seconds (0:18:54.256000)
RCPiarsachBairbre: 2151.608 seconds (0:35:51.608000)
RCNamedEntities01: 371.096 seconds (0:06:11.096000)
RCPiarsachBrid: 584.0639999999999 seconds (0:09:44.064000)
RCPiarsachEoghainin: 1573.4 seconds (0:26:13.400000)
RCNuachtRTEMay: 1913.512 seconds (0:31:53.512000)
RCNuachtRTE1-1707: 1433.952 seconds (0:23:53.952000)
RCPiarsachBheanchaointe: 2568.024 seconds (0:42:48.024000)
RCAlphaBet: 40.648 seconds (0:00:40.648000)
RCNuachtRTE16-3006: 761.928 seconds (0:12:41.928000)
RCPiarsachSagart: 813.232 seconds (0:13:33.232000)
RCNuachtRTE1-1506: 1034.456 seconds (0:17:14.456000)
RCPiarsachGadai: 945.2639999999999 seconds (0:15:45.264000)


In [55]:
for scdir in totals_hms.keys():
    df.loc[df_cnt] = [f"{scdir}", totals[scdir], totals_hms[scdir]]
    df_cnt += 1

In [56]:
df.to_csv("co-pmg.csv")

In [65]:
df.to_excel("co-pmg.xlsx", engine='openpyxl')

In [57]:
from IPython.display import display, HTML
display(HTML(df.to_html()))

Unnamed: 0,name,seconds,hms
0,RCNamedEntities02,771.496,0:12:51.496000
1,RCPiarsachBoithre,1564.104,0:26:04.104000
2,RCPiarsachDeargadaol,530.072,0:08:50.072000
3,RCPiarsachMhathair,725.456,0:12:05.456000
4,RCPiarsachIosagan,1134.256,0:18:54.256000
5,RCPiarsachBairbre,2151.608,0:35:51.608000
6,RCNamedEntities01,371.096,0:06:11.096000
7,RCPiarsachBrid,584.064,0:09:44.064000
8,RCPiarsachEoghainin,1573.4,0:26:13.400000
9,RCNuachtRTEMay,1913.512,0:31:53.512000


In [58]:
total_all = np.sum(df['seconds'])
total_all_hms = str(datetime.timedelta(seconds=total_all))
print(f'Total: {total_all} ({total_all_hms})')

Total: 18916.568 (5:15:16.568000)
