First thing: check that this is running on the right computer:

In [1]:
import socket
assert socket.gethostname() == "lcs098116"

In [2]:
from pathlib import Path
base_path = Path("/media/storage/jim/Playing/unlabelled/")

Some imports for later:

In [3]:
from pydub import AudioSegment
import numpy as np
import datetime
import pandas as pd

In [4]:
df = pd.DataFrame(columns=["name", "seconds", "hms"])
df_cnt = 0

In [5]:
def sum_path_by_type(path, pattern):
    acc = []
    for audio in path.glob(pattern):
        audio = AudioSegment.from_file(str(audio))
        acc.append(audio.duration_seconds)
    return np.sum(acc)

# BBC clips

In [6]:
bbc = base_path / "bbc"

In [7]:
bbc_audio_total = sum_path_by_type(bbc, "*.m4a")

In [8]:
bbc_audio_hms = str(datetime.timedelta(seconds=bbc_audio_total))

In [9]:
bbc_video_total = sum_path_by_type(bbc, "*.mp4")
bbc_video_hms = str(datetime.timedelta(seconds=bbc_video_total))

In [10]:
bbc_combined_total = bbc_audio_total + bbc_video_total
bbc_combined_hms = str(datetime.timedelta(seconds=bbc_combined_total))

In [11]:
print(f"BBC audio: {bbc_audio_total} seconds ({bbc_audio_hms})")
print(f"BBC video: {bbc_video_total} seconds ({bbc_video_hms})")
print(f"BBC combined: {bbc_combined_total} seconds ({bbc_combined_hms})")

BBC audio: 41793.49333333334 seconds (11:36:33.493333)
BBC video: 70314.97941043084 seconds (19:31:54.979410)
BBC combined: 112108.47274376417 seconds (1 day, 7:08:28.472744)


In [12]:
df.loc[df_cnt] = ['BBC audio', bbc_audio_total, bbc_audio_hms]
df_cnt += 1
df.loc[df_cnt] = ['BBC video', bbc_video_total, bbc_video_hms]
df_cnt += 1

# RnaG

In [13]:
rnag = base_path / "podcasts" / "rnag"
rnag_total = sum_path_by_type(rnag, "*.mp3")
rnag_hms = str(datetime.timedelta(seconds=rnag_total))

In [14]:
print(f"RnaG podcasts: {rnag_total} seconds ({rnag_hms})")
df.loc[df_cnt] = ['RnaG podcasts', rnag_total, rnag_hms]
df_cnt += 1

RnaG podcasts: 629406.0146938775 seconds (7 days, 6:50:06.014694)


# Soundcloud

In [15]:
def sum_path_by_type_l(path, patterns):
    acc = []
    for pattern in patterns:
        for audio in path.glob(pattern):
            audio = AudioSegment.from_file(str(audio))
            acc.append(audio.duration_seconds)
    return np.sum(acc)

In [16]:
pats = ["*.mp3", "*.MP3", "*.wav", "*.m4a"]
scdirs = ["clubleabhar", "coislife", "forasnagaeilge", "nos", "raidiofailte", "rnag", "rnl", "tuairisc"]

soundcloud_totals = {}
sc_path = base_path / "soundcloud"
for scdir in scdirs:
    cur_path = sc_path / scdir
    aud_total = sum_path_by_type_l(cur_path, pats)
    soundcloud_totals[scdir] = aud_total

In [17]:
soundcloud_hms = {}
for scdir in soundcloud_totals.keys():
    soundcloud_hms[scdir] = str(datetime.timedelta(seconds=soundcloud_totals[scdir]))
    print(f"{scdir} soundcloud: {soundcloud_totals[scdir]} seconds ({soundcloud_hms[scdir]})")

clubleabhar soundcloud: 90873.72494125566 seconds (1 day, 1:14:33.724941)
coislife soundcloud: 249077.49430839 seconds (2 days, 21:11:17.494308)
forasnagaeilge soundcloud: 54686.50995464853 seconds (15:11:26.509955)
nos soundcloud: 71400.43099773242 seconds (19:50:00.430998)
raidiofailte soundcloud: 964028.171941043 seconds (11 days, 3:47:08.171941)
rnag soundcloud: 8012.71469387755 seconds (2:13:32.714694)
rnl soundcloud: 2468330.290506661 seconds (28 days, 13:38:50.290507)
tuairisc soundcloud: 43831.89979591836 seconds (12:10:31.899796)


In [18]:
for scdir in soundcloud_totals.keys():
    df.loc[df_cnt] = [f"{scdir} soundcloud", soundcloud_totals[scdir], soundcloud_hms[scdir]]
    df_cnt += 1

In [19]:
df.to_csv("unlabelled.csv")

In [21]:
from IPython.display import display, HTML
display(HTML(df.to_html()))

Unnamed: 0,name,seconds,hms
0,BBC audio,41793.49,11:36:33.493333
1,BBC video,70314.98,19:31:54.979410
2,RnaG podcasts,629406.0,"7 days, 6:50:06.014694"
3,clubleabhar soundcloud,90873.72,"1 day, 1:14:33.724941"
4,coislife soundcloud,249077.5,"2 days, 21:11:17.494308"
5,forasnagaeilge soundcloud,54686.51,15:11:26.509955
6,nos soundcloud,71400.43,19:50:00.430998
7,raidiofailte soundcloud,964028.2,"11 days, 3:47:08.171941"
8,rnag soundcloud,8012.715,2:13:32.714694
9,rnl soundcloud,2468330.0,"28 days, 13:38:50.290507"


# Corpora/TG4

Location: `/media/storage/jim/speech-syn/Corpora/TG4`

In [22]:
base2 = Path('/media/storage/jim/speech-syn/Corpora/TG4')

In [23]:
kids_speech = base2 / "no-subtitles-kids"

In [24]:
ks_dirs = []
for dir in kids_speech.glob("*"):
    if dir.is_dir():
        ks_dirs.append(dir.stem)

In [28]:
totals = {}
totals_hms = {}
for dir in ks_dirs:
    totals[dir] = sum_path_by_type(kids_speech / dir, "*.wav")
    totals_hms[dir] = str(datetime.timedelta(seconds=totals[dir]))

In [29]:
for dir in ks_dirs:
    print(f"TG4 kids {dir}: {totals[dir]} seconds ({totals_hms[dir]})")

TG4 kids leirmheasnanog: 813.0347499999999 seconds (0:13:33.034750)
TG4 kids bialinn: 6166.8481875 seconds (1:42:46.848187)
TG4 kids culaclab: 1698.32575 seconds (0:28:18.325750)
TG4 kids cluichicula: 15772.39525 seconds (4:22:52.395250)


In [30]:
for scdir in totals.keys():
    df.loc[df_cnt] = [f"TG4 kids {scdir}", totals[scdir], totals_hms[scdir]]
    df_cnt += 1

In [32]:
cic = base2 / "CloIarChonnacht"

In [33]:
cic_totals = sum_path_by_type(cic, "*.mp3")
cic_hms = str(datetime.timedelta(seconds=cic_totals))

In [34]:
print(f"Cló Iar-Chonnacht: {cic_totals} seconds ({cic_hms})")

Cló Iar-Chonnacht: 41162.515396825394 seconds (11:26:02.515397)


In [35]:
df.loc[df_cnt] = [f"Cló Iar-Chonnacht", cic_totals, cic_hms]
df_cnt += 1

In [36]:
df.to_csv("unlabelled.csv")

In [37]:
from IPython.display import display, HTML
display(HTML(df.to_html()))

Unnamed: 0,name,seconds,hms
0,BBC audio,41793.49,11:36:33.493333
1,BBC video,70314.98,19:31:54.979410
2,RnaG podcasts,629406.0,"7 days, 6:50:06.014694"
3,clubleabhar soundcloud,90873.72,"1 day, 1:14:33.724941"
4,coislife soundcloud,249077.5,"2 days, 21:11:17.494308"
5,forasnagaeilge soundcloud,54686.51,15:11:26.509955
6,nos soundcloud,71400.43,19:50:00.430998
7,raidiofailte soundcloud,964028.2,"11 days, 3:47:08.171941"
8,rnag soundcloud,8012.715,2:13:32.714694
9,rnl soundcloud,2468330.0,"28 days, 13:38:50.290507"


In [42]:
pats = ["*.mkv", "*.mp4"]

rosnarun_totals = {}
sc_path = base_path / "rosnarun"
for curdir in sc_path.glob('*'):
    if not curdir.is_dir():
        continue
    aud_total = sum_path_by_type_l(curdir, pats)
    rosnarun_totals[curdir.stem] = aud_total

In [43]:
rosnarun_totals

{'sraith1': 95472.04789115647,
 'sraith3': 99081.07900226758,
 'sraith2': 103359.47174603175}

In [39]:
total_all = np.sum(df['seconds'])
total_all_hms = str(datetime.timedelta(seconds=total_all))
print(f'Total: {total_all} ({total_all_hms})')

Total" 4757368.843911493 (55 days, 1:29:28.843911)


In [40]:
print(55*24)

1320
