In [1]:
import random
import wave
import os
import pandas as pd
from pathlib import Path

In [2]:
# Directory holding the manifest CSVs and root directory of the audio/transcript
# data. The defaults are the original machine-specific absolute paths; set the
# M2M_VC_MANIFESTS_DIR / M2M_VC_DATA_DIR environment variables to run the
# notebook on another machine without editing this cell.
MANIFESTS_DIR = Path(os.environ.get("M2M_VC_MANIFESTS_DIR", "/home/clairepajot/M2M-VC-CycleGAN/manifests"))
DATA_DIR = Path(os.environ.get("M2M_VC_DATA_DIR", "/home/data"))

### Clean VoC and CoRAAL Manifests

In [3]:
# Load the legacy CoRAAL manifest, strip the padding from the transcript
# column name, and set the `aave` flag column to 1 for every row.
coraal = (
    pd.read_csv(MANIFESTS_DIR / "coraal_manifest_old.csv")
    .rename(columns={" groundtruth_text_train ": "groundtruth_text_train"})
    .assign(aave=1)
    # Drop rows with no groundtruth text (5/12571 rows)
    .dropna(subset=["groundtruth_text_train"])
)

In [4]:
# Re-root both path columns: keep only the file name and prepend the
# directory (relative to DATA_DIR) that holds the processed CoRAAL files.
for path_column, new_prefix in (
    ("wav_file", "data_processed_coraal/wav/"),
    ("txt_file", "data_processed_coraal/txt/"),
):
    coraal[path_column] = coraal[path_column].apply(
        lambda old_path, prefix=new_prefix: prefix + old_path.split("/")[-1]
    )

In [5]:
# Number of distinct speaker ids encoded in the wav file names
# (pieces -8..-4 of the "_"-separated base name, re-joined with "_").
coraal['wav_file'].map(lambda path: "_".join(path.rsplit("/", 1)[-1].split("_")[-8:-3])).nunique(dropna=False)

81

In [6]:
# Fixed speaker id
# Split every wav base name on "_" once, then read both fields from the pieces:
# the speaker id is pieces -8..-4 re-joined with "_", the gender code is piece 3.
name_pieces = coraal['wav_file'].map(lambda path: path.split("/")[-1].split("_"))
coraal['speaker_id'] = name_pieces.map(lambda pieces: "_".join(pieces[-8:-3]))
coraal['gender'] = name_pieces.map(lambda pieces: pieces[3])

In [7]:
# SPELLING OUT PS4
# Replace the digit "4" with " FOUR" in every transcript that contains it,
# both in the manifest and in the transcript file under DATA_DIR.
# Each matching row is rewritten from its own text, so the cell stays correct
# for any number of matches -- including none, e.g. when it is re-run after
# the fix has already been applied.
needs_fix = coraal["groundtruth_text_train"].str.contains("4", regex=False)
if needs_fix.any():
    fixed_transcripts = coraal.loc[needs_fix, "groundtruth_text_train"].str.replace("4", " FOUR", regex=False)
    for relative_txt_path, fixed_transcript in zip(coraal.loc[needs_fix, "txt_file"], fixed_transcripts):
        # The context manager closes the file even if the write fails.
        with open(DATA_DIR / relative_txt_path, "w") as transcript_file:
            transcript_file.write(fixed_transcript)
    # Index-aligned assignment: every row receives its own corrected text.
    coraal.loc[needs_fix, "groundtruth_text_train"] = fixed_transcripts

In [8]:
# Headcount of CoRAAL interviewees: overall first, then one line per gender code.
total_interviewees = coraal["speaker_id"].nunique(dropna=False)
print("Number of interviewees in CoRAAL: ", total_interviewees)
for label, gender_code in (
    ("Number of female interviewees in CoRAAL: ", "f"),
    ("Number of male interviewees in CoRAAL: ", "m"),
):
    speakers_of_gender = coraal.loc[coraal["gender"] == gender_code, "speaker_id"]
    print(label, speakers_of_gender.nunique(dropna=False))

Number of interviewees in CoRAAL:  81
Number of female interviewees in CoRAAL:  43
Number of male interviewees in CoRAAL:  38


In [9]:
# Total and per-speaker-average amount of CoRAAL audio in hours
# (`duration` is assumed to be in seconds, hence the division by 60 * 60).
print("Number of hours of CoRAAL data: ", coraal["duration"].sum() / (60 * 60))
# Select "duration" *before* aggregating so only that column is summed.
# Summing the whole frame also aggregates the text columns, which newer
# pandas versions no longer drop silently.
print("Average number of hours per speaker in CoRAAL: ", coraal.groupby("speaker_id")["duration"].sum().mean() / (60 * 60))

Number of hours of CoRAAL data:  50.89041027777778
Average number of hours per speaker in CoRAAL:  0.6282766700960218


In [10]:
# Load the legacy VoC manifest, rename the transcript columns to the names
# used by the CoRAAL manifest, and set the `aave` flag column to 0 for every row.
voc = (
    pd.read_csv(MANIFESTS_DIR / "voc_manifest_old.csv")
    .rename(columns={"groundtruth_text": "groundtruth_text_raw", "cleaned_text": "groundtruth_text_train"})
    .assign(aave=0)
    # Drop rows with no groundtruth text (22/8446 rows)
    .dropna(subset=["groundtruth_text_train"])
)

In [11]:
# Re-root both path columns: keep only the file name and prepend the
# directory (relative to DATA_DIR) that holds the processed VoC files.
for path_column, new_prefix in (
    ("wav_file", "data_processed_voc/wav/"),
    ("txt_file", "data_processed_voc/txt/"),
):
    voc[path_column] = voc[path_column].apply(
        lambda old_path, prefix=new_prefix: prefix + old_path.split("/")[-1]
    )

In [12]:
# Piece 3 of the "_"-split wav path; with the "data_processed_voc/wav/" prefix
# this is the number that directly follows "voc_" in the file name.
interview_number = voc["wav_file"].str.split('_').str.get(3)
# Interviews excluded from the manifest:
#   '3' - Spenser Deardoff (transcript + wav file don't match up) 8424 -> 8348 rows
#   '5' - Eric Heryford (wav file has some silences) 8348 -> 8158 rows
voc = voc[~interview_number.isin(['3', '5'])]

In [13]:
# Remove high loss data points
# Clips dropped by hand; the trailing note on each entry is the recorded reason.
high_loss_examples = [
    'data_processed_voc/wav/voc_33_part_3.wav',    # Transcript: UM
    'data_processed_voc/wav/voc_2_part_1.wav',     # Transcript: UM
    'data_processed_voc/wav/voc_12_part_198.wav',  # KILL
    'data_processed_voc/wav/voc_0_part_330.wav',   # KILL
    'data_processed_voc/wav/voc_0_part_328.wav',   # Basically silent
    'data_processed_voc/wav/voc_10_part_414.wav',  # Basically silent
    'data_processed_voc/wav/voc_10_part_404.wav',  # Transcript: UM
    'data_processed_voc/wav/voc_2_part_4.wav',     # Transcript: UM YEAH SURE
    'data_processed_voc/wav/voc_8_part_228.wav',   # KILL
    'data_processed_voc/wav/voc_26_part_207.wav',  # Transcript: Incorrect
    'data_processed_voc/wav/voc_13_part_234.wav',  # Transcript: Incorrect
    'data_processed_voc/wav/voc_12_part_185.wav',  # KILL
    'data_processed_voc/wav/voc_12_part_181.wav',  # KILL
    'data_processed_voc/wav/voc_17_part_93.wav',   # KILL
    'data_processed_voc/wav/voc_10_part_428.wav',  # KILL
    'data_processed_voc/wav/voc_11_part_80.wav',   # Incorrect transcript
]
is_high_loss = voc["wav_file"].isin(high_loss_examples)
voc = voc[~is_high_loss]

In [14]:
# Transcript clean-ups, applied in order to the manifest and to the transcript
# files under DATA_DIR:
#   SPELLING OUT U2 : "2" -> " TWO"
#   SPELLING OUT D7 : "7" -> " SEVEN"
#   REMOVING -      : "-" -> " "
# Each matching row is rewritten from its own text, so the cell stays correct
# for any number of matches -- including none, e.g. when it is re-run after
# the fixes have already been applied.
for raw_text, replacement in (("2", " TWO"), ("7", " SEVEN"), ("-", " ")):
    # regex=False: the patterns are literal characters, not regular expressions.
    needs_fix = voc["groundtruth_text_train"].str.contains(raw_text, regex=False)
    if not needs_fix.any():
        continue
    fixed_transcripts = voc.loc[needs_fix, "groundtruth_text_train"].str.replace(raw_text, replacement, regex=False)
    for relative_txt_path, fixed_transcript in zip(voc.loc[needs_fix, "txt_file"], fixed_transcripts):
        # The context manager closes the file even if the write fails.
        with open(DATA_DIR / relative_txt_path, "w") as transcript_file:
            transcript_file.write(fixed_transcript)
    # Index-aligned assignment: every row receives its own corrected text.
    voc.loc[needs_fix, "groundtruth_text_train"] = fixed_transcripts

In [15]:
# The speaker id is the second "_"-separated piece of the wav base name.
voc['speaker_id'] = voc['wav_file'].map(lambda path: path.rsplit("/", 1)[-1].split("_")[1])

In [16]:
# Headcount of distinct VoC speaker ids.
print("Number of interviewees in VoC: ", voc["speaker_id"].nunique(dropna=False))

Number of interviewees in VoC:  37


In [17]:
# Total and per-speaker-average amount of VoC audio in hours
# (`duration` is assumed to be in seconds, hence the division by 60 * 60).
print("Number of hours of VoC data: ", voc["duration"].sum() / (60 * 60))
# Select "duration" *before* aggregating so only that column is summed.
# Summing the whole frame also aggregates the text columns, which newer
# pandas versions no longer drop silently.
print("Average number of hours per speaker in VoC: ", voc.groupby("speaker_id")["duration"].sum().mean() / (60 * 60))

Number of hours of VoC data:  37.041958611111106
Average number of hours per speaker in VoC:  1.0011340165165166


### Train/Val/Test Split

In [18]:
# Number of speakers per gender held out for "val", and again for "test".
N_GENDER_VAL_TEST = 6
# Distinct speaker ids per gender code, in order of first appearance in the manifest.
coraal_female_speakers = coraal.loc[coraal["gender"] == "f", "speaker_id"].unique()
coraal_male_speakers = coraal.loc[coraal["gender"] == "m", "speaker_id"].unique()

In [19]:
# Speaker-disjoint split: per gender, the first N_GENDER_VAL_TEST shuffled
# speakers go to "val", the next N_GENDER_VAL_TEST to "test", the rest stay "train".
random.seed(17)
# Shuffle list *copies* rather than the speaker arrays themselves. Shuffling
# the originals in place made this cell non-idempotent: re-running it would
# shuffle already-shuffled arrays and silently produce a different split
# despite the fixed seed.
female_order = list(coraal_female_speakers)
male_order = list(coraal_male_speakers)
random.shuffle(female_order)
random.shuffle(male_order)

coraal["split"] = "train"
for shuffled_speakers in (female_order, male_order):
    val_speakers = shuffled_speakers[:N_GENDER_VAL_TEST]
    test_speakers = shuffled_speakers[N_GENDER_VAL_TEST:2 * N_GENDER_VAL_TEST]
    coraal.loc[coraal["speaker_id"].isin(val_speakers), "split"] = "val"
    coraal.loc[coraal["speaker_id"].isin(test_speakers), "split"] = "test"

In [20]:
# Hours of CoRAAL audio assigned to each split.
for label, split_name in (
    ("CoRAAL Train Duration: ", "train"),
    ("CoRAAL Val Duration: ", "val"),
    ("CoRAAL Test Duration: ", "test"),
):
    in_split = coraal["split"] == split_name
    print(label, coraal.loc[in_split, "duration"].sum() / (60 * 60))

CoRAAL Train Duration:  36.801762777777775
CoRAAL Val Duration:  6.838458333333333
CoRAAL Test Duration:  7.250189166666666


In [21]:
# All VoC data is train: every row gets the "train" label in the `split` column.
voc = voc.assign(split="train")

In [22]:
def get_sr(row):
    """Return the frame rate of the wav file referenced by a manifest row.

    Args:
        row: Mapping-like manifest row whose "wav_file" entry is a path
            relative to DATA_DIR.

    Returns:
        The frame rate reported by the wav file's header.
    """
    wav_path = DATA_DIR / row["wav_file"]
    with wave.open(str(wav_path), "rb") as reader:
        return reader.getframerate()

In [23]:
# Add sample rate
# One wav header is opened per manifest row, so this cell is I/O bound.
voc["sr"] = voc.apply(get_sr, axis=1)
coraal["sr"] = coraal.apply(get_sr, axis=1)
# Per-sample-rate totals. The numeric columns are selected explicitly:
# summing the whole frame also aggregates the text columns, which newer
# pandas versions no longer drop silently.
print(voc.groupby(["sr"])[["duration", "aave"]].sum())
print(coraal.groupby(["sr"])[["duration", "aave"]].sum())

         duration  aave
sr                     
44100  110404.518     0
48000   22946.533     0
         duration   aave
sr                      
44100  183205.477  12566


In [24]:
# Persist the cleaned manifests next to the legacy ones, without the index column.
for manifest, file_name in (
    (coraal, "coraal_manifest.csv"),
    (voc, "voc_manifest.csv"),
):
    manifest.to_csv(MANIFESTS_DIR / file_name, header=True, index=False)

### Small datasets for debugging

In [25]:
# Fixed-seed random subset of 1000 CoRAAL rows, written as its own manifest.
coraal_small = coraal.sample(1000, random_state=22)
coraal_small.to_csv(
    MANIFESTS_DIR.joinpath("coraal_small_manifest.csv"),
    header=True,
    index=False,
)

In [26]:
# Fixed-seed random subset of 1000 VoC rows, written as its own manifest.
voc_small = voc.sample(1000, random_state=22)
voc_small.to_csv(
    MANIFESTS_DIR.joinpath("voc_small_manifest.csv"),
    header=True,
    index=False,
)