In [9]:
from pathlib import Path

import pandas as pd
import seaborn as sns

import transformers

## Compute phoneme data representation

In [10]:
paths = list(Path("data/TRAIN").glob("**/*.PHN"))
phone_df = pd.concat([pd.read_csv(path, sep=" ", header=None) for path in paths],
                     names=["sentence_idx", "dialect", "speaker"],
                     keys=[(p.stem, p.parts[2], p.parts[3]) for p in paths])
phone_df.columns = ["onset", "offset", "phone"]
phone_df.index.rename("phone_idx", level=-1, inplace=True)
phone_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,phone
sentence_idx,dialect,speaker,phone_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SI1311,DR4,MMDM0,0,0,2680,h#
SI1311,DR4,MMDM0,1,2680,5640,s
SI1311,DR4,MMDM0,2,5640,7853,ao
SI1311,DR4,MMDM0,3,7853,8647,l
SI1311,DR4,MMDM0,4,8647,10026,r
...,...,...,...,...,...,...
SI1044,DR8,MRDM0,60,59494,60974,zh
SI1044,DR8,MRDM0,61,60974,62208,en
SI1044,DR8,MRDM0,62,62208,63169,epi
SI1044,DR8,MRDM0,63,63169,64698,z


In [11]:
# "Extra" phones in TIMIT not corresponding to speech sounds
timit_extras = ["pau", "epi", "h#"]
phone_df = phone_df[~phone_df.phone.isin(timit_extras)]

In [12]:
paths = list(Path("data/TRAIN").glob("**/*.WRD"))
word_df = pd.concat([pd.read_csv(path, sep=" ", header=None) for path in paths],
                    names=["sentence_idx", "dialect", "speaker"],
                    keys=[(p.stem, p.parts[2], p.parts[3]) for p in paths])
word_df.columns = ["onset", "offset", "word"]
word_df.index.rename("word_idx", level=-1, inplace=True)
word_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,word
sentence_idx,dialect,speaker,word_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SI1311,DR4,MMDM0,0,2680,8647,soil
SI1311,DR4,MMDM0,1,8647,23227,redeposition
SI1311,DR4,MMDM0,2,31832,33420,is
SI1311,DR4,MMDM0,3,33420,46280,evaluated
SI1311,DR4,MMDM0,4,46280,48840,by
...,...,...,...,...,...,...
SI1044,DR8,MRDM0,3,33520,34855,to
SI1044,DR8,MRDM0,4,35569,41320,insulate
SI1044,DR8,MRDM0,5,41320,48806,themselves
SI1044,DR8,MRDM0,6,48806,54040,against


In [13]:
# asof merge word indices and word identities into phoneme representation
phone_df = pd.merge_asof(phone_df.sort_values("onset").reset_index(level="phone_idx"), word_df.sort_values("onset").reset_index(level="word_idx"),
                         by=["sentence_idx", "dialect", "speaker"], on="onset", suffixes=("", "_word"))
phone_df = phone_df.set_index(["dialect", "speaker", "sentence_idx", "phone_idx"]).sort_index()

In [14]:
# Compute denormalized word phonetic representation
words_phon = phone_df.loc[~phone_df.word_idx.isna()] \
    .groupby(["dialect", "speaker", "sentence_idx", "word_idx"]) \
    .apply(lambda xs: xs.phone.str.cat(sep=" ")) \
    .rename("word_phon")
merged_df = pd.merge(phone_df, words_phon,
                     left_on=["dialect", "speaker", "sentence_idx", "word_idx"],
                     right_index=True)

In [15]:
merged_df.to_csv("timit_merged.csv")

In [16]:
merged_df.word.dropna().unique()

array(['she', 'had', 'your', ..., 'murderer', 'outset', 'reconnaissance'],
      dtype=object)

In [17]:
merged_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,phone,word_idx,offset_word,word,word_phon
dialect,speaker,sentence_idx,phone_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DR1,FCJF0,SA1,1,3050,4559,sh,0,5723,she,sh ix
DR1,FCJF0,SA1,2,4559,5723,ix,0,5723,she,sh ix
DR1,FCJF0,SA1,3,5723,6642,hv,1,10337,had,hv eh dcl
DR1,FCJF0,SA1,4,6642,8772,eh,1,10337,had,hv eh dcl
DR1,FCJF0,SA1,5,8772,9190,dcl,1,10337,had,hv eh dcl
...,...,...,...,...,...,...,...,...,...,...
DR8,MTCS0,SX82,28,33770,34460,t,7,40870,tips,t ih pcl p s
DR8,MTCS0,SX82,29,34460,36386,ih,7,40870,tips,t ih pcl p s
DR8,MTCS0,SX82,30,36386,38150,pcl,7,40870,tips,t ih pcl p s
DR8,MTCS0,SX82,31,38150,38575,p,7,40870,tips,t ih pcl p s
