Permalink
Browse files

Addition of methods for learning a Markov model.

That is, methods for extracting frame sequences of fixed size.
  • Loading branch information...
Laurent Dinh
Laurent Dinh committed Feb 5, 2014
1 parent 7fcdc91 commit 745d8a3bdb3403e42c91a81faef79dbd4f687664
Showing with 201 additions and 10 deletions.
  1. +201 −10 dataset/timit.py
View
@@ -102,9 +102,10 @@ def load(self, subset):
print "Done !"
# Frames
print "Loading frames...",
# Acoustic samples
print "Loading accoustic samples...",
raw_wav = np.load(raw_wav_path)
raw_wav_len = map(lambda x:len(x), raw_wav)
print "Done !"
print str(raw_wav.shape[0]) + " sentences."
@@ -129,6 +130,7 @@ def load(self, subset):
data = {}
data[subset+"_raw_wav"] = raw_wav
data[subset+"_raw_wav_len"] = raw_wav_len
data[subset+"_n_seq"] = raw_wav.shape[0]
data[subset+"_phn"] = phn
data[subset+"_seq_to_phn"] = seq_to_phn
@@ -153,6 +155,7 @@ def clear(self, subset):
del self.__dict__[subset+"_raw_wav"]
del self.__dict__[subset+"_raw_wav_len"]
del self.__dict__[subset+"_n_seq"]
del self.__dict__[subset+"_phn"]
del self.__dict__[subset+"_seq_to_phn"]
@@ -206,11 +209,18 @@ def sanity_check(self, subset):
else:
print "KO for "+feature_name[i]+"."
def get_raw_seq(self, subset, seq_id, framelength, overlap):
"""
This section is about extracting sequences of varying size.
"""
def get_raw_seq(self, subset, seq_id, frame_length, overlap):
"""
Given an id of the sequence, this method will return a frames sequence
from a given set, the associated phoneme sequence and the
information vector on the speaker.
Given the id of the subset, the id of the sequence, the frame length and
the overlap between frames, this method will return a frames sequence
from a given set, the associated phonemes and words sequences (including
a binary variable indicating change) and the information vector on the
speaker.
"""
self.check_subset_value(subset)
@@ -242,25 +252,44 @@ def get_raw_seq(self, subset, seq_id, framelength, overlap):
for (wrd_start, wrd_end, wrd) in wrd_start_end:
wrd_seq[wrd_start:wrd_end] = wrd+1
# Binary variable announcing the end of the word or phoneme
end_phn = np.zeros_like(phn_seq)
end_wrd = np.zeros_like(wrd_seq)
for i in range(len(phn_seq) - 1):
if phn_seq[i] != phn_seq[i+1]:
end_phn[i] = 1
if wrd_seq[i] != wrd_seq[i+1]:
end_wrd[i] = 1
end_phn[-1] = 1
end_wrd[-1] = 1
# Find the speaker id
spkr_id = self.__dict__[subset+"_spkr"][seq_id]
# Find the speaker info
spkr_info = self.spkrinfo[spkr_id]
# Segment into frames
wav_seq = segment_axis(wav_seq, framelength, overlap)
wav_seq = segment_axis(wav_seq, frame_length, overlap)
# Take the most occurring phoneme in a sequence
phn_seq = segment_axis(phn_seq, framelength, overlap)
phn_seq = segment_axis(phn_seq, frame_length, overlap)
phn_seq = scipy.stats.mode(phn_seq, axis=1)[0].flatten()
phn_seq = np.asarray(phn_seq, dtype='int')
# Take the most occurring word in a sequence
wrd_seq = segment_axis(wrd_seq, framelength, overlap)
wrd_seq = segment_axis(wrd_seq, frame_length, overlap)
wrd_seq = scipy.stats.mode(wrd_seq, axis=1)[0].flatten()
wrd_seq = np.asarray(wrd_seq, dtype='int')
return [wav_seq, phn_seq, wrd_seq, spkr_info]
# Announce the end if and only if it was announced in the current frame
end_phn = segment_axis(end_phn, frame_length, overlap)
end_phn = end_phn.max(axis=1)
end_wrd = segment_axis(end_wrd, frame_length, overlap)
end_wrd = end_wrd.max(axis=1)
return [wav_seq, phn_seq, end_phn, wrd_seq, end_wrd, spkr_info]
def get_n_seq(self, subset):
"""
@@ -269,6 +298,168 @@ def get_n_seq(self, subset):
"""
self.check_subset_value(subset)
self.check_subset_presence(subset)
return self.__dict__[subset+"_n_seq"]
"""
This section is about extracting sequences of fixed size.
"""
def init_markov_frames(self, subset, n_frames_in, frame_length, overlap):
    """
    Prepare a subset for fixed-size extraction with get_markov_frames.

    Given the subset id, the number of frames taken as input to predict the
    next one, the frame length and the overlap between frames, this method
    stores on the instance (under keys prefixed by the subset name):

    - "_n_frames_in", "_frame_length", "_overlap": the arguments as given;
    - "_wav_length": the number of acoustic samples needed to build
      n_frames_in input frames plus one output frame;
    - "_intervals_seq": an integer array of cumulative example counts, so
      that sequence k owns the ids in
      [intervals_seq[k], intervals_seq[k+1]).
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)

    # Number of new samples each successive frame advances by
    hop = frame_length - overlap

    # Compute the required length to build a frame sequence of fixed size
    wav_length = n_frames_in*hop + frame_length

    # Compute the number of unique frame sequences we can extract from each
    # acoustic samples sequence. A sequence shorter than one hop would give
    # a negative count and make the cumulative table non-monotonic, so
    # such sequences are counted as contributing zero examples.
    raw_wav_len = np.asarray(self.__dict__[subset+"_raw_wav_len"],
                             dtype=np.int64)
    actual_seq_length = np.maximum(raw_wav_len - hop + 1, 0)

    # Integer offsets: these values are later used as ids and slice bounds,
    # which a float table would turn into floats.
    intervals_seq = np.zeros(actual_seq_length.shape[0] + 1, dtype=np.int64)
    intervals_seq[1:] = np.cumsum(actual_seq_length)

    self.__dict__[subset+"_n_frames_in"] = n_frames_in
    self.__dict__[subset+"_frame_length"] = frame_length
    self.__dict__[subset+"_overlap"] = overlap
    self.__dict__[subset+"_wav_length"] = wav_length
    self.__dict__[subset+"_intervals_seq"] = intervals_seq
def get_markov_frames(self, subset, id):
    """
    Given the subset and an id, this method returns the list [input_frames,
    input_phonemes, input_words, output_phoneme, output_word, spkr_info,
    output_frame, ending_phoneme, ending_word].

    init_markov_frames must have been called on the subset first. The id
    selects a window of "_wav_length" acoustic samples inside one sequence;
    windows that would start before the beginning of the sequence are
    zero-padded on the left. The window is cut into frames, all but the
    last being the input and the last one being the output to predict.

    The ending flags are computed on the same window as the frames, and the
    padded window keeps the dtype of the sequence it was cut from.
    """
    assert subset+"_intervals_seq" in self.__dict__
    intervals_seq = self.__dict__[subset+"_intervals_seq"]
    assert 0 <= id < intervals_seq[-1]

    frame_length = self.__dict__[subset+"_frame_length"]
    overlap = self.__dict__[subset+"_overlap"]
    wav_length = self.__dict__[subset+"_wav_length"]

    # Find the acoustic samples sequence we are looking for
    seq_id = int(np.digitize([id], intervals_seq)[0]) - 1

    # Find the position in this sequence. The offsets table may hold
    # floats, so cast explicitly: the result is used as a slice bound.
    idx_in_seq = int(id - intervals_seq[seq_id]) \
        - (wav_length - frame_length + overlap)

    # Get the sequence
    wav_seq = np.asarray(self.__dict__[subset+"_raw_wav"][seq_id])

    # Get the phonemes
    phn_l_start = self.__dict__[subset+"_seq_to_phn"][seq_id][0]
    phn_l_end = self.__dict__[subset+"_seq_to_phn"][seq_id][1]
    phn_start_end = self.__dict__[subset+"_phn"][phn_l_start:phn_l_end]
    phn_seq = np.zeros_like(wav_seq)
    for (phn_start, phn_end, phn) in phn_start_end:
        phn_seq[phn_start:phn_end] = phn

    # Get the words
    wrd_l_start = self.__dict__[subset+"_seq_to_wrd"][seq_id][0]
    wrd_l_end = self.__dict__[subset+"_seq_to_wrd"][seq_id][1]
    wrd_start_end = self.__dict__[subset+"_wrd"][wrd_l_start:wrd_l_end]
    wrd_seq = np.zeros_like(wav_seq)
    # Some timestamps do not correspond to any word so 0 is
    # the index for "NO_WORD" and the other indices are shifted by one
    for (wrd_start, wrd_end, wrd) in wrd_start_end:
        wrd_seq[wrd_start:wrd_end] = wrd+1

    # Binary variable announcing the end of the word or phoneme: a sample
    # is flagged when the next sample carries a different label, and the
    # last sample of the sequence is always flagged.
    end_phn = np.zeros_like(phn_seq)
    end_phn[:-1] = phn_seq[:-1] != phn_seq[1:]
    end_phn[-1] = 1
    end_wrd = np.zeros_like(wrd_seq)
    end_wrd[:-1] = wrd_seq[:-1] != wrd_seq[1:]
    end_wrd[-1] = 1

    # Find the speaker id
    spkr_id = self.__dict__[subset+"_spkr"][seq_id]
    # Find the speaker info
    spkr_info = self.spkrinfo[spkr_id]

    def pick_window(seq):
        # Cut the selected window out of a per-sample sequence, padding
        # with zeros on the left when the window starts before sample 0.
        if idx_in_seq >= 0:
            return seq[idx_in_seq:(idx_in_seq + wav_length)]
        padded = np.zeros(wav_length, dtype=seq.dtype)
        padded[-idx_in_seq:] = seq[0:(wav_length + idx_in_seq)]
        return padded

    # Segment into frames
    wav_frames = segment_axis(pick_window(wav_seq), frame_length, overlap)

    # Take the most occurring phoneme in each frame
    phn_frames = segment_axis(pick_window(phn_seq), frame_length, overlap)
    phn_frames = scipy.stats.mode(phn_frames, axis=1)[0].flatten()
    phn_frames = np.asarray(phn_frames, dtype='int')

    # Take the most occurring word in each frame
    wrd_frames = segment_axis(pick_window(wrd_seq), frame_length, overlap)
    wrd_frames = scipy.stats.mode(wrd_frames, axis=1)[0].flatten()
    wrd_frames = np.asarray(wrd_frames, dtype='int')

    # Announce the end if and only if it was announced in the current
    # frame. The flags must be cut with the same window as the frames;
    # framing the whole sequence would always report the sequence's final
    # frame, whose flag is 1 by construction.
    end_phn_frames = segment_axis(pick_window(end_phn), frame_length,
                                  overlap).max(axis=1)
    end_wrd_frames = segment_axis(pick_window(end_wrd), frame_length,
                                  overlap).max(axis=1)

    # Put names on the output
    input_frames = wav_frames[:-1]
    input_phonemes = phn_frames[:-1]
    input_words = wrd_frames[:-1]
    output_phoneme = phn_frames[-1]
    output_word = wrd_frames[-1]
    output_frame = wav_frames[-1]
    ending_phoneme = end_phn_frames[-1]
    ending_word = end_wrd_frames[-1]

    return [input_frames, input_phonemes, input_words, output_phoneme,
            output_word, spkr_info, output_frame, ending_phoneme,
            ending_word]
def get_n_markov_frames(self, subset):
    """
    Given the subset id, return the number of frame segments of fixed size
    in it.

    init_markov_frames must have been called on the subset first. The count
    is the last entry of the cumulative "_intervals_seq" table, returned as
    a plain int so it can be used directly as a bound for ids.
    """
    self.check_subset_value(subset)
    self.check_subset_presence(subset)
    assert subset+"_intervals_seq" in self.__dict__
    return int(self.__dict__[subset+"_intervals_seq"][-1])

0 comments on commit 745d8a3

Please sign in to comment.