In [None]:
import os
import sys
import numpy as np
from numpy import ma
from aubio import source, pitch
from matplotlib import pyplot as plt
from itertools import groupby
from operator import itemgetter
import subprocess
import timeit
from moviepy.audio.io import AudioFileClip

# Work from the project root: relative paths used later in this notebook
# (e.g. '_classes') are resolved against this directory.
os.chdir('/home/sharedfolder/The_World')

%matplotlib inline

In [None]:

def get_vowel_segments(media_path):
    """Flag the analysis frames of an audio file that carry a confident pitch.

    The file is read with aubio at 44100 Hz in hops of 512 samples, and a YIN
    pitch estimate plus its confidence is collected for every hop.  A frame is
    flagged True when its confidence is at least ``tolerance`` (0.6) and its
    estimated pitch does not exceed 1000 Hz; every other frame is False.
    Callers in this notebook treat the True frames as vowel candidates.

    Parameters
    ----------
    media_path : str
        Path of the audio file to analyse.

    Returns
    -------
    list of bool
        One flag per hop, in file order.
    """
    downsample = 1
    samplerate = 44100 // downsample

    win_s = 2048 // downsample  # fft size
    hop_s = 512 // downsample   # hop size
    tolerance = 0.6

    s = source(media_path, samplerate, hop_s)
    try:
        # Use the rate reported by the opened source for the pitch detector.
        samplerate = s.samplerate

        pitch_o = pitch("yin", win_s, hop_s, samplerate)
        pitch_o.set_unit("Hz")
        pitch_o.set_tolerance(tolerance)

        pitches = []
        confidences = []
        while True:
            samples, read = s()
            pitches.append(pitch_o(samples)[0])
            confidences.append(pitch_o.get_confidence())
            # A short read means the end of the file has been reached.
            if read < hop_s:
                break
    finally:
        # Release the underlying file handle even if decoding fails; this
        # matters when many files are processed in one batch.
        s.close()

    pitches = np.array(pitches)
    confidences = np.array(confidences)

    # Discard low-confidence frames, then frames with a pitch above 1000 Hz.
    cleaned_pitches = ma.masked_where(confidences < tolerance, pitches)
    cleaned_pitches = ma.masked_where(cleaned_pitches > 1000, cleaned_pitches)

    # getmaskarray() always returns a full-length boolean array, whereas
    # `.mask` can collapse to the scalar `nomask` when no frame was masked,
    # which would make list(...) fail.
    return list(np.logical_not(ma.getmaskarray(cleaned_pitches)))



# Takes a list of per-frame flags (True = vowel frame, False = anything else)
# and returns a list of 2-tuples describing each run of consecutive vowel
# frames: (start frame index, number of frames)
def vowels_to_ranges(vowel_bools):
    """Collapse per-frame vowel flags into (start, length) runs.

    Parameters
    ----------
    vowel_bools : iterable of bool
        One flag per analysis frame, in order.

    Returns
    -------
    list of (int, int)
        For every maximal run of consecutive True flags, the index of its
        first frame and the number of frames in the run.  Empty when no
        frame is flagged.
    """
    vowel_frames = [idx for idx, flag in enumerate(vowel_bools) if flag]

    ranges = []
    # Within a run of consecutive indices, (position - index) is constant, so
    # grouping on that difference splits the list into consecutive runs.
    # A plain one-argument lambda is used because tuple-unpacking lambdas are
    # a syntax error on Python 3.
    for _, run in groupby(enumerate(vowel_frames), lambda pair: pair[0] - pair[1]):
        frames = [frame for _, frame in run]
        # +1 makes the length inclusive of the last frame in the run.
        ranges.append((frames[0], frames[-1] - frames[0] + 1))
    return ranges



## Asks ffprobe for the duration of a media file
def media_duration(media_path):
    """Return the duration of ``media_path`` as a float, as reported by ffprobe.

    ffprobe is run quietly and asked to print only the container's
    ``format=duration`` entry in header-less CSV form; that output is stripped
    of surrounding whitespace and converted to float.  A non-zero ffprobe exit
    status propagates as ``subprocess.CalledProcessError``.
    """
    ffprobe_cmd = ['ffprobe', '-v', 'quiet', '-of', 'csv=p=0', '-show_entries', 'format=duration', media_path]
    raw_output = subprocess.check_output(ffprobe_cmd)
    return float(raw_output.strip())



def extract_pairs(media_path, vowel_ranges, min_frames=4, hop_s=512, samplerate=44100, out_dir='_vowel_clips'):
    """Write every sufficiently long vowel range of a media file to its own WAV clip.

    Parameters
    ----------
    media_path : str
        Path of the audio file the ranges refer to.
    vowel_ranges : iterable of (start_frame, frame_count)
        Runs of vowel frames, expressed in analysis frames.
    min_frames : int, optional
        Ranges shorter than this many frames are skipped (default 4).
    hop_s : int, optional
        Samples per analysis frame, used to convert frames to seconds
        (default 512).
    samplerate : int, optional
        Sample rate used to convert frames to seconds (default 44100).
    out_dir : str, optional
        Directory the clips are written to; it must already exist
        (default '_vowel_clips', relative to the working directory).

    Each clip is named ``<basename>_<start seconds>_<duration seconds>.wav``.
    Ranges that run past the end of the file are trimmed to the file duration,
    and ranges that start at or after the end of the file are skipped.
    """
    seconds_per_frame = hop_s / float(samplerate)
    # splitext copes with extensions of any length (or none), unlike slicing
    # a fixed four characters off the end of the name.
    basename = os.path.splitext(os.path.basename(media_path))[0]
    file_duration = media_duration(media_path)

    snd = AudioFileClip.AudioFileClip(media_path)
    try:
        for start_frame, frame_count in vowel_ranges:
            if int(frame_count) < min_frames:
                continue
            start = float(start_frame) * seconds_per_frame
            duration = float(frame_count) * seconds_per_frame
            if start + duration > file_duration:
                duration = file_duration - start
            if duration <= 0:
                # The range begins at or beyond the end of the file.
                continue
            out_filename = basename + '_' + str(start) + '_' + str(duration) + '.wav'
            snd.subclip(start, start + duration).write_audiofile(os.path.join(out_dir, out_filename))
    finally:
        # Release the clip's reader when the installed MoviePy provides
        # close(); presumably some older releases do not -- hence the guard.
        close = getattr(snd, 'close', None)
        if close is not None:
            close()


def batch_extract_vowels(media_dir):
    """Extract vowel clips from every .mp3/.wav file directly inside ``media_dir``.

    Side effects: the process working directory is changed to ``media_dir``
    and left there; clips are written to a '_vowel_clips' sub-directory, which
    is created when missing; progress and timing are printed.

    A failure on an individual file is reported and the batch moves on to the
    next file.
    """
    os.chdir(media_dir)

    # Create the output directory only when it is missing, so that genuine
    # failures (e.g. permissions) are not silently ignored.
    if not os.path.isdir('_vowel_clips'):
        os.mkdir('_vowel_clips')

    filenames = [item for item in os.listdir('./') if item.lower().endswith(('.mp3', '.wav'))]

    tic = timeit.default_timer()

    for filename in filenames:
        try:
            vowel_bools = get_vowel_segments(filename)
            vowel_ranges = vowels_to_ranges(vowel_bools)
            extract_pairs(filename, vowel_ranges)
        except Exception as err:
            # Catch Exception rather than everything so that a keyboard /
            # kernel interrupt still stops the batch, and report the cause.
            print("***** ERROR: " + filename + " (" + repr(err) + ")")

    print("Time elapsed: " + str(timeit.default_timer() - tic))



In [None]:
# Keep only real sub-directories of _classes: any stray file (not just
# .DS_Store) would make the later os.chdir() into it fail.  Sorted so the
# processing order is the same on every run.
class_dirs = sorted(
    item for item in os.listdir('_classes')
    if item != '.DS_Store' and os.path.isdir(os.path.join('_classes', item))
)

class_dirs

In [None]:
%%capture
# The line above prevents this cell from displaying text output while it runs.

# Build each class directory from the absolute project root so this cell does
# not depend on whatever working directory an earlier (possibly interrupted)
# run left behind.
project_root = '/home/sharedfolder/The_World'

for dir_name in class_dirs:
    media_dir = os.path.join(project_root, '_classes', dir_name)
    try:
        batch_extract_vowels(media_dir)
    finally:
        # batch_extract_vowels() changes the working directory; always go
        # back to the project root, even when the batch raises.
        os.chdir(project_root)
    
## You should stretch your legs and get a cup of coffee at this point. 
## Depending on the size of your collection, this may take a while.

In [None]:
!pwd

In [None]:
# Return to the project root before listing its contents.
os.chdir('/home/sharedfolder/The_World')
!ls

In [None]:
# Sample clip analysed by the exploratory cells that follow.
media_path = "/home/sharedfolder/Gist_labeled_clips_June_16_2017/Background_Speaker/SM6487149057_414.0.wav"

def trace_pitch(media_path):
    """Plot the confident pitch contour of an audio file.

    The file is read with aubio at 44100 Hz in hops of 512 samples, and a YIN
    pitch estimate plus its confidence is collected for every hop.  Frames
    whose confidence is below ``tolerance`` (0.6) or whose pitch exceeds
    1000 Hz are masked, so they appear as gaps in the plotted line.

    Parameters
    ----------
    media_path : str
        Path of the audio file to analyse.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    downsample = 1
    samplerate = 44100 // downsample

    win_s = 2048 // downsample  # fft size
    hop_s = 512 // downsample   # hop size
    tolerance = 0.6

    s = source(media_path, samplerate, hop_s)
    try:
        # Use the rate reported by the opened source for the pitch detector.
        samplerate = s.samplerate

        pitch_o = pitch("yin", win_s, hop_s, samplerate)
        pitch_o.set_unit("Hz")
        pitch_o.set_tolerance(tolerance)

        pitches = []
        confidences = []
        while True:
            samples, read = s()
            pitches.append(pitch_o(samples)[0])
            confidences.append(pitch_o.get_confidence())
            # A short read means the end of the file has been reached.
            if read < hop_s:
                break
    finally:
        # Release the underlying file handle even if decoding fails.
        s.close()

    pitches = np.array(pitches)
    confidences = np.array(confidences)

    # Discard low-confidence frames, then frames with a pitch above 1000 Hz.
    cleaned_pitches = ma.masked_where(confidences < tolerance, pitches)
    cleaned_pitches = ma.masked_where(cleaned_pitches > 1000, cleaned_pitches)

    plt.figure(figsize=(10,5))
    plt.plot(cleaned_pitches)
    plt.xlabel("Frame index (hop of %d samples)" % hop_s)
    plt.ylabel("Pitch (Hz)")
    plt.show()

# Plot the pitch contour of the sample clip.
trace_pitch(media_path)

In [None]:
# Per-frame flags for the sample clip, as returned by get_vowel_segments().
# The result must be bound to `vowel_bools`, the name plotted below.
vowel_bools = get_vowel_segments(media_path)

plt.figure(figsize=(10,5))
plt.plot(vowel_bools)
plt.show()