# Applause/Speech Classifier

In [None]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio
import pandas as pd

# Create the project's working directory with a shell command; `-p` means
# no error is raised if it already exists.
!mkdir -p /sharedfolder/applause_classifier/

# Make it the current working directory for the rest of this cell.
os.chdir('/sharedfolder/applause_classifier/')

# Name used to build the class directory path below.
# (In the speaker recognition version of this notebook, this names a
# directory that would contain subdirectories of full-length audio
# recordings. In this case we're using pre-segmented audio collections.)
training_audio_dir_name = "Applause"

# A directory containing pre-segmented audio training sets in two or 
# more subdirectories. With the name above, this evaluates to
# '/sharedfolder/applause_classifier/_classes_Applause'.
class_dir_pathname = '/sharedfolder/applause_classifier/_classes_' + training_audio_dir_name

# Creating the directories we need, in case they don't already exist.
subprocess.call(['mkdir', '-p', class_dir_pathname])

In [None]:
## Downloading male and female speech training data. In the speaker 
# recognition workflow this is the 'background' set used to create a 
# universal background model (UBM), putatively representing all speakers 
# in the world. In this notebook the same clips are later moved into the 
# 'Non-Applause' class directory.

# Download and unzip into the project directory; later cells refer to the
# extracted folders by relative path.
os.chdir('/sharedfolder/applause_classifier/')

# `-N` turns on timestamping, so re-running this cell does not re-download
# an archive unless the remote copy is newer than the local one.
!wget -N http://xtra.arloproject.com/datasets/audio/Mozilla_CV_UBM_Subset/females_5k.zip
!wget -N http://xtra.arloproject.com/datasets/audio/Mozilla_CV_UBM_Subset/males_5k.zip
!wget -N http://xtra.arloproject.com/datasets/aapb-ubm/Male_AAPB_171110.zip
!wget -N http://xtra.arloproject.com/datasets/aapb-ubm/Female_AAPB_171110.zip

# `-o` overwrites existing files without prompting, which keeps the cell
# non-interactive when it is re-run.
!unzip -o females_5k.zip
!unzip -o males_5k.zip
!unzip -o Male_AAPB_171110.zip
!unzip -o Female_AAPB_171110.zip

In [None]:
## Downloading applause training data from GitHub.
# Roughly half the WAVs in this collection come from the PennSound 
# poetry archive, the rest from speeches and public appearances by 
# Bill Clinton.
# The repository is cloned into the current working directory.

!git clone https://github.com/hipstas/applause-classifier.git
    
# The `!` character at the start of the line above is Jupyter's shell 
# escape: the rest of that line runs as a shell command rather than as 
# Python -- not great style if you're developing a Python script or 
# package, but quick and readable for everyday work.

In [None]:
## Creating directories for our two classes. With `class_dir_pathname` 
# set to '/sharedfolder/applause_classifier/_classes_Applause', these are:
#     /sharedfolder/applause_classifier/_classes_Applause/Applause/ 
#     /sharedfolder/applause_classifier/_classes_Applause/Non-Applause/

applause_path = os.path.join(class_dir_pathname, 'Applause')
non_applause_path = os.path.join(class_dir_pathname, 'Non-Applause')

# `-p` matches how the parent class directory is created and makes this
# cell safe to re-run: plain `mkdir` exits with an error when the
# directory already exists.
subprocess.call(['mkdir', '-p', applause_path])
subprocess.call(['mkdir', '-p', non_applause_path])

In [None]:
## Consolidating all non-applause clips in a single directory 
# using bash, via Jupyter's `!` shell escape.
# The source paths are relative, so this cell must run with the current 
# working directory set to the folder containing the extracted 
# `females_5k`, `males_5k`, `Male_AAPB_171110` and `Female_AAPB_171110` 
# directories. The destination is the 'Non-Applause' class directory.

!mv females_5k/* '/sharedfolder/applause_classifier/_classes_Applause/Non-Applause'
!mv males_5k/* '/sharedfolder/applause_classifier/_classes_Applause/Non-Applause'
!mv Male_AAPB_171110/* '/sharedfolder/applause_classifier/_classes_Applause/Non-Applause'
!mv Female_AAPB_171110/* '/sharedfolder/applause_classifier/_classes_Applause/Non-Applause'

In [None]:
## Unzipping applause training data.

# Move into the cloned repository, where the four archives below are
# expected to be. The working directory stays here after the cell finishes.
os.chdir('/sharedfolder/applause_classifier/applause-classifier')

# `-o` overwrites existing files without prompting, so the cell can be re-run.
!unzip -o Applause_from_Bill_Clinton_speeches_1.zip
!unzip -o Applause_from_Bill_Clinton_speeches_2.zip
!unzip -o applause_pt1.zip
!unzip -o applause_pt2.zip

In [None]:
## Moving all applause WAV clips (located in the 4 directories we 
# created in the previous cell) to the master Applause class directory.
# The `*/*.wav` glob is relative: it matches WAV files one level below the 
# current working directory, so run this right after the unzip cell, 
# which leaves the working directory in the cloned repository.

!mv */*.wav '/sharedfolder/applause_classifier/_classes_Applause/Applause'

In [None]:
%%capture
## Extracting features (MFCCs, deltas, and delta-deltas) from all clips, then writing features to CSVs
# (Note that we are not extracting vowel segments like we do with speech classifiers.)

# Notes: 
# The `%%capture` magic above suppresses a cell's output.

# If you aren't getting the results you expect when you run this cell, 
# comment out the `%%capture` line to check for error messages.
# (The `moviepy` package is extremely verbose; while running large-scale 
# jobs, your browser may run out of memory.)
# Extract features for every class directory under `class_dir_pathname`.
# For each WAV clip, one CSV is written to that class directory's
# `_mfccs_and_deltas` folder. Clips that already have a CSV are skipped, so
# the cell can be re-run to resume an interrupted job.
os.chdir(class_dir_pathname)

# The names returned by os.listdir() are relative to the class root we just
# moved into, which is why os.path.isdir() can be given them directly.
class_dir_names = [item for item in os.listdir('./') if os.path.isdir(item)]

for dir_name in class_dir_names:
    print("> Starting " + dir_name)
    try:
        # Work from inside the class directory so that clip filenames and
        # the output folder can be addressed with relative paths.
        os.chdir(os.path.join(class_dir_pathname, dir_name))

        # os.mkdir raises OSError when the folder is left over from an
        # earlier run; that is expected. Any other problem with the folder
        # surfaces below when a CSV cannot be written.
        try:
            os.mkdir('./_mfccs_and_deltas')
        except OSError:
            pass

        filenames = [item for item in os.listdir('./') if item[-4:].lower() == '.wav']
        for filename in filenames:
            csv_out_path = './_mfccs_and_deltas/' + filename[:-4] + '.mfcc.csv'
            if os.path.isfile(csv_out_path):
                # Features for this clip were written by a previous run.
                continue
            try:
                mfccs = attk.get_mfccs_and_deltas(filename, n_mfcc=30, n_fft=4096, freq_min=100, freq_max=16000)
                # Only write a CSV when the extractor returned at least one row.
                if len(mfccs) > 0:
                    with open(csv_out_path, 'w') as fo:
                        csv_writer = csv.writer(fo)
                        csv_writer.writerows(mfccs)
            except Exception as e:
                # Best effort: one unreadable clip should not stop the batch.
                print('FILE ERROR: ' + filename)
                print(e)
    except Exception as e:
        # A failure at the directory level (e.g. it cannot be entered or
        # listed): report why and move on to the next class directory.
        print('SKIPPING DIRECTORY: ' + dir_name)
        print(e)

In [None]:
## Continue to the next notebook to train and run the applause classifier.