# Vowel analysis using Hillenbrand et al. 1995

The goal of this exercise is to take the vowel measurements from Hillenbrand *et al.* 1995, cluster the vowels with supervised and unsupervised models, and plot the results. This notebook first cleans and formats the data in order to do so.

# Importing data
Steps:  
1. Define function for importing data  
2. Remove rows that contain at least one missing value (a value of `0` is treated as missing)  
3. Reindex the resulting `DataFrame`

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the formatted Hillenbrand CSV. The leading '~' is
# expanded with os.path.expanduser when the full file path is built below.
data_path = '~/GitHub/hillenbrand-vowel-clustering/'

def parse_data(filename):
    """Load a CSV file into a DataFrame, using its first column as the index.

    Parameters
    ----------
    filename : str or file-like
        Path (or open buffer) handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; column 0 of the file becomes the row index.
    """
    return pd.read_csv(filename, index_col=0)

hillenbrand_file = os.path.expanduser(data_path + 'hillenbrand-vowel-formatted.csv')

# A value of 0 is treated as a missing measurement: turn every 0 into NaN and
# drop each row that contains at least one NaN, then renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index in one step; it replaces the
# earlier reset_index() + drop('Index', 1) pair, whose positional `axis`
# argument is no longer accepted by current pandas releases.
hillenbrand_data = (
    parse_data(hillenbrand_file)
    .replace(0, np.nan)
    .dropna()
    .reset_index(drop=True)
)

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,F1_50,F2_50,F3_50,F1_80,F2_80,F3_80
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,671,1992,2659,685,1773,2680
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,636,1881,2455,628,1793,2451
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,720,1750,2435,757,1563,2527
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,666,1829,2499,689,1696,2556
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,624,1925,2569,626,1791,2577


# ID columns

## Sex, Vowel and Word columns
Next, the observations will be identified by speaker Sex as well as the Vowel and Word the measurements were taken from.

In [3]:
# Sex column: the first character of each ID identifies the talker group.
sex_by_prefix = {'m': 'male', 'w': 'female', 'b': 'boy', 'g': 'girl'}

# Vectorised lookup instead of a row-by-row `.ix` loop (`.ix` was removed in
# pandas 1.0). IDs whose first character is not in the table are left as NaN.
hillenbrand_data['Sex'] = hillenbrand_data['ID'].str[0].map(sex_by_prefix)

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,F1_50,F2_50,F3_50,F1_80,F2_80,F3_80,Sex
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,671,1992,2659,685,1773,2680,male
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,636,1881,2455,628,1793,2451,male
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,720,1750,2435,757,1563,2527,male
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,666,1829,2499,689,1696,2556,male
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,624,1925,2569,626,1791,2577,male


In [4]:
# Word column: the last two characters of each ID are the vowel code, and each
# vowel code corresponds to one recorded word.
# NOTE(review): 'boat' is the only label here that is not an h-V-d word --
# confirm against the word list of the source study before relying on it.
word_by_vowel_code = {
    'ae': 'had', 'ah': 'hod', 'aw': 'hawed', 'eh': 'head',
    'er': 'heard', 'ei': 'haid', 'ih': 'hid', 'iy': 'heed',
    'oa': 'boat', 'oo': 'hood', 'uh': 'hud', 'uw': 'whod',
}

# Vectorised lookup instead of a row-by-row `.ix` loop (`.ix` was removed in
# pandas 1.0). IDs with an unknown two-character suffix are left as NaN.
hillenbrand_data['Word'] = hillenbrand_data['ID'].str[-2:].map(word_by_vowel_code)

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,F1_50,F2_50,F3_50,F1_80,F2_80,F3_80,Sex,Word
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,671,1992,2659,685,1773,2680,male,had
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,636,1881,2455,628,1793,2451,male,had
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,720,1750,2435,757,1563,2527,male,had
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,666,1829,2499,689,1696,2556,male,had
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,624,1925,2569,626,1791,2577,male,had


In [5]:
# Vowel column (redundant, but easy to understand): the two-character suffix
# of the ID, kept only when it is one of the known vowel codes.
vowel_codes = ['ae', 'ah', 'aw', 'eh', 'er', 'ei',
               'ih', 'iy', 'oa', 'oo', 'uh', 'uw']

id_suffix = hillenbrand_data['ID'].str[-2:]

# `.where` keeps the suffix where the condition holds and writes NaN elsewhere,
# matching the old `.ix` loop, which left unmatched rows unset (`.ix` was
# removed in pandas 1.0).
hillenbrand_data['Vowel'] = id_suffix.where(id_suffix.isin(vowel_codes))

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,F1_50,F2_50,F3_50,F1_80,F2_80,F3_80,Sex,Word,Vowel
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,671,1992,2659,685,1773,2680,male,had,ae
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,636,1881,2455,628,1793,2451,male,had,ae
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,720,1750,2435,757,1563,2527,male,had,ae
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,666,1829,2499,689,1696,2556,male,had,ae
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,624,1925,2569,626,1791,2577,male,had,ae


## Mapping vowel characteristics
Next, attributes such as vowel height, position, and rounding will be mapped.

In [6]:
# OrderedDict is not needed in this cell any more (the two lookup tables below
# are plain dicts, as they already were after being reassigned), but the import
# is kept because the target-mapping cell further down uses it.
from collections import OrderedDict

# Vowel code -> vowel height label.
height_dict = {
    'ae': 'near-open', 'ah': 'near-open', 'aw': 'open', 'eh': 'open-mid',
    'er': 'mid', 'ei': 'close-mid', 'ih': 'close-mid', 'iy': 'close',
    'oa': 'close-mid', 'oo': 'close-mid', 'uh': 'open-mid', 'uw': 'close',
}

# Vowel code -> vowel position (backness) label.
# NOTE(review): 'iy' (the vowel of 'heed') is labelled 'back' here, although it
# is normally described as a front vowel -- confirm before relying on Position.
pos_dict = {
    'ae': 'front', 'ah': 'central', 'aw': 'back', 'eh': 'front',
    'er': 'central', 'ei': 'front', 'ih': 'front', 'iy': 'back',
    'oa': 'back', 'oo': 'back', 'uh': 'back', 'uw': 'back',
}

hillenbrand_data['Height'] = hillenbrand_data.Vowel.map(height_dict)
hillenbrand_data['Position'] = hillenbrand_data.Vowel.map(pos_dict)

# Rounding flag: 1 for the vowel codes listed below, 0 for every other row.
# Computed in one vectorised step; the previous loop relied on `xrange`
# (Python 2 only) and `.ix` (removed in pandas 1.0).
# NOTE(review): 'ae' is flagged as rounded while 'oo' is not -- this looks like
# it may be a mix-up; confirm the intended set.
rounded_vowels = ['ae', 'aw', 'oa', 'uw']
hillenbrand_data['Rounding'] = hillenbrand_data.Vowel.isin(rounded_vowels).astype(int)
		


In [7]:
# Preview the first rows, now including the Height, Position and Rounding columns.
hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,...,F3_50,F1_80,F2_80,F3_80,Sex,Word,Vowel,Height,Position,Rounding
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,...,2659,685,1773,2680,male,had,ae,near-open,front,1
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,...,2455,628,1793,2451,male,had,ae,near-open,front,1
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,...,2435,757,1563,2527,male,had,ae,near-open,front,1
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,...,2499,689,1696,2556,male,had,ae,near-open,front,1
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,...,2569,626,1791,2577,male,had,ae,near-open,front,1


## Mapping targets
The last step will be to map target values to use for supervised learning.

In [8]:
# Encode the categorical labels as integer targets.

# Sex uses a fixed, hand-written coding.
sex_dict = {'male': 0, 'female':1, 'boy':2, 'girl': 3}

# np.unique returns the distinct labels in sorted order, so each word / vowel
# receives the integer equal to its position in that sorted list.
word_list = np.unique(hillenbrand_data.Word.values.tolist())
vowel_list = np.unique(hillenbrand_data.Vowel.values.tolist())

word_dict = OrderedDict((label, code) for code, label in enumerate(word_list))
vowel_dict = OrderedDict((label, code) for code, label in enumerate(vowel_list))

hillenbrand_data['SexTarget'] = hillenbrand_data['Sex'].map(sex_dict)
hillenbrand_data['WordTarget'] = hillenbrand_data['Word'].map(word_dict)
hillenbrand_data['VowelTarget'] = hillenbrand_data['Vowel'].map(vowel_dict)

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,...,F3_80,Sex,Word,Vowel,Height,Position,Rounding,SexTarget,WordTarget,VowelTarget
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,...,2680,male,had,ae,near-open,front,1,0,1,0
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,...,2451,male,had,ae,near-open,front,1,0,1,0
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,...,2527,male,had,ae,near-open,front,1,0,1,0
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,...,2556,male,had,ae,near-open,front,1,0,1,0
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,...,2577,male,had,ae,near-open,front,1,0,1,0


# Normalizing features
In order to avoid scaling issues when implementing the machine learning algorithms, the features will be scaled. To keep things simple, the z-score will be used. This could be done within groups, but that might be too granular of an analysis. The formant ratios will also be calculated.

In [10]:
from scipy import stats

# (source column, z-scored column) pairs, in the order the new columns are added.
zscore_columns = [
    ('F0', 'F0_zscore'),
    ('F1', 'F1_zscore'),
    ('F2', 'F2_zscore'),
    ('F3', 'F3_zscore'),
    ('F4', 'F4_zscore'),
    ('F1_20', 'F1_20_zscore'),
    ('F2_20', 'F2_20_zscore'),
    ('F3_20', 'F3_20_zscore'),
    ('F1_80', 'F1_80_zscore'),
    ('F2_80', 'F2_80_zscore'),
    ('F3_80', 'F3_80_zscore'),
]

# Each z-score is computed over the whole column (no per-group scaling).
for source_column, scaled_column in zscore_columns:
    hillenbrand_data[scaled_column] = stats.zscore(hillenbrand_data[source_column])

# additional features: formant ratios, as (new column, numerator, denominator)
ratio_columns = [
    ('F1_F2_ratio', 'F1', 'F2'),
    ('F1_F3_ratio', 'F1', 'F3'),
    ('F2_F3_ratio', 'F2', 'F3'),
]

for ratio_column, numerator, denominator in ratio_columns:
    hillenbrand_data[ratio_column] = np.divide(
        hillenbrand_data[numerator].values,
        hillenbrand_data[denominator].values,
    )

hillenbrand_data.head()

Unnamed: 0,ID,Duration,F0,F1,F2,F3,F4,F1_20,F2_20,F3_20,...,F4_zscore,F1_20_zscore,F2_20_zscore,F3_20_zscore,F1_80_zscore,F2_80_zscore,F3_80_zscore,F1_F2_ratio,F1_F3_ratio,F2_F3_ratio
0,m01ae,323,174,663,2012,2659,3691,669,2008,2671,...,-0.676212,0.441193,0.49634,-0.357916,0.705378,-0.072264,-0.351086,0.329523,0.249342,0.756675
1,m02ae,250,102,628,1871,2477,3489,627,1871,2456,...,-1.098747,0.176182,0.263107,-0.860731,0.334264,-0.034458,-0.890092,0.335649,0.253532,0.755349
2,m04ae,312,124,627,1910,2488,3463,629,1882,2460,...,-1.153132,0.188802,0.281834,-0.851377,1.174153,-0.469231,-0.711208,0.328272,0.25201,0.767685
3,m06ae,254,115,647,1864,2561,3506,642,1866,2557,...,-1.063187,0.270829,0.254595,-0.624525,0.731421,-0.217819,-0.642949,0.347103,0.252636,0.727841
4,m07ae,254,96,582,1999,2567,3754,592,1958,2568,...,-0.544431,-0.04466,0.411219,-0.5988,0.321242,-0.038239,-0.593521,0.291146,0.226724,0.77873


# Feature matrix creation
The last step is to create the matrices that will be used. In order to do this, the proper columns must be selected and converted to `NumPy` matrices. These are the data that will be used when learning clusters.

In [11]:
# Z-scored formant features: the base measurements first, then the two extra
# groups suffixed _20 and _80.
formant_columns = (
    ['F0_zscore', 'F1_zscore', 'F2_zscore', 'F3_zscore', 'F4_zscore']
    + ['F1_20_zscore', 'F2_20_zscore', 'F3_20_zscore']
    + ['F1_80_zscore', 'F2_80_zscore', 'F3_80_zscore']
)

# Pairwise formant-ratio features.
formant_ratio_columns = [
    'F1_F2_ratio',
    'F1_F3_ratio',
    'F2_F3_ratio',
]

def create_np_matrix(input_data):
    """Convert tabular data to a 2-D ``float64`` NumPy array.

    Parameters
    ----------
    input_data : pandas.DataFrame or array-like
        Numeric data; every value must be convertible to float.

    Returns
    -------
    numpy.ndarray
        A new ``float64`` array holding a copy of the values.

    Raises
    ------
    ValueError
        If any value cannot be converted to float.
    """
    # np.array performs the conversion and the copy in one step. It replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    return np.array(input_data, dtype=np.float64)

# Select each column group with `.loc` (label-based; `.ix` was removed in
# pandas 1.0) and convert it to a float64 NumPy matrix.

# Formant-ratio features.
formant_ratio_data = hillenbrand_data.loc[:, formant_ratio_columns]
formant_ratio_mtx = create_np_matrix(formant_ratio_data)

# Z-scored formant features.
formant_data = hillenbrand_data.loc[:, formant_columns]
formant_mtx = create_np_matrix(formant_data)

# Integer-coded targets, one column per labelling (sex, word, vowel).
target_columns = ['SexTarget', 'WordTarget', 'VowelTarget']
target_data = hillenbrand_data.loc[:, target_columns]
target_mtx = create_np_matrix(target_data)


In [15]:
import IPython
import sys
import scipy as scipy




In [16]:
# Report the environment this notebook was run with.
# Each line is built as one formatted string and passed to print() as a single
# argument, which gives the same text on Python 2 and Python 3 (the original
# `print a, b` statement form is a syntax error on Python 3).
version_report = [
    ('IPython version: ', IPython.__version__),
    ('Platform: ', sys.platform),
    ('NumPy version: ', np.__version__),
    ('pandas version: ', pd.__version__),
    ('SciPy stats version: ', scipy.__version__),
]

for label, value in version_report:
    print('{} {}'.format(label, value))

IPython version:  3.2.1
Platform:  darwin
NumPy version:  1.9.2
pandas version:  0.16.2
SciPy stats version:  0.15.1
