In [None]:
%matplotlib inline
import matplotlib.pylab as plt

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

from datetime import datetime
from scipy.cluster.hierarchy import dendrogram, linkage
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from math import sqrt
from scipy.spatial.distance import squareform

In [None]:
# Load data/50words_TEST.csv. header=None: the first line is data, not column names.
# index_col=0: the first CSV column becomes the row index; later cells filter on
# `words.index == <type>`, i.e. they treat that index as the word-type label.
words = pd.read_csv('data/50words_TEST.csv', index_col = 0, header = None)

In [None]:
# Inspect the row index (the first CSV column, because of index_col=0 at load time).
words.index

### Let's take a look at some of the words 'on average'

In [None]:
# Average every column per index label once, up front. The original re-ran the
# whole groupby/mean inside the loop, i.e. seven times for the same result.
mean_by_type = words.groupby(words.index).mean()

# Overlay the mean curves of the first seven labels (in sorted label order,
# which is the order groupby returns them in).
for i in range(7):
    row = mean_by_type.iloc[i]
    row.plot(label='type {}'.format(mean_by_type.index[i]))

# Without a legend the seven overlaid curves cannot be told apart.
plt.legend();

### We can also check to see whether the 'average' matches the individual plot for a given type

First, the 'average'

In [None]:
word_type = 7
# Select by label, not by position: `.iloc[word_type-1]` only lands on label 7
# when the labels are exactly 1..N with none missing. `.loc` always returns the
# row for this label, and raises KeyError (instead of silently plotting another
# class) if the label is absent.
row = words.groupby(words.index).mean().loc[word_type]
row.plot()

In [None]:
word_type = 7
# Same as the mean plot, but using the per-label median, which is less sensitive
# to a few atypical recordings. Selected by label (`.loc`) rather than by
# position so the curve shown is guaranteed to belong to `word_type`.
row = words.groupby(words.index).median().loc[word_type]
row.plot()

Next, the full sample of all the words of that type

In [None]:
# Filter once: the original rebuilt the boolean mask and the filtered frame for
# the range() bound and then again on every single iteration.
samples_of_type = words[words.index == word_type]

# Overlay every individual recording whose index label equals `word_type`.
for _, row in samples_of_type.iterrows():
    row.plot()

### Let's code up a sensible distance function to describe the distance between two time series

In [None]:
# courtesy of http://alexminnaar.com/time-series-classification-and-clustering-with-python.html
# %load snippets/dtwdistance.py
# NOTE(review): the %load line above is commented out and no code followed it, so
# DTWDistance was undefined on a fresh kernel. The definition below is inlined so
# the notebook survives Restart & Run All. It is reconstructed from the credited
# article; confirm it matches snippets/dtwdistance.py.
def DTWDistance(s1, s2):
    """Dynamic-time-warping distance between two 1-D numeric sequences.

    The local cost of pairing s1[i] with s2[j] is their squared difference; the
    result is the square root of the cheapest monotone alignment's total cost.

    Parameters
    ----------
    s1, s2 : indexable sequences of numbers (lists, NumPy arrays, ...).
        They may have different lengths.

    Returns
    -------
    float
        0.0 when both sequences are empty, inf when exactly one is empty.
    """
    n, m = len(s1), len(s2)
    inf = float('inf')

    # Only the previous row of the cumulative-cost table is needed, so keep two
    # rows instead of the full n x m table. Column 0 is the "nothing consumed
    # from s2 yet" border; it is reachable only before any s1 item is consumed.
    prev = [0.0] + [inf] * m
    for i in range(n):
        curr = [inf] * (m + 1)
        a = s1[i]
        for j in range(m):
            cost = (a - s2[j]) ** 2
            # Extend the cheapest of: advance s1 only, advance s2 only, advance both.
            curr[j + 1] = cost + min(prev[j + 1], curr[j], prev[j])
        prev = curr

    return sqrt(prev[m])

In [None]:
# Take two recordings that share the label `word_type` (positions 2 and 3 within
# that subset) and measure how far apart DTW considers them.
chosen_words = words.loc[words.index == word_type]
s1, s2 = chosen_words.iloc[2], chosen_words.iloc[3]
print(type(s1))
DTWDistance(s1.values, s2.values)

In [None]:
# s3 is simply the first row of the dataset; compare it against s1 with DTW.
s3 = words.iloc[0]
DTWDistance(s1.values, s3.values)

In [None]:
# s3 is (again) the first row of the dataset; compare it against s2 with DTW.
s3 = words.iloc[0]
DTWDistance(s2.values, s3.values)

In [None]:
# Overlay the three series compared above. Each line gets a label and the plot
# a legend; without them the three curves cannot be told apart.
plt.plot(s1, label='s1')
plt.plot(s2, label='s2')
plt.plot(s3, label='s3')
plt.legend();

### Compare the performance of Euclidean distance with that of DTW for s1, s2, s3 as specified above

In [None]:
# %load snippets/euclidedistance.py
# NOTE(review): the %load line above is commented out and no code followed it, so
# EuclidDistance was undefined on a fresh kernel. The definition below is inlined
# so the notebook survives Restart & Run All; confirm it matches
# snippets/euclidedistance.py.
def EuclidDistance(s1, s2):
    """Euclidean (L2) distance between two equal-length numeric sequences.

    Parameters
    ----------
    s1, s2 : array-likes of numbers with identical shape.

    Returns
    -------
    float
        sqrt(sum((s1 - s2) ** 2)).

    Raises
    ------
    ValueError
        If the shapes differ. Without this check NumPy broadcasting could
        silently pair a length-1 series with every point of the other one.
    """
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            'EuclidDistance needs equal-shaped inputs, got {} and {}'.format(a.shape, b.shape)
        )
    return sqrt(np.sum((a - b) ** 2))

In [None]:
# Euclidean distance between s1 and s2 (the two recordings that share a label).
EuclidDistance(s1.values, s2.values)

In [None]:
# Euclidean distance between s3 (first row of the dataset) and s2.
EuclidDistance(s3.values, s2.values)

In [None]:
# Euclidean distance between s1 and s3 (first row of the dataset).
EuclidDistance(s1.values, s3.values)

### Can you classify a random row by determining which 'mean' curve it is closest to? How successful is this?

In [None]:
# Map every word-type label to its mean curve (a Series over the time columns).
#
# The labels come from the data itself. The previous
# `range(1, len(set(words.index)))` stopped one short, so the last class never
# got a mean curve and could never be predicted. It also assumed the labels are
# exactly 1..N.
mean_curve = {
    label: curve
    for label, curve in words.groupby(words.index).mean().iterrows()
}

In [None]:
# The curve to classify: the recording at position 3 among the rows labelled 5.
test_word = words[words.index == 5].iloc[3]

In [None]:
# DTW distance from the test curve to each per-type mean curve, keyed by type label.
distance_dict = dict(
    (label, DTWDistance(test_word.values, curve.values))
    for label, curve in mean_curve.items()
)


In [None]:
from collections import OrderedDict

# Rank the candidate labels from nearest to farthest mean curve; the first entry
# is the predicted word type for the test curve.
OrderedDict(
    sorted(
        distance_dict.items(),
        key=lambda label_and_distance: label_and_distance[1],
    )
)

### Can you cluster the words using the DTW metric?

In [None]:
# Yes, but this would take a really long time, so we're not going to do it

### Instead cluster with features

In [None]:
# One accumulator list per feature column; the loop further down appends one
# entry to each list for every processed curve.
# NOTE: this rebinds `word_type`, which earlier cells use as an integer label.
(
    max_location,
    first_local_max_location,
    second_local_max_location,
    num_inflections,
    ratio_first_local_max_to_abs_max,
    word_type,
) = ([] for _ in range(6))

def smooth(y, box_pts):
    """Smooth `y` with a moving average over `box_pts` consecutive samples.

    Parameters
    ----------
    y : 1-D array-like of numbers.
    box_pts : int
        Width of the averaging window.

    Returns
    -------
    numpy.ndarray
        `y` convolved with a uniform kernel that sums to 1. `mode='same'` keeps
        the output as long as the longer input, so near both ends the window
        hangs over the edge and the averages there are pulled toward zero.
    """
    weights = np.ones(box_pts)
    weights /= box_pts
    return np.convolve(y, weights, mode='same')

# Tunable constants for the feature extraction below.
SMOOTH_WINDOW = 10      # width (in samples) of the box filter handed to smooth()
EDGE_MARGIN = 5         # extrema this close to either end of a curve are ignored
MISSING_LOCATION = -1   # placeholder when a curve lacks a 1st / 2nd interior local max


def _interior_extrema(curve, margin):
    """Locate strict local minima and maxima of `curve`, ignoring both edges.

    Parameters
    ----------
    curve : 1-D numpy.ndarray
    margin : int
        Only positions p with margin < p < len(curve) - margin are reported.
        For 270-sample curves and margin=5 this is the window 5 < p < 265.

    Returns
    -------
    (lows, highs) : tuple of 1-D integer arrays of positions, ascending.
    """
    rising = curve[1:] > curve[:-1]    # rising[k]:  curve[k] < curve[k + 1]
    falling = curve[1:] < curve[:-1]   # falling[k]: curve[k] > curve[k + 1]

    # Position p is a low when the curve falls into p and rises out of it; a high
    # is the mirror image. The padded True values make the end points candidates,
    # but the margin filter below discards them.
    lows = np.where(np.r_[True, falling] & np.r_[rising, True])[0]
    highs = np.where(np.r_[True, rising] & np.r_[falling, True])[0]

    upper = len(curve) - margin
    lows = lows[(lows > margin) & (lows < upper)]
    highs = highs[(highs > margin) & (highs < upper)]
    return lows, highs


# Visit every ROW (one curve per row). The original looped over
# range(words.shape[1]), i.e. the number of columns, so only that many rows were
# processed.
for label, w in words.iterrows():
    word_type.append(label)

    raw = np.array(w, dtype=float)

    # Position (0-based) of the absolute maximum of the raw curve. The original
    # stored w.idxmax(), a column LABEL, and then used it as a positional index
    # into the smoothed array: off by one whenever labels do not start at 0, and
    # an IndexError when the maximum sits in the last column. Every location
    # feature is now in the same positional coordinate system.
    max_pos = int(np.argmax(raw))
    max_location.append(max_pos)

    # Local extrema are searched on a smoothed copy so sample-level noise does
    # not register as peaks.
    w_arr = smooth(raw, SMOOTH_WINDOW)
    lows, highs = _interior_extrema(w_arr, EDGE_MARGIN)

    # A curve can have fewer than two interior maxima; the original raised
    # IndexError on highs[0] / highs[1] in that case.
    first_high = highs[0] if len(highs) > 0 else MISSING_LOCATION
    second_high = highs[1] if len(highs) > 1 else MISSING_LOCATION
    first_local_max_location.append(first_high)
    second_local_max_location.append(second_high)

    # Smoothed height of the first local max relative to the smoothed height at
    # the raw maximum's position; 0.0 when there is no interior local max.
    if len(highs) > 0:
        ratio = w_arr[first_high] / w_arr[max_pos]
    else:
        ratio = 0.0
    ratio_first_local_max_to_abs_max.append(ratio)

    # Despite the name, this counts interior turning points (maxima + minima).
    num_inflections.append(len(highs) + len(lows))

In [None]:
# Assemble the per-curve feature lists into one table: one row per processed
# curve, one column per feature, plus the true label in `word_type`.
results = pd.DataFrame(dict(
    word_type=word_type,
    max_location=max_location,
    first_local_max_location=first_local_max_location,
    second_local_max_location=second_local_max_location,
    num_inflections=num_inflections,
    ratio_first_local_max_to_abs_max=ratio_first_local_max_to_abs_max,
))

In [None]:
# Preview the first rows of the feature table.
results.head()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster labels change from run to run; pin it for reproducibility.
estimator = KMeans(n_clusters=50, random_state=0)

# Cluster on the shape features only. `word_type` is the ground truth and must
# not leak in. `labels` exists only if this notebook's later cell has already
# run; dropping it too (errors='ignore' covers the first run) keeps a re-run of
# this cell from clustering on its own previous output.
estimator.fit(results.drop(['word_type', 'labels'], axis=1, errors='ignore'))

In [None]:
# Attach each row's assigned cluster id as a new `labels` column (mutates `results` in place).
results['labels'] = estimator.labels_

In [None]:
# Inspect the rows whose true label is 5, to see how they were spread across clusters.
results[results.word_type == 5]