# Scotch Exploration

Playing with the Scotch whisky dataset to see what we can do with it.

References

* https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt
* http://wonkviz.tumblr.com/post/72400253092/whiskey-data-sleuthing-with-help-from-reddit
* http://blog.revolutionanalytics.com/2013/12/k-means-clustering-86-single-malt-scotch-whiskies.html

In [None]:
%matplotlib inline

In [None]:
from __future__ import division
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure how pandas prints DataFrames in this notebook.
display_options = {
    'display.max_columns': 150,
    'display.max_rows': 25,
    'display.width': 100,
    'display.max_colwidth': 1024,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

Get the whiskey data from https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt.

In [None]:
#!wget -P datasets https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt

In [None]:
# Load the local copy of whiskies.txt into a DataFrame.
df = pd.read_csv('datasets/whiskies.txt')

In [None]:
# Show the first row to check which columns were parsed.
df.head(1)

## PCA

In [None]:
import sklearn.datasets
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.preprocessing import StandardScaler

In [None]:
# Build the feature matrix by removing the identifier and location columns.
# The leading spaces in ' Latitude' / ' Longitude' are deliberate — presumably
# that is how the names appear in the file header.
non_feature_columns = ['RowID', 'Distillery', 'Postcode', ' Latitude', ' Longitude']
X = df.drop(non_feature_columns, axis=1)

In [None]:
# Summary statistics for each remaining column of the feature matrix.
X.describe()

In [None]:
# X_std = StandardScaler().fit_transform(X)

In [None]:
# pd.DataFrame(X_std, columns=X.columns).describe()

In [None]:
# Number of principal components to keep; also used later as the insert
# position when the cluster column is appended to the projected data.
n_components = 5

In [None]:
# Fit PCA on X and project every row onto the retained components.
# X is used as-is: the StandardScaler step earlier in the notebook is commented out.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

In [None]:
# Compare the shape of the data before and after the projection.
X.shape, X_pca.shape

In [None]:
# Round to three decimals so the tables displayed below stay readable.
weights = pca.components_.round(3)           # per-component weights on each feature
ev = pca.explained_variance_ratio_.round(3)  # explained-variance ratio per component

In [None]:
# Display the rounded explained-variance ratios.
ev

In [None]:
# Tabulate the rounded component weights, labelling columns with X's column names.
pca_df = pd.DataFrame(weights, columns=X.columns)

In [None]:
# Display the component-weight table.
pca_df

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and other
# libraries — consider narrowing it with a `category=` argument.
warnings.simplefilter(action = "ignore")

In [None]:
# Give each principal component a readable name built from the three features
# with the largest absolute weight, e.g. "Smoky/neg-Honey/Body".
# A feature whose weight is not positive is prefixed with 'neg-'.
component_names = []
pca_df_t = pca_df.T  # transpose so that each column is one component
for col in pca_df_t:
    component = pca_df_t[col]
    # Series.order() has been removed from pandas; sort_values() is the replacement.
    order = component.abs().sort_values(ascending=False)
    top = order.head(3)
    # Rank by magnitude, but look the sign up on the original (signed) weights.
    component_name = [name if component[name] > 0 else ('neg-' + name) for name in top.index]
    component_names.append('/'.join(component_name))

In [None]:
# Use the generated names as row labels of the weight table, then display it.
pca_df.index = component_names
pca_df

## k-Means

In [None]:
# Fit k-means for 1..19 clusters and record the inertia of each fit.
# The leading NaN pads position 0 so that the list index equals the number of
# clusters when the list is plotted against its own index.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
inertia = [np.nan]
for i in range(1, 20):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

In [None]:
# Plot inertia against the number of clusters (list index == cluster count).
ax = plt.subplot(1, 1, 1)
ax.plot(inertia, marker='o', linestyle='-')
ax.set_ylabel('inertia')
ax.set_xlabel('# clusters')

In [None]:
# Fit the final k-means model with the chosen number of clusters.
# NOTE(review): no random_state is passed, so cluster labels may differ between
# runs — confirm whether the persisted labels need to be reproducible.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

In [None]:
# Attach each row's k-means label to the feature frame.
# NOTE(review): this adds a non-feature column to X, so any later cell that
# passes X to a model or metric will also see 'cluster'.
X['cluster'] = kmeans.labels_

In [None]:
# Number of rows assigned to each cluster.
X.cluster.value_counts()

## Plot PCA Points w/ Cluster Info

In [None]:
# Add the k-means labels to the projected data as one extra column, inserted
# at column position n_components.
cluster_labels = kmeans.labels_
X_pca_clustered = np.insert(X_pca, n_components, values=cluster_labels, axis=1)

In [None]:
# Wrap the projected data in a DataFrame: one column per named component plus 'cluster'.
pca_cluster_df = pd.DataFrame(X_pca_clustered, columns=component_names + ['cluster'])

In [None]:
# Pairwise plots of the principal components, coloured by k-means cluster.
# `height` sets the size of each facet; seaborn renamed this keyword from
# `size` in v0.9, so the old spelling fails on current releases.
g = sns.PairGrid(pca_cluster_df, hue='cluster', vars=component_names, height=3)
g.map_diag(plt.hist)        # histograms on the diagonal
g.map_offdiag(plt.scatter)  # scatter plots in every off-diagonal cell
g.add_legend()

## Similarities

In [None]:
# Pairwise Euclidean distances between rows of the feature matrix.
# By this point X also carries the k-means 'cluster' column; it is an arbitrary
# label rather than a feature, so exclude it from the distance computation.
# errors='ignore' keeps the cell working if it runs before the labels are attached.
dist = sklearn.metrics.pairwise.euclidean_distances(
    X.drop('cluster', axis=1, errors='ignore')
)

In [None]:
# Pairwise cosine similarity between rows of the feature matrix.
# By this point X also carries the k-means 'cluster' column; it is an arbitrary
# label rather than a feature, so exclude it from the similarity computation.
# errors='ignore' keeps the cell working if it runs before the labels are attached.
sim = sklearn.metrics.pairwise.cosine_similarity(
    X.drop('cluster', axis=1, errors='ignore')
)

In [None]:
# Label the distance matrix with distillery names on both axes.
distillery_names = df['Distillery']
dist_df = pd.DataFrame(dist, index=distillery_names, columns=distillery_names)

In [None]:
# Label the similarity matrix with distillery names on both axes.
distillery_names = df['Distillery']
sim_df = pd.DataFrame(sim, index=distillery_names, columns=distillery_names)

In [None]:
# Cluster label for each row, keyed by distillery name.
# Built as a new Series rather than by re-indexing the object returned by
# X.cluster, so the column held by X is never modified as a side effect.
cluster_s = pd.Series(X['cluster'].values, index=df['Distillery'], name='cluster')

In [None]:
# Distilleries ranked from most to least similar to Ardbeg.
# Series.order() has been removed from pandas; sort_values() is the replacement.
sim_df['Ardbeg'].sort_values(ascending=False)

In [None]:
# Take the columns at positions 1-12 (the distillery name and the columns that
# follow it) and index the result by distillery.
# .iloc replaces the removed .ix indexer; the slice stays positional and end-exclusive.
# NOTE(review): 1:13 stops before position 13 — confirm no feature column sits there.
features_df = df.iloc[:, 1:13].set_index('Distillery')

## Persist

Add the cluster info to the features DataFrame so we only have to persist one file for both.

In [None]:
# Assignment aligns on the index, so cluster_s must be indexed by distillery
# name, as features_df is.
features_df['cluster'] = cluster_s

In [None]:
# Write the features (with the cluster column) to disk as a pickled DataFrame.
features_df.to_pickle('datasets/features.dataframe')

In [None]:
# Write the distillery-by-distillery similarity matrix to disk as a pickled DataFrame.
sim_df.to_pickle('datasets/sims.dataframe')