# Data for Principal Component Analysis

I use this notebook to create a randomized ice cream sundae data set.  We will perform PCA on this and look at how many dimensions we are able to reduce, how the data is transformed to new coordinates based on the PCs, and how to interpret relationships in the data by examining the PCs.  We can also check for nonlinearity in the data set that would inform us that PCA is not a good choice for interpretation of this data.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.decomposition import PCA
import seaborn as sns

%matplotlib notebook

In [2]:
# Create the data set.  These random numbers are not seeded, so the resulting data will vary everytime this is run.

#sundae ingredients: ice cream, chocolate syrup, caramel syrup, marshmallow syrup, whipped cream, nuts, cherry, banana
numrows = 1000
#ice cream:
flavors = {0:'sweet cream',1:'vanilla',2:'mexican vanilla',3:'triplemex'}
icecreamcat = np.random.randint(0,len(flavors),numrows)
icecreamflav = [flavors[x] for x in icecreamcat]

#syrups can come in 1/2 pump units, up to 3 (what madman would have more than 3?!)
#all values are forced to be positive, since you can't have negative toppings
hotfudge = abs(np.random.normal(2,1,numrows)) #mean 2, stdev 1
caramel = abs(1/hotfudge - np.random.normal(0,0.5,numrows)) #1/hf - mean 0, stdev 0.5
fluff = abs(hotfudge - np.random.randint(0,2,numrows)/2) #hf - random 0-2, in 0.5 increments
whip = abs(np.random.normal(1,0.2,numrows)) #mean 1, stdev 0.2
nuts = abs(np.random.normal(0.5,0.5,numrows)) #mean 0.5, stdev 0.5

#discrete number of cherries (0, 1, or 2)
cherry = np.random.randint(0,2,numrows)

#number of banana slices (estimated 20 slices per banana)
banana = np.random.randint(0,20,numrows)

#create DataFrame
sundae_df = pd.DataFrame(icecreamflav,columns=['flavor'])
sundae_df['ice_cream'] = icecreamcat
sundae_df['hot_fudge'] = hotfudge
sundae_df['caramel'] = caramel
sundae_df['marshmallow'] = fluff
sundae_df['whipped_cream'] = whip
sundae_df['nuts'] = nuts
sundae_df['cherry'] = cherry
sundae_df['banana'] = banana

In [3]:
#calorie calculator, amounts via googling

# ice cream = 137
# hot fudge = 130
# caramel = 120
# marshmallow = 91
# whipped cream = 8
# nuts = 50
# cherry = 8
# banana = 5

calories = sundae_df['ice_cream']*137 + sundae_df['hot_fudge']*130 + sundae_df['caramel']*120 + \
    sundae_df['marshmallow']*91 + sundae_df['whipped_cream']*8 + sundae_df['nuts']*50 + sundae_df['cherry']*8 \
    + sundae_df['banana']*5
sundae_df['calories'] = calories

In [4]:
sundae_df.head(10)

Unnamed: 0,flavor,ice_cream,hot_fudge,caramel,marshmallow,whipped_cream,nuts,cherry,banana,calories
0,mexican vanilla,2,2.744696,0.358941,2.744696,0.764141,2.560974,1,1,1070.812502
1,sweet cream,0,1.674893,0.227068,1.674893,0.836131,0.215874,0,16,494.882311
2,vanilla,1,0.435202,2.488417,0.435202,0.988188,0.216089,1,8,598.499741
3,vanilla,1,2.968186,0.487875,2.968186,1.054404,0.539857,1,17,979.942153
4,mexican vanilla,2,2.149873,0.065169,1.649873,0.991671,0.387153,1,5,771.733158
5,sweet cream,0,2.282432,0.517492,2.282432,0.744374,0.76752,1,12,678.84742
6,sweet cream,0,2.688277,1.431378,2.688277,0.735268,1.588567,0,5,876.185046
7,vanilla,1,4.360838,0.001483,4.360838,0.992617,0.634573,0,8,1180.592584
8,sweet cream,0,0.839862,1.276352,0.339862,0.900982,0.6175,1,14,409.354527
9,sweet cream,0,1.960831,0.722165,1.960831,1.133068,1.033991,1,12,648.767643


In [8]:
#Because calories is a calculated value based on many random numbers, we should look at the data for outliers.
sundae_df.hist(column='calories',bins=200)

<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11ca2d278>]], dtype=object)

In [6]:
# To find a good cut off for outliers, print the number of points over various cutoffs.  
# 3000 seems to be pretty robust on multiple runs of the random data set.
print(len(sundae_df[sundae_df['calories'] > 1500]))
print(len(sundae_df[sundae_df['calories'] > 2500]))
print(len(sundae_df[sundae_df['calories'] > 3000]))
print(len(sundae_df[sundae_df['calories'] > 4000]))
print(len(sundae_df[sundae_df['calories'] > 5000]))
print(len(sundae_df[sundae_df['calories'] > 8000]))

16
5
4
4
3
2


In [9]:
scatter_matrix(sundae_df, alpha=0.2, figsize=(10, 10), diagonal='kde')

<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11d742588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d9b1dd8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d9fd278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11da3af28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11da87f60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dac6630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dc44860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dc7ef98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dcca550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11dd04a58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dd56400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11dda1128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11ddbcd68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11de0c7f0>,
     

In [None]:
# Remove outliers and output data set for use in PCA.
df = sundae_df[sundae_df['calories'] < 3000] #remove outliers
df.to_csv('IceCreamSundaes.csv')