# Imports

In [225]:
import datetime
import pymongo
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import numpy as np
from collections import defaultdict

from skimage import color as skcolor

import plotly
import plotly.plotly as py
plotly.offline.init_notebook_mode(connected=True)

In [96]:
from sklearn import cluster
from sklearn import linear_model

In [41]:
def centroid_histogram(clt):
    """
    Calculates the percentage of each dominant color

    Source: pyimagesearch.com

    :param clt: kmeans cluster after fitting
    :return: percentages for each cluster
    """

    # grab the number of different clusters and create a histogram
    # based on the number of pixels assigned to each cluster
    numLabels = len(np.unique(clt.labels_))
    (hist, _) = np.histogram(clt.cluster_centers_, bins=numLabels)

    # normalize the histogram, such that it sums to one
    hist = hist.astype("float")
    hist /= hist.sum()

    # return the histogram (percentage described by each cluster)
    return hist.reshape(numLabels, 1)

# Connect to Data

##### Connect to databases

In [3]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # collection for form data
TTB_COLORS = db.COLORS # collection for the label image data
TTB_IMG_META = db.IMG_META
TTB_IMG_SUP = db.IMG_SUP

##### Load into pandas

In [4]:
df = pd.DataFrame(list(TTB.find()))
df_colors = pd.DataFrame(list(TTB_COLORS.find()))
df_img_meta = pd.DataFrame(list(TTB_IMG_META.find()))
df_img_sup = pd.DataFrame(list(TTB_IMG_SUP.find()))

##### Drop useless `_id` columns

In [5]:
df_colors = df_colors.drop(['_id'], axis=1)
df_img_meta = df_img_meta.drop(['_id'], axis=1)
df_img_sup = df_img_sup.drop(['_id'], axis=1)

##### Drop duplicates, unclear why present

In [6]:
print('df_color duplicates:  {}'.format(df_colors.shape[0] - df_colors.drop_duplicates().shape[0]))
print('df_img_meta duplicates:  {}'.format(df_img_meta.shape[0] - df_img_meta.drop_duplicates().shape[0]))
print('df_img_sup duplicates:  {}'.format(df_img_sup.shape[0] - df_img_meta.drop_duplicates().shape[0]))

df_colors = df_colors.drop_duplicates()
df_img_meta = df_img_meta.drop_duplicates()
df_img_sup = df_img_sup.drop_duplicates()

df_color duplicates:  391
df_img_meta duplicates:  119
df_img_sup duplicates:  119


##### Clean up bad values in color

In [7]:
df_colors[['r', 'g', 'b']] = df_colors[['r', 'g', 'b']].clip(lower=0.0, upper=255.0)

##### Convert recieve date to datetime

In [80]:
df['recieve_date'] = pd.to_datetime(df['recieve_date'], format='%m/%d/%Y')

##### Helper lookup for US states

In [81]:
# get list of all US states, convert to uppercase as that is what is used
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
states = [state.upper() for state in states]

us_state_abbrev = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}

# capitalized versions
abbrev_lookup=defaultdict(str)
for k, v in us_state_abbrev.items():
    abbrev_lookup[k.upper()] = v


##### Select US states only from the main df

In [82]:
us_only = df.loc[df['OriginCode'].isin(states)]
us_only = us_only.loc[df['Status'] == 'APPROVED']
us_only['_id'].count()

50736

# Analysis

## Number of colors by type of beverage

### Categorize the classes

We can group more broadly using the following categories

In [196]:
SPIRITS_MISC = ["CORDIALS (FRUIT & PEELS)", "FRUIT FLAVORED LIQUEURS", "CURACAO", "TRIPLE SEC", "FRUITS & PEELS SCHNAPPS LIQUEUR", "OTHER FRUITS & PEELS LIQUEURS", "CORDIALS (HERBS & SEEDS)", "ANISETTE, OUZO, OJEN", "COFFEE (CAFE) LIQUEUR", "KUMMEL", "PEPPERMINT SCHNAPPS", "HERBS AND SEEDS SCHNAPPS LIQUEUR", "AMARETTO", "SAMBUCA", "ARACK/RAKI/ARAK", "OTHER (HERBS & SEEDS)", "CORDIALS (CREMES OR CREAMS)", "CREME DE CACAO WHITE", "CREME DE CACAO BROWN", "CREME DE MENTHE WHITE", "CREME DE MENTHE GREEN", "CREME DE ALMOND (NOYAUX)", "DAIRY CREAM LIQUEUR/CORDIAL", "NON DAIRY CREME LIQUEUR/CORDIAL", "OTHER LIQUEUR (CREME OR CREAMS)", "SPECIALTIES & PROPRIETARIES", "OTHER SPECIALTIES & PROPRIETARIES", "CORDIALS (FRUIT & PEELS)", "FRUIT FLAVORED LIQUEURS", "CURACAO", "TRIPLE SEC", "FRUITS & PEELS SCHNAPPS LIQUEUR", "OTHER FRUIT & PEELS LIQUEURS", "CORDIALS (HERBS & SEEDS)", "ANISETTE, OUZO, OJEN", "COFFEE (CAFE) LIQUEUR", "KUMMEL", "PEPPERMINT SCHNAPPS", "HERBS & SEEDS SCHNAPPS LIQUEUR", "AMARETTO", "SAMBUCA", "ARACK/RAKI/ARAK", "OTHER HERB & SEED CORDIALS/LIQUEURS", "CORDIALS (CREMES OR CREAMS)", "CREME DE CACAO WHITE", "CREME DE CACAO BROWN", "CREME DE MENTHE WHITE", "CREME DE MENTHE GREEN", "CREME DE ALMOND (NOYAUX)a", "DAIRY CREAM LIQUEUR/CO", "NON DAIRY CREME LIQUEUR/CORDIAL", "OTHER LIQUEUR (CREMES OR CREAMS)", "SPECIALITIES & PROPRIETARIES", "OTHER SPECIALTIES & PROPRIETARIES", "COCKTAILS 48 PROOF UP", "MARGARITA (48 PROOF UP)", "COCKTAILS 48 PROOF UP (CONT)", "DAIQUIRI (48 PROOF UP)", "COLADA (48PROOF UP)", "OTHER COCTAILS (48PROOF UP)", "COCKTAILS UNDER 48 PROOF", "MARGARITA (UNDER 48 PROOF)", "OTHER TEQUILA-BASED COCKTAILS (UNDER 48 PROOF)", "COCKTAILS UNDER 48 PROOF (CONT)", "DAIQUIRI (UNDER 48 PROOF)", "COLADA (UNDER 48 PROOF)", "OTHER COCKTAILS (UNDER 48 PROOF)", "MIXED DRINKS-HI BALLS COCKTAILS", "SCREW DRIVER", "COLLINS", "BLOODY MARY", "EGG NOG", "OTHER MIXED DRINKS HI-BALLS COCKTAILS", "COCKTAILS 48 PROOF UP", "MARGARITA 48 PROOF UP", "DAIQUIRI 48 PROOF UP", "COLADA (48 PROOF UP )", "OTHER COCKTAILS (48 PROOF UP)", "COCKTAILS UNDER 48 PROOF", "MARGARITA UNDER 48 PROOF", "OTHER TEQUILA-BASED COCKTAILS (UNDER 48 PROOF)", "COCKTAILS UNDER 48 PR(CONT)", "DAIQUIRI UNDER 48 PROOF", "COLADA (UNDER 48 PROOF )", "OTHER COCKTAILS (UNDER 48 PROOF)", "MIXED DRINKS-HI BALLS COCKTAILS", "SCREW DRIVER", "COLLINS", "BLOODY MARY", "EGG NOG", "OTHER MIXED DRINKS HI-BALLS COCKTAILS", "86	VERMOUTH/MIXED TYPES", "OTHER SPIRITS", "NEUTRAL SPIRITS - GRAIN", "NEUTRAL SPIRITS - FRUIT", "NEUTRAL SPIRITS - CANE", "NEUTRAL SPIRITS - VEGETABLE", "NEUTRAL SPIRITS - PETROLEUM", "BITTERS - BEVERAGE", "GRAIN SPIRITS", "OTHER SPIRITS", "SAKE", "SAKE - DOMESTIC FLAVORED", "NON ALCOHOLIC MIXES", "OTHER SPIRITS", "NEUTRAL SPIRITS - GRAIN", "NEUTRAL SPIRITS - FRUIT", "NEUTRAL SPIRITS - CANE", "NEUTRAL SPIRITS - VEGETABLE", "NEUTRAL SPIRITS - PETROLEUM", "BITTERS - BEVERAGE*", "TEQUILA FB", "TEQUILA USB", "MEZCAL", "MEZCAL FB", "SAKE - IMPORTED", "MEZCAL US", "SAKE - IMPORTED FLAVORED", "DILUTED TEQUILA FB", "DILUTED TEQUILA USB", "NON ALCOHOL MIXES", "VERMOUTH/MIXED TYPES"]
BEER = ["BEER", "CEREAL BEVERAGES - NEAR BEER (NON ALCOHOLIC)", "OTHER MALT BEVERAGES (BEER)", "BEER", "CEREAL BEVERAGES - NEAR BEER (NON ALCOHOLIC)", "MALT BEVERAGES", "ALE", "MALT LIQUOR", "STOUT", "PORTER", "MALT BEVERAGES SPECIALITIES - FLAVORED", "MALT BEVERAGES SPECIALITIES", "OTHER MALT BEVERAGES"]
WINE = ["TABLE RED WINE", "ROSE WINE", "TABLE WHITE WINE", "TABLE FLAVORED WINE", "TABLE FRUIT WINE", "SPARKLING WINE/CHAMPAGNE", "CARBONATED WINE", "DESSERT FLAVORED WINE", "DESSERT /PORT/SHERRY/(COOKING) WINE", "DESSERT FRUIT WINE"]
BRANDY = ["CALIFORNIA GRAPE BRANDY", "CALIFORNIA BRANDY", "CALIFORNIA DRIED BRANDY", "CALIFORNIA LEES BRANDY", "CALIFORNIA POMACE OR MARC BRANDY", "CALIFORNIA RESIDUE BRANDY", "CALIFORNIA NEUTRAL BRANDY", "OTHER CALIFORNIA BRANDY", "NEW YORK GRAPE BRANDY", "NEW YORK BRANDY", "NEW YORK DRIED BRANDY", "NEW YORK LEES BRANDY", "NEW YORK POMACE OR MARC BRANDY", "NEW YORK RESIDUE BRANDY", "NEW YORK NEUTRAL BRANDY", "OTHER NEW YORK BRANDY", "OTHER DOMESTIC GRAPE BRANDY", "BRANDY", "DRIED BRANDY", "LEES BRANDY", "POMACE OR MARC BRANDY", "RESIDUE BRANDY", "NEUTRAL BRANDY", "IMMATURE BRANDY", "OTHER BRANDY", "BRANDY - FLAVORED", "BRANDY - APRICOT FLAVORED", "BRANDY - BLACKBERRY FLAVORED", "BRANDY - PEACH FLAVORED", "BRANDY - CHERRY FLAVORED", "BRANDY - COFFEE FLAVORED", "BRANDY APPLE FLAVORED", "OTHER BRANDY - FLAVORED", "FRUIT BRANDY", "APPLE BRANDY", "CHERRY BRANDY", "PLUM BRANDY", "BLACKBERRY BRANDY", "BLENDED APPLE JACK BRANDY", "PEAR BRANDY", "APRICOT BRANDY", "DILUTED BRANDY", "OTHER FRUIT BRANDY", "FRENCH BRANDY", "COGNAC (BRANDY) FB", "COGNAC (BRANDY) USB", "ARMAGNAC (BRANDY) FB", "ARMAGNAC (BRANDY) USB", "OTHER FRENCH BRANDY FB", "OTHER FRENCH BRANDY USB", "OTHER FOREIGN BRANDY", "ITALIAN GRAPE BRANDY FB", "ITALIAN GRAPE BRANDY USB", "SPANISH GRAPE BRANDY FB", "SPANISH GRAPE BRANDY USB", "PORTUGUESE GRAPE BRANDY FB", "PORTUGUESE GRAPE BRANDY USB", "GREEK GRAPE BRANDY FB", "GREEK GRAPE BRANDY USB", "GERMAN GRAPE BRANDY FB", "GERMAN GRAPE BRANDY USB", "AUSTRALIAN GRAPE BRANDY FB", "AUSTRALIAN GRAPE BRANDY USB", "SOUTH AFRICAN GRAPE BRANDY FB", "SOUTH AFRICAN GRAPE BRANDY USB", "OTHER FOREIGN BRANDY (CONT.)", "BRANDY APRICOT FLAVORED", "BRANDY BLACKBERRY FLAVORED", "BRANDY PEACH FLAVORED", "BRANDY CHERRY FLAVORED", "BRANDY COFFEE FLAVORED", "BRANDY APPLE FLAVORED", "OTHER GRAPE BRANDY (PISCO, GRAPPA) FB", "OTHER GRAPE BRANDY (GRAPPA) USB", "FOREIGN FRUIT BRANDY", "APPLE BRANDY (CALVADOS)", "CHERRY BRANDY", "PLUM BRANDY (SLIVOVITZ)", "BLACKBERRY BRANDY", "BLENDED APPLE JACK BRANDY", "APRICOT BRANDY", "DILUTED BRANDY FB", "DILUTED BRANDY USB", "OTHER FRUIT BRANDY", "BLACKBERRY FLAVORED BRANDY", "LIQUEUR & BRANDY", "FLAVORED BRANDY", "APRICOT FLAVORED BRANDY", "BLACKBERRY FLAVORED BRANDY", "PEACH FLAVORED BRANDY", "CHERRY FLAVORED BRANDY", "LIQUEUR & BRANDY", "OTHER FLAVORED BRANDY", "BRANDY STINGER (48 PROOF UP)", "BRANDY SIDE CAR (48 PROOF UP)", "BRANDY STINGER (UNDER 48 PROOF)", "BRANDY SIDE CAR (UNDER 48 PROOF)", "BRANDY STINGER 48 PROOF UP", "BRANDY SIDE CAR 48 PROOF UP", "BRANDY STINGER UNDER 48 PROOF", "BRANDY SIDE CAR UNDER 48 PROOF", "BRANDY PEAR FLAVORED"]
GIN = ["DISTILLED GIN", "LONDON DRY DISTILLED GIN", "OTHER DISTILLED GIN", "GIN", "LONDON DRY GIN", "OTHER GIN", "GIN - FLAVORED", "GIN - MINT FLAVORED", "GIN - ORANGE FLAVORED", "GIN - LEMON FLAVORED", "OTHER GIN - FLAVORED", "DILUTED GIN", "DISTILLED GIN", "LONDON DRY DISTILLED GIN FB", "LONDON DRY DISTILLED GIN USB", "OTHER DISTILLED GIN FB", "OTHER DISTILLED GIN USB", "GIN", "LONDON DRY GIN FB", "LONDON DRY GIN USB", "OTHER GIN FB", "OTHER GIN USB", "GIN - FLAVORED", "GIN - MINT FLAVORED", "GIN - ORANGE FLAVORED", "GIN - LEMON FLAVORED", "GIN - CHERRY FLAVORED", "GIN - APPLE FLAVORED", "GIN - BLACKBERRY FLAVORED", "GIN - PEACH FLAVORED", "GIN - GRAPE FLAVORED", "OTHER GIN - FLAVORED", "OTHER GIN", "DILUTED GIN FB", "DILUTED GIN USB", "BRANDY - GINGER FLAVORED", "BRANDY GINGER FLAVORED", "SLOE GIN", "GIN SPECIALTIES", "LIQUEURS (GIN)", "SLOE GIN", "GINGER FLAVORED BRANDY", "GIN SPECIALTIES", "LIQUEURS (GIN)", "GIN MARTINI (48 PROOF UP)", "GIN SOUR (48 PROOF UP)", "GIN MARTINI (UNDER 48 PROOF)", "GIN SOUR (UNDER 48 PROOF)", "GIN MARTINI 48 PROOF UP", "GIN SOUR 48 PROOF UP", "GIN MARTINI UNDER 48 PROOF", "GIN SOUR UNDER 48 PROOF"]
WHISKY = ["STRAIGHT WHISKY", "STRAIGHT BOURBON WHISKY", "STRAIGHT RYE WHISKY", "STRAIGHT CORN WHISKY", "OTHER STRAIGHT WHISKY", "WHISKY BOTTLED IN BOND (BIB)", "BOURBON WHISKY BIB", "RYE WHISKY BIB", "CORN WHISKY BIB", "OTHER WHISKY BIB", "STRAIGHT WHISKY BLENDS", "STRAIGHT BOURBON WHISKY BLENDS", "STRAIGHT RYE WHISKY BLENDS", "STRAIGHT CORN WHISKY BLENDS", "OTHER STRAIGHT BLENDED WHISKY", "WHISKY BLENDS", "BLENDED BOURBON WHISKY", "BLENDED RYE WHISKY", "BLENDED CORN WHISKY", "BLENDED LIGHT WHISKY", "BLENDED WHISKY", "DILUTED BLENDED WHISKY", "OTHER WHISKY BLENDS", "WHISKY", "BOURBON WHISKY", "RYE WHISKY", "CORN WHISKY", "LIGHT WHISKY", "WHISKY PROPRIETARY", "SPIRIT WHISKY", "DILUTED WHISKY", "OTHER WHISKY (FLAVORED)", "SCOTCH WHISKY", "SCOTCH WHISKY FB", "SCOTCH WHISKY USB", "SINGLE MALT SCOTCH WHISKY", "UNBLENDED SCOTCH WHISKY USB", "DILUTED SCOTCH WHISKY FB", "DILUTED SCOTCH WHISKY USB", "CANADIAN WHISKY", "CANADIAN WHISKY FB", "CANADIAN WHISKY USB", "DILUTED CANADIAN WHISKY FB", "DILUTED CANADIAN WHISKY USB", "IRISH WHISKY", "IRISH WHISKY FB", "IRISH WHISKY USB", "DILUTED IRISH WHISKY FB", "DILUTED IRISH WHISKY USB", "WHISKY ORANGE FLAVORED", "WHISKY GRAPE FLAVORED", "WHISKY LIME FLAVORED", "WHISKY LEMON FLAVORED", "WHISKY CHERRY FLAVORED", "WHISKY CHOCOLATE FLAVORED", "WHISKY MINT FLAVORED", "WHISKY PEPPERMINT FLAVORED", "WHISKY PTHER FLAVORED", "OTHER IMPORTED WHISKY", "OTHER IMPORTED WHISKY FB", "OTHER IMPORTED WHISKY USB", "DILUTED OTHER IMPORTED WHISKY FB", "DILUTED OTHER IMPORTED WHISKY USB", "WHISKY SPECIALTIES", "LIQUEURS (WHISKY)", "WHISKY SPECIALTIES", "LIQUEURS (WHISKY)", "WHISKY MANHATTAN (48 PROOF UP)", "WHISKY OLD FASHIONED (48 PROOF UP)", "WHISKY SOUR (48 PROOF UP )", "WHISKY MANHATTAN (UNDER 48 PROOF)", "WHISKY OLD FASHIONED (UNDER 48 PROOF)", "WHISKY SOUR (UNDER 48 PROOF)", "WHISKY MANHATTAN (48 PROOF UP)", "WHISKY OLD FASHIONED (48 PROOF UP)", "WHISKY SOUR (48 PROOF UP)", "WHISKY MANHATTAN UNDER 48 PROOF", "WHISKY OLD FASHIONED UNDER 48 PROOF", "WHISKY SOUR UNDER 48 PROOF"]
RUM = ["RUM LEMON FLAVORED", "RUM CHERRY FLAVORED", "RUM CHOCOLATE FLAVORED", "RUM MINT FLAVORED", "RUM PEPPERMINT FLAVORED", "UR.S. RUM (WHITE)", "PUERTO RICAN RUM (WHITE)", "VIRGIN ISLANDS RUM (WHITE)", "HAWAIIAN RUM (WHITE)", "FLORIDA RUM (WHITE)", "OTHER RUM (WHITE)", "U.S. RUM (GOLD)", "PUERTO RICAN RUM (GOLD)", "VIRGIN ISLAND RUM (GOLD)", "HAWAIIAN RUM (GOLD)", "FLORIDA RUM (GOLD)", "OTHER RUM (GOLD)", "RUM FLAVORED (BOLD)", "RUM ORANGE GLAVORED", "RUM GRAPE FLAVORED", "RUM LIME FLAVORED", "RUM LEMON FLAVORED", "RUM CHERRY FLAVORED", "RUM CHOCOLATE FLAVORED", "RUM MINT FLAVORED", "RUM PEPPERMINT FLAVORED", "RUM OTHER FLAVORED", "OTHER WHITE RUM", "FLAVORED RUM (BOLD)", "RUM ORANGE FLAVORED", "RUM GRAPE FLAVORED", "RUM LIME FLAVORED", "RUM OTHER FLAVORED", "DILUTED RUM (WHITE)", "DILUTED RUM (GOLD)", "DOMESTIC FLAVORED RUM", "FOREIGN RUM", "CUBAN RUM WHITE FB", "CUBAN RUM WHITE USB", "CUBAN RUM GOLD FB", "CUBAN RUM GOLD USB", "OTHER FORIEGN RUM", "JAMAICAN RUM WHITE FB", "JAMAICAN RUM WHITE USB", "JAMAICAN RUM GOLD FB", "JAMICAN RUM GOLD USB", "GUIANAN RUM WHITE FB", "GUIANAN RUM WHITE USB", "DUTCH GUIANAN RUM GOLD FB", "DUTCH GUIANAN RUM GOLD USB", "FRENCH GUIANAN RUM FB", "FRENCH GUIANAN RUM USB", "MARTINICAN RUM WHITE FB", "MARTINICAN RUM WHITE USB", "MARTINICAN RUM GOLD FB", "MARTINICAN RUM GOLD USB", "OTHER FOREIGN RUM", "OTHER RUM WHITE FB", "OTHER RUM WHITE USB", "OTHER RUM GOLD FB", "OTHER RUM GOLD USB", "DILUTED RUM WHITE FB", "DILUTED RUM WHITE USB", "DILUTED RUM GOLD FB", "DILUTED RUM GOLD USB", "IMPORTED FLAVORED RUM", "ROCK & RYE, RUM & BRANDY (ETC.)", "RUM SPECIALTIES", "LIQUEURS (RUM)", "ROCK & RYE, RUM & BRANDY (ETC)", "RUM SPECIALTIES", "LIQUEURS (RUM)"]
VODKA = ["VODKA", "VODKA 80-90 PROOF", "VODKA", "VODKA 90-99 PROOF", "VODKA", "VODKA 100 PROOF UP", "VODKA - FLAVORED", "VODKA - ORANGE FLAVORED", "VODKA - GRAPE FLAVORED", "VODKA - LIME FLAVORED", "VODKA - LEMON FLAVORED", "VODKA - CHERRY FLAVORED", "VODKA - CHOCOLATE FLAVORED", "VODKA - MINT FLAVORED", "VODKA - PEPPERMINT FLAVORED", "VODKA - OTHER FLAVORED", "OTHER VODKA", "DILUTED VODKA", "VODKA", "VODKA 80-90 PROOF FB", "VODKA 80-90 PROOF USB", "VODKA", "VODKA 90-99 PROOF FB", "VODKA 90-99 PROOF USB", "VODKA", "VODKA 100 PROOF UP FB", "VODKA 100 PROOF UP USB", "VODKA - FLAVORED", "VODKA - ORANGE FLAVORED", "VODKA - GRAPE FLAVORED", "VODKA - LIME FLAVORED", "VODKA - LEMON FLAVORED", "VODKA - CHERRY FLAVORED", "VODKA - CHOCOLATE FLAVORED", "VODKA - MINT FLAVORED", "VODKA - PEPPERMINT FLAVORED", "VODKA - OTHER FLAVORED", "OTHER VODKA", "DILUTED VODKA FB", "DILUTED VODKA USB", "VODKA SPECIALTIES", "LIQUEURS (VODKA)", "VODKA SPECIALTIES", "LIQUEURS (VODKA)", "VODKA MARTINI (48 PROOF UP)", "VODKA MARTINI (UNDER 48 PROOF)", "VODKA MARTINI 48 PROOF UP", "VODKA MARTINI UNDER 48 PROOF", "VODKA SOUR UNDER 48 PROOF"]

We can use broadly re-classify, buy using lookups and replacing values

In [197]:
us_only['gen_class'] = us_only['Class/TypeCode']
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(SPIRITS_MISC)] = 'SPIRITS_MISC'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(BEER)] = 'BEER'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(WINE)] = 'WINE'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(BRANDY)] = 'BRANDY'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(GIN)] = 'GIN'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(WHISKY)] = 'WHISKY'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(RUM)] = 'RUM'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(VODKA)] = 'VODKA'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



We can check to see if things worked properly by checking the number of unique categories

In [198]:
len(us_only['Class/TypeCode'].unique())

158

In [199]:
len(us_only.gen_class.unique())

8

### What does the distribution of these more general categories look like?

##### Select only the 2016 year to be consistent

In [200]:
mask = ('01/01/2016' <= us_only['recieve_date']) & (us_only['recieve_date'] < '01/01/2017')
us_2016 = us_only.loc[mask]
us_2016.shape

(38143, 24)

In [203]:
counts_by_genclass = us_2016[['gen_class', 'TTBID']].groupby('gen_class').count()
counts_by_genclass.reset_index(inplace=True)
counts_by_genclass.rename(columns={'TTBID': 'counts'}, inplace=True)  # rename percentage 
counts_by_genclass.sort_values('counts', axis=0, ascending=False, inplace=True)

In [206]:
plot_data = [dict(
                type='bar',
                x=counts_by_genclass['gen_class'],
                y=counts_by_genclass['counts'])]

layout = dict(
            title = 'Applications by Type',
            xaxis = dict(tickangle = -45),
            #margin = dict(b = 200)
             )

 #clean version, no labels
layout = dict(
            xaxis = dict(visible = False),
            yaxis = dict(visible = False)
             )

fig = dict( data=plot_data, layout=layout )
plotly.offline.iplot( fig, filename='TypeCounts.html' )

In [205]:
plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')

'<div id="27d9871a-be23-4831-b257-7143869c99b7" style="height: 100%; width: 100%;" class="plotly-graph-div"></div><script type="text/javascript">window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL="https://plot.ly";Plotly.newPlot("27d9871a-be23-4831-b257-7143869c99b7", [{"type": "bar", "x": ["WINE", "BEER", "WHISKY", "SPIRITS_MISC", "VODKA", "RUM", "GIN", "BRANDY"], "y": [20176, 13840, 1218, 996, 986, 414, 350, 163]}], {"title": "Applications by Type", "xaxis": {"tickangle": -45}}, {"showLink": true, "linkText": "Export to plot.ly"})</script>'

## Color space for each general class

### Merge the color space data with the requisite application data

In [73]:
colors_class = pd.merge(us_only[['TTBID', 'Class/TypeCode', 'gen_class']], df_colors, left_on='TTBID', right_on='TTBID', how='inner')

In [74]:
colors_class.head()

Unnamed: 0,TTBID,Class/TypeCode,gen_class,b,g,img_num,percentage,r
0,16001001000009,MALT BEVERAGES SPECIALITIES - FLAVORED,BEERS,57.854447,56.692722,0,0.5,60.746631
1,16001001000009,MALT BEVERAGES SPECIALITIES - FLAVORED,BEERS,239.106518,238.742448,0,0.5,239.810811
2,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,166.517781,174.014225,0,0.5,179.84495
3,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,48.377104,50.838384,0,0.5,53.707071
4,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,232.319809,234.338902,1,0.25,236.873508


### Cluster beer and wine colors

#### Attempt at clustering is largely unsuccessful

In [76]:
beer_rgb = colors_class.loc[colors_class['gen_class'] == 'BEERS']
beer_rgb.shape

(60680, 8)

In [77]:
beer_clusters = cluster.MiniBatchKMeans(n_clusters=10, max_iter=1000, batch_size=10000)
beer_clusters.fit(beer_rgb[['r', 'g', 'b']])
proportion = centroid_histogram(beer_clusters)
centers = pd.DataFrame(beer_clusters.cluster_centers_)
centers.columns = ['r', 'g', 'b']

In [78]:
scatter = dict(
    mode = "markers",
    name = "y",
    type = "scatter3d",    
    x = centers['r'], 
    y = centers['g'], 
    z = centers['b'],
    marker = dict( size=2, color="rgb(23, 190, 207)" )
)

clusters = dict(
    alphahull = 7,
    name = "y",
    opacity = 0.1,
    type = "mesh3d",    
    x = centers['r'], 
    y = centers['g'], 
    z = centers['b']
)

# clean version, no labels
#layout = dict(
#            xaxis = dict(visible = False),
#            yaxis = dict(visible = False)
#             )

fig = dict( data=[scatter, clusters], layout=layout )
plotly.offline.iplot( fig, filename='TypeCounts.html' )

### Discretize attempt

##### Discretize the color space

Calculate our bins and the labels we want associated with the bins

In [104]:
bins = np.linspace(0, 255, 10)
labels = (bins[1:] + bins[:-1]) / 2

Apply a `binned` categorical label to each of the rgb values

In [105]:
colors_class[['dr', 'dg', 'db']] = colors_class[['r', 'g', 'b']].apply(lambda x: 
                                                                       pd.cut(x, bins, labels=labels, right=True, 
                                                                              include_lowest=True))

Create tuple strings, so we can more easily count the groupings of colors

In [106]:
colors_class['str_color'] = ['rgb({}, {}, {})'.format(int(r),int(g),int(b)) for (r,g,b)
                             in zip(colors_class['dr'], colors_class['dg'], colors_class['db'])]

In [268]:
type_colors = pd.DataFrame(colors_class.groupby(['gen_class', 'str_color']).count()['TTBID'])
type_colors.columns = ['counts']
type_colors['prop_freq'] = type_colors['counts']/type_colors['counts'].groupby('gen_class').sum()
type_colors['prop_freq2'] = type_colors['prop_freq']/type_colors['prop_freq'].groupby('gen_class').max() * 255
type_colors.reset_index(inplace=True)

In [269]:
type_colors = pd.merge(type_colors, colors_class[['str_color', 'dr', 'dg', 'db']], left_on='str_color',
                                                                                   right_on='str_color')
type_colors = type_colors.drop_duplicates()

In [245]:
type_colors.loc[type_colors['gen_class'] == 'BEERS'].shape

(650, 8)

In [182]:
type_colors.head()

Unnamed: 0,gen_class,str_color,counts,prop_freq,prop_freq2,dr,dg,db
0,BEERS,"rgb(127, 127, 127)",2086,0.034377,46.546202,127.5,127.5,127.5
7934,BRANDY,"rgb(127, 127, 127)",28,0.027972,36.428571,127.5,127.5,127.5
15868,GIN,"rgb(127, 127, 127)",102,0.044252,57.8,127.5,127.5,127.5
23802,RUM,"rgb(127, 127, 127)",92,0.031978,43.045872,127.5,127.5,127.5
31736,SPIRITS_MISC,"rgb(127, 127, 127)",188,0.028637,47.0,127.5,127.5,127.5


In [207]:
cgroup = type_colors.groupby('gen_class')

scatter = []
for group in type_colors['gen_class'].unique():
    scatter.append( dict(
        mode = "markers",
        name = group,
        type = "scatter3d",    
        x = cgroup.get_group(group)['dr'], 
        y = cgroup.get_group(group)['dg'], 
        z = cgroup.get_group(group)['db'],
        marker = dict( size=np.log(cgroup.get_group(group)['prop_freq']*100000), 
                      color=cgroup.get_group(group)['str_color'] )
    )
  )

clusters = dict(
    alphahull = 7,
    name = "y",
    opacity = 0.1,
    type = "mesh3d",    
    x = centers['r'], 
    y = centers['g'], 
    z = centers['b']
)

# clean version, no labels
#layout = dict(
#            xaxis = dict(visible = False),
#            yaxis = dict(visible = False)
#             )

fig = dict( data=scatter, layout=layout )
plotly.offline.iplot( fig, filename='TypeCounts.html' )

Perhaps I could use a classifier on this set to see if a given color is more likely to be in one type or another?

In [284]:
res = skcolor.rgb2hsv(np.array(type_colors[['dr', 'dg', 'db']]).astype(int).reshape(-1, 1, 3))
res = res.reshape(-1, 3)
res = pd.DataFrame(res)
res.columns = ['h', 's', 'v']
res.shape


Possible precision loss when converting from int64 to float64



(3441, 3)

In [266]:
res.isnull().sum()

h    0
s    0
v    0
dtype: int64

In [304]:
type_colors['hsv'] = [tuple(skcolor.rgb2hsv(np.array((r,g,b)).reshape(1,1,3)).reshape(1,3)[0]) for (r,g,b)
                             in zip(type_colors['dr'], type_colors['dg'], type_colors['db'])]

In [305]:
type_colors.head()

Unnamed: 0,gen_class,str_color,counts,prop_freq,prop_freq2,dr,dg,db,hsv
0,BEERS,"rgb(127, 127, 127)",2086,0.034377,46.546202,127.5,127.5,127.5,"(0.0, 0.0, 127.5)"
7934,BRANDY,"rgb(127, 127, 127)",28,0.027972,36.428571,127.5,127.5,127.5,"(0.0, 0.0, 127.5)"
15868,GIN,"rgb(127, 127, 127)",102,0.044252,57.8,127.5,127.5,127.5,"(0.0, 0.0, 127.5)"
23802,RUM,"rgb(127, 127, 127)",92,0.031978,43.045872,127.5,127.5,127.5,"(0.0, 0.0, 127.5)"
31736,SPIRITS_MISC,"rgb(127, 127, 127)",188,0.028637,47.0,127.5,127.5,127.5,"(0.0, 0.0, 127.5)"


In [307]:
type_colors.sort_values('hsv', inplace=True)

#### Stacked bar representation (take 96)

In [312]:
cgroup = type_colors.groupby('str_color')

bars = []
for group in type_colors['str_color'].unique():
    bars.append( dict(
        name = group,
        type = "bar",    
        x = cgroup.get_group(group)['gen_class'], 
        y = cgroup.get_group(group)['prop_freq'], 
        hoverinfo = 'none',
        marker = dict(color=cgroup.get_group(group)['str_color'] )
    )
           )

layout = dict(
    title = 'Proportional Color',
    barmode='stack',
    showlegend=False,
    xaxis = dict(title='Beverage Class'),
    yaxis = dict(title='Proportion')
    )

# clean version, no labels
#layout = dict(
#            barmode='stack',
#            xaxis = dict(visible = False),
#            yaxis = dict(visible = False),
#             showlegend=False
#             )

fig = dict( data=bars, layout=layout )
plotly.offline.iplot( fig, filename='TypeCounts.html' )

In [310]:
plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')

'<div id="bc3ca802-83e7-4bdf-8dfd-e953830715f2" style="height: 100%; width: 100%;" class="plotly-graph-div"></div><script type="text/javascript">window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL="https://plot.ly";Plotly.newPlot("bc3ca802-83e7-4bdf-8dfd-e953830715f2", [{"name": "rgb(14, 14, 14)", "type": "bar", "x": ["BRANDY", "WINE", "WHISKY", "VODKA", "SPIRITS_MISC", "RUM", "BEERS", "GIN"], "y": [0.06593406593406594, 0.057583888329899545, 0.04890219560878244, 0.04331954246775371, 0.0654988575780655, 0.04240528328119569, 0.052406064601186554, 0.04251626898047722], "hoverinfo": "none", "marker": {"color": ["rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)", "rgb(14, 14, 14)"]}}, {"name": "rgb(42, 42, 42)", "type": "bar", "x": ["VODKA", "WHISKY", "WINE", "RUM", "SPIRITS_MISC", "GIN", "BRANDY", "BEERS"], "y": [0.05512290094913604, 0.04302506098913284, 0.04592496765847348, 0.04518595759471672, 