# Imports

In [1]:
import datetime
import pymongo
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import numpy as np
from collections import defaultdict

from skimage import color as skcolor

import plotly
import plotly.plotly as py
plotly.offline.init_notebook_mode(connected=True)

In [2]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
sess = None

def reset_vars():
    sess.run(tf.global_variables_initializer())

def reset_tf():
    global sess
    if sess:
        sess.close()
    tf.reset_default_graph()
    sess = tf.Session()

# Connect to Data

##### Connect to databases

In [4]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # collection for form data
TTB_COLORS = db.COLORS # collection for the label image data
TTB_IMG_META = db.IMG_META
TTB_IMG_SUP = db.IMG_SUP

##### Load into pandas

In [5]:
df = pd.DataFrame(list(TTB.find()))
df_colors = pd.DataFrame(list(TTB_COLORS.find()))
df_img_meta = pd.DataFrame(list(TTB_IMG_META.find()))
df_img_sup = pd.DataFrame(list(TTB_IMG_SUP.find()))

## Clean Data

##### Drop useless `_id` columns

In [6]:
df_colors = df_colors.drop(['_id'], axis=1)
df_img_meta = df_img_meta.drop(['_id'], axis=1)
df_img_sup = df_img_sup.drop(['_id'], axis=1)

##### Drop duplicates, unclear why present

In [7]:
print('df_color duplicates:  {}'.format(df_colors.shape[0] - df_colors.drop_duplicates().shape[0]))
print('df_img_meta duplicates:  {}'.format(df_img_meta.shape[0] - df_img_meta.drop_duplicates().shape[0]))
print('df_img_sup duplicates:  {}'.format(df_img_sup.shape[0] - df_img_meta.drop_duplicates().shape[0]))

df_colors = df_colors.drop_duplicates()
df_img_meta = df_img_meta.drop_duplicates()
df_img_sup = df_img_sup.drop_duplicates()

df_color duplicates:  391
df_img_meta duplicates:  119
df_img_sup duplicates:  119


##### Clean up bad values in color

In [8]:
df_colors[['r', 'g', 'b']] = df_colors[['r', 'g', 'b']].clip(lower=0.0, upper=255.0)

##### Convert recieve date to datetime

In [9]:
df['recieve_date'] = pd.to_datetime(df['recieve_date'], format='%m/%d/%Y')

##### Helper lookup for US states

In [10]:
# get list of all US states, convert to uppercase as that is what is used
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
states = [state.upper() for state in states]

us_state_abbrev = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}

# capitalized versions
abbrev_lookup=defaultdict(str)
for k, v in us_state_abbrev.items():
    abbrev_lookup[k.upper()] = v


##### Helper lookup for broad alcohol classes

In [11]:
SPIRITS_MISC = ["CORDIALS (FRUIT & PEELS)", "FRUIT FLAVORED LIQUEURS", "CURACAO", "TRIPLE SEC", "FRUITS & PEELS SCHNAPPS LIQUEUR", "OTHER FRUITS & PEELS LIQUEURS", "CORDIALS (HERBS & SEEDS)", "ANISETTE, OUZO, OJEN", "COFFEE (CAFE) LIQUEUR", "KUMMEL", "PEPPERMINT SCHNAPPS", "HERBS AND SEEDS SCHNAPPS LIQUEUR", "AMARETTO", "SAMBUCA", "ARACK/RAKI/ARAK", "OTHER (HERBS & SEEDS)", "CORDIALS (CREMES OR CREAMS)", "CREME DE CACAO WHITE", "CREME DE CACAO BROWN", "CREME DE MENTHE WHITE", "CREME DE MENTHE GREEN", "CREME DE ALMOND (NOYAUX)", "DAIRY CREAM LIQUEUR/CORDIAL", "NON DAIRY CREME LIQUEUR/CORDIAL", "OTHER LIQUEUR (CREME OR CREAMS)", "SPECIALTIES & PROPRIETARIES", "OTHER SPECIALTIES & PROPRIETARIES", "CORDIALS (FRUIT & PEELS)", "FRUIT FLAVORED LIQUEURS", "CURACAO", "TRIPLE SEC", "FRUITS & PEELS SCHNAPPS LIQUEUR", "OTHER FRUIT & PEELS LIQUEURS", "CORDIALS (HERBS & SEEDS)", "ANISETTE, OUZO, OJEN", "COFFEE (CAFE) LIQUEUR", "KUMMEL", "PEPPERMINT SCHNAPPS", "HERBS & SEEDS SCHNAPPS LIQUEUR", "AMARETTO", "SAMBUCA", "ARACK/RAKI/ARAK", "OTHER HERB & SEED CORDIALS/LIQUEURS", "CORDIALS (CREMES OR CREAMS)", "CREME DE CACAO WHITE", "CREME DE CACAO BROWN", "CREME DE MENTHE WHITE", "CREME DE MENTHE GREEN", "CREME DE ALMOND (NOYAUX)a", "DAIRY CREAM LIQUEUR/CO", "NON DAIRY CREME LIQUEUR/CORDIAL", "OTHER LIQUEUR (CREMES OR CREAMS)", "SPECIALITIES & PROPRIETARIES", "OTHER SPECIALTIES & PROPRIETARIES", "COCKTAILS 48 PROOF UP", "MARGARITA (48 PROOF UP)", "COCKTAILS 48 PROOF UP (CONT)", "DAIQUIRI (48 PROOF UP)", "COLADA (48PROOF UP)", "OTHER COCTAILS (48PROOF UP)", "COCKTAILS UNDER 48 PROOF", "MARGARITA (UNDER 48 PROOF)", "OTHER TEQUILA-BASED COCKTAILS (UNDER 48 PROOF)", "COCKTAILS UNDER 48 PROOF (CONT)", "DAIQUIRI (UNDER 48 PROOF)", "COLADA (UNDER 48 PROOF)", "OTHER COCKTAILS (UNDER 48 PROOF)", "MIXED DRINKS-HI BALLS COCKTAILS", "SCREW DRIVER", "COLLINS", "BLOODY MARY", "EGG NOG", "OTHER MIXED DRINKS HI-BALLS COCKTAILS", "COCKTAILS 48 PROOF UP", "MARGARITA 48 PROOF UP", "DAIQUIRI 48 PROOF UP", "COLADA (48 PROOF UP )", "OTHER COCKTAILS (48 PROOF UP)", "COCKTAILS UNDER 48 PROOF", "MARGARITA UNDER 48 PROOF", "OTHER TEQUILA-BASED COCKTAILS (UNDER 48 PROOF)", "COCKTAILS UNDER 48 PR(CONT)", "DAIQUIRI UNDER 48 PROOF", "COLADA (UNDER 48 PROOF )", "OTHER COCKTAILS (UNDER 48 PROOF)", "MIXED DRINKS-HI BALLS COCKTAILS", "SCREW DRIVER", "COLLINS", "BLOODY MARY", "EGG NOG", "OTHER MIXED DRINKS HI-BALLS COCKTAILS", "86	VERMOUTH/MIXED TYPES", "OTHER SPIRITS", "NEUTRAL SPIRITS - GRAIN", "NEUTRAL SPIRITS - FRUIT", "NEUTRAL SPIRITS - CANE", "NEUTRAL SPIRITS - VEGETABLE", "NEUTRAL SPIRITS - PETROLEUM", "BITTERS - BEVERAGE", "GRAIN SPIRITS", "OTHER SPIRITS", "SAKE", "SAKE - DOMESTIC FLAVORED", "NON ALCOHOLIC MIXES", "OTHER SPIRITS", "NEUTRAL SPIRITS - GRAIN", "NEUTRAL SPIRITS - FRUIT", "NEUTRAL SPIRITS - CANE", "NEUTRAL SPIRITS - VEGETABLE", "NEUTRAL SPIRITS - PETROLEUM", "BITTERS - BEVERAGE*", "TEQUILA FB", "TEQUILA USB", "MEZCAL", "MEZCAL FB", "SAKE - IMPORTED", "MEZCAL US", "SAKE - IMPORTED FLAVORED", "DILUTED TEQUILA FB", "DILUTED TEQUILA USB", "NON ALCOHOL MIXES", "VERMOUTH/MIXED TYPES"]
BEER = ["BEER", "CEREAL BEVERAGES - NEAR BEER (NON ALCOHOLIC)", "OTHER MALT BEVERAGES (BEER)", "BEER", "CEREAL BEVERAGES - NEAR BEER (NON ALCOHOLIC)", "MALT BEVERAGES", "ALE", "MALT LIQUOR", "STOUT", "PORTER", "MALT BEVERAGES SPECIALITIES - FLAVORED", "MALT BEVERAGES SPECIALITIES", "OTHER MALT BEVERAGES"]
WINE = ["TABLE RED WINE", "ROSE WINE", "TABLE WHITE WINE", "TABLE FLAVORED WINE", "TABLE FRUIT WINE", "SPARKLING WINE/CHAMPAGNE", "CARBONATED WINE", "DESSERT FLAVORED WINE", "DESSERT /PORT/SHERRY/(COOKING) WINE", "DESSERT FRUIT WINE"]
BRANDY = ["CALIFORNIA GRAPE BRANDY", "CALIFORNIA BRANDY", "CALIFORNIA DRIED BRANDY", "CALIFORNIA LEES BRANDY", "CALIFORNIA POMACE OR MARC BRANDY", "CALIFORNIA RESIDUE BRANDY", "CALIFORNIA NEUTRAL BRANDY", "OTHER CALIFORNIA BRANDY", "NEW YORK GRAPE BRANDY", "NEW YORK BRANDY", "NEW YORK DRIED BRANDY", "NEW YORK LEES BRANDY", "NEW YORK POMACE OR MARC BRANDY", "NEW YORK RESIDUE BRANDY", "NEW YORK NEUTRAL BRANDY", "OTHER NEW YORK BRANDY", "OTHER DOMESTIC GRAPE BRANDY", "BRANDY", "DRIED BRANDY", "LEES BRANDY", "POMACE OR MARC BRANDY", "RESIDUE BRANDY", "NEUTRAL BRANDY", "IMMATURE BRANDY", "OTHER BRANDY", "BRANDY - FLAVORED", "BRANDY - APRICOT FLAVORED", "BRANDY - BLACKBERRY FLAVORED", "BRANDY - PEACH FLAVORED", "BRANDY - CHERRY FLAVORED", "BRANDY - COFFEE FLAVORED", "BRANDY APPLE FLAVORED", "OTHER BRANDY - FLAVORED", "FRUIT BRANDY", "APPLE BRANDY", "CHERRY BRANDY", "PLUM BRANDY", "BLACKBERRY BRANDY", "BLENDED APPLE JACK BRANDY", "PEAR BRANDY", "APRICOT BRANDY", "DILUTED BRANDY", "OTHER FRUIT BRANDY", "FRENCH BRANDY", "COGNAC (BRANDY) FB", "COGNAC (BRANDY) USB", "ARMAGNAC (BRANDY) FB", "ARMAGNAC (BRANDY) USB", "OTHER FRENCH BRANDY FB", "OTHER FRENCH BRANDY USB", "OTHER FOREIGN BRANDY", "ITALIAN GRAPE BRANDY FB", "ITALIAN GRAPE BRANDY USB", "SPANISH GRAPE BRANDY FB", "SPANISH GRAPE BRANDY USB", "PORTUGUESE GRAPE BRANDY FB", "PORTUGUESE GRAPE BRANDY USB", "GREEK GRAPE BRANDY FB", "GREEK GRAPE BRANDY USB", "GERMAN GRAPE BRANDY FB", "GERMAN GRAPE BRANDY USB", "AUSTRALIAN GRAPE BRANDY FB", "AUSTRALIAN GRAPE BRANDY USB", "SOUTH AFRICAN GRAPE BRANDY FB", "SOUTH AFRICAN GRAPE BRANDY USB", "OTHER FOREIGN BRANDY (CONT.)", "BRANDY APRICOT FLAVORED", "BRANDY BLACKBERRY FLAVORED", "BRANDY PEACH FLAVORED", "BRANDY CHERRY FLAVORED", "BRANDY COFFEE FLAVORED", "BRANDY APPLE FLAVORED", "OTHER GRAPE BRANDY (PISCO, GRAPPA) FB", "OTHER GRAPE BRANDY (GRAPPA) USB", "FOREIGN FRUIT BRANDY", "APPLE BRANDY (CALVADOS)", "CHERRY BRANDY", "PLUM BRANDY (SLIVOVITZ)", "BLACKBERRY BRANDY", "BLENDED APPLE JACK BRANDY", "APRICOT BRANDY", "DILUTED BRANDY FB", "DILUTED BRANDY USB", "OTHER FRUIT BRANDY", "BLACKBERRY FLAVORED BRANDY", "LIQUEUR & BRANDY", "FLAVORED BRANDY", "APRICOT FLAVORED BRANDY", "BLACKBERRY FLAVORED BRANDY", "PEACH FLAVORED BRANDY", "CHERRY FLAVORED BRANDY", "LIQUEUR & BRANDY", "OTHER FLAVORED BRANDY", "BRANDY STINGER (48 PROOF UP)", "BRANDY SIDE CAR (48 PROOF UP)", "BRANDY STINGER (UNDER 48 PROOF)", "BRANDY SIDE CAR (UNDER 48 PROOF)", "BRANDY STINGER 48 PROOF UP", "BRANDY SIDE CAR 48 PROOF UP", "BRANDY STINGER UNDER 48 PROOF", "BRANDY SIDE CAR UNDER 48 PROOF", "BRANDY PEAR FLAVORED"]
GIN = ["DISTILLED GIN", "LONDON DRY DISTILLED GIN", "OTHER DISTILLED GIN", "GIN", "LONDON DRY GIN", "OTHER GIN", "GIN - FLAVORED", "GIN - MINT FLAVORED", "GIN - ORANGE FLAVORED", "GIN - LEMON FLAVORED", "OTHER GIN - FLAVORED", "DILUTED GIN", "DISTILLED GIN", "LONDON DRY DISTILLED GIN FB", "LONDON DRY DISTILLED GIN USB", "OTHER DISTILLED GIN FB", "OTHER DISTILLED GIN USB", "GIN", "LONDON DRY GIN FB", "LONDON DRY GIN USB", "OTHER GIN FB", "OTHER GIN USB", "GIN - FLAVORED", "GIN - MINT FLAVORED", "GIN - ORANGE FLAVORED", "GIN - LEMON FLAVORED", "GIN - CHERRY FLAVORED", "GIN - APPLE FLAVORED", "GIN - BLACKBERRY FLAVORED", "GIN - PEACH FLAVORED", "GIN - GRAPE FLAVORED", "OTHER GIN - FLAVORED", "OTHER GIN", "DILUTED GIN FB", "DILUTED GIN USB", "BRANDY - GINGER FLAVORED", "BRANDY GINGER FLAVORED", "SLOE GIN", "GIN SPECIALTIES", "LIQUEURS (GIN)", "SLOE GIN", "GINGER FLAVORED BRANDY", "GIN SPECIALTIES", "LIQUEURS (GIN)", "GIN MARTINI (48 PROOF UP)", "GIN SOUR (48 PROOF UP)", "GIN MARTINI (UNDER 48 PROOF)", "GIN SOUR (UNDER 48 PROOF)", "GIN MARTINI 48 PROOF UP", "GIN SOUR 48 PROOF UP", "GIN MARTINI UNDER 48 PROOF", "GIN SOUR UNDER 48 PROOF"]
WHISKY = ["STRAIGHT WHISKY", "STRAIGHT BOURBON WHISKY", "STRAIGHT RYE WHISKY", "STRAIGHT CORN WHISKY", "OTHER STRAIGHT WHISKY", "WHISKY BOTTLED IN BOND (BIB)", "BOURBON WHISKY BIB", "RYE WHISKY BIB", "CORN WHISKY BIB", "OTHER WHISKY BIB", "STRAIGHT WHISKY BLENDS", "STRAIGHT BOURBON WHISKY BLENDS", "STRAIGHT RYE WHISKY BLENDS", "STRAIGHT CORN WHISKY BLENDS", "OTHER STRAIGHT BLENDED WHISKY", "WHISKY BLENDS", "BLENDED BOURBON WHISKY", "BLENDED RYE WHISKY", "BLENDED CORN WHISKY", "BLENDED LIGHT WHISKY", "BLENDED WHISKY", "DILUTED BLENDED WHISKY", "OTHER WHISKY BLENDS", "WHISKY", "BOURBON WHISKY", "RYE WHISKY", "CORN WHISKY", "LIGHT WHISKY", "WHISKY PROPRIETARY", "SPIRIT WHISKY", "DILUTED WHISKY", "OTHER WHISKY (FLAVORED)", "SCOTCH WHISKY", "SCOTCH WHISKY FB", "SCOTCH WHISKY USB", "SINGLE MALT SCOTCH WHISKY", "UNBLENDED SCOTCH WHISKY USB", "DILUTED SCOTCH WHISKY FB", "DILUTED SCOTCH WHISKY USB", "CANADIAN WHISKY", "CANADIAN WHISKY FB", "CANADIAN WHISKY USB", "DILUTED CANADIAN WHISKY FB", "DILUTED CANADIAN WHISKY USB", "IRISH WHISKY", "IRISH WHISKY FB", "IRISH WHISKY USB", "DILUTED IRISH WHISKY FB", "DILUTED IRISH WHISKY USB", "WHISKY ORANGE FLAVORED", "WHISKY GRAPE FLAVORED", "WHISKY LIME FLAVORED", "WHISKY LEMON FLAVORED", "WHISKY CHERRY FLAVORED", "WHISKY CHOCOLATE FLAVORED", "WHISKY MINT FLAVORED", "WHISKY PEPPERMINT FLAVORED", "WHISKY PTHER FLAVORED", "OTHER IMPORTED WHISKY", "OTHER IMPORTED WHISKY FB", "OTHER IMPORTED WHISKY USB", "DILUTED OTHER IMPORTED WHISKY FB", "DILUTED OTHER IMPORTED WHISKY USB", "WHISKY SPECIALTIES", "LIQUEURS (WHISKY)", "WHISKY SPECIALTIES", "LIQUEURS (WHISKY)", "WHISKY MANHATTAN (48 PROOF UP)", "WHISKY OLD FASHIONED (48 PROOF UP)", "WHISKY SOUR (48 PROOF UP )", "WHISKY MANHATTAN (UNDER 48 PROOF)", "WHISKY OLD FASHIONED (UNDER 48 PROOF)", "WHISKY SOUR (UNDER 48 PROOF)", "WHISKY MANHATTAN (48 PROOF UP)", "WHISKY OLD FASHIONED (48 PROOF UP)", "WHISKY SOUR (48 PROOF UP)", "WHISKY MANHATTAN UNDER 48 PROOF", "WHISKY OLD FASHIONED UNDER 48 PROOF", "WHISKY SOUR UNDER 48 PROOF"]
RUM = ["RUM LEMON FLAVORED", "RUM CHERRY FLAVORED", "RUM CHOCOLATE FLAVORED", "RUM MINT FLAVORED", "RUM PEPPERMINT FLAVORED", "UR.S. RUM (WHITE)", "PUERTO RICAN RUM (WHITE)", "VIRGIN ISLANDS RUM (WHITE)", "HAWAIIAN RUM (WHITE)", "FLORIDA RUM (WHITE)", "OTHER RUM (WHITE)", "U.S. RUM (GOLD)", "PUERTO RICAN RUM (GOLD)", "VIRGIN ISLAND RUM (GOLD)", "HAWAIIAN RUM (GOLD)", "FLORIDA RUM (GOLD)", "OTHER RUM (GOLD)", "RUM FLAVORED (BOLD)", "RUM ORANGE GLAVORED", "RUM GRAPE FLAVORED", "RUM LIME FLAVORED", "RUM LEMON FLAVORED", "RUM CHERRY FLAVORED", "RUM CHOCOLATE FLAVORED", "RUM MINT FLAVORED", "RUM PEPPERMINT FLAVORED", "RUM OTHER FLAVORED", "OTHER WHITE RUM", "FLAVORED RUM (BOLD)", "RUM ORANGE FLAVORED", "RUM GRAPE FLAVORED", "RUM LIME FLAVORED", "RUM OTHER FLAVORED", "DILUTED RUM (WHITE)", "DILUTED RUM (GOLD)", "DOMESTIC FLAVORED RUM", "FOREIGN RUM", "CUBAN RUM WHITE FB", "CUBAN RUM WHITE USB", "CUBAN RUM GOLD FB", "CUBAN RUM GOLD USB", "OTHER FORIEGN RUM", "JAMAICAN RUM WHITE FB", "JAMAICAN RUM WHITE USB", "JAMAICAN RUM GOLD FB", "JAMICAN RUM GOLD USB", "GUIANAN RUM WHITE FB", "GUIANAN RUM WHITE USB", "DUTCH GUIANAN RUM GOLD FB", "DUTCH GUIANAN RUM GOLD USB", "FRENCH GUIANAN RUM FB", "FRENCH GUIANAN RUM USB", "MARTINICAN RUM WHITE FB", "MARTINICAN RUM WHITE USB", "MARTINICAN RUM GOLD FB", "MARTINICAN RUM GOLD USB", "OTHER FOREIGN RUM", "OTHER RUM WHITE FB", "OTHER RUM WHITE USB", "OTHER RUM GOLD FB", "OTHER RUM GOLD USB", "DILUTED RUM WHITE FB", "DILUTED RUM WHITE USB", "DILUTED RUM GOLD FB", "DILUTED RUM GOLD USB", "IMPORTED FLAVORED RUM", "ROCK & RYE, RUM & BRANDY (ETC.)", "RUM SPECIALTIES", "LIQUEURS (RUM)", "ROCK & RYE, RUM & BRANDY (ETC)", "RUM SPECIALTIES", "LIQUEURS (RUM)"]
VODKA = ["VODKA", "VODKA 80-90 PROOF", "VODKA", "VODKA 90-99 PROOF", "VODKA", "VODKA 100 PROOF UP", "VODKA - FLAVORED", "VODKA - ORANGE FLAVORED", "VODKA - GRAPE FLAVORED", "VODKA - LIME FLAVORED", "VODKA - LEMON FLAVORED", "VODKA - CHERRY FLAVORED", "VODKA - CHOCOLATE FLAVORED", "VODKA - MINT FLAVORED", "VODKA - PEPPERMINT FLAVORED", "VODKA - OTHER FLAVORED", "OTHER VODKA", "DILUTED VODKA", "VODKA", "VODKA 80-90 PROOF FB", "VODKA 80-90 PROOF USB", "VODKA", "VODKA 90-99 PROOF FB", "VODKA 90-99 PROOF USB", "VODKA", "VODKA 100 PROOF UP FB", "VODKA 100 PROOF UP USB", "VODKA - FLAVORED", "VODKA - ORANGE FLAVORED", "VODKA - GRAPE FLAVORED", "VODKA - LIME FLAVORED", "VODKA - LEMON FLAVORED", "VODKA - CHERRY FLAVORED", "VODKA - CHOCOLATE FLAVORED", "VODKA - MINT FLAVORED", "VODKA - PEPPERMINT FLAVORED", "VODKA - OTHER FLAVORED", "OTHER VODKA", "DILUTED VODKA FB", "DILUTED VODKA USB", "VODKA SPECIALTIES", "LIQUEURS (VODKA)", "VODKA SPECIALTIES", "LIQUEURS (VODKA)", "VODKA MARTINI (48 PROOF UP)", "VODKA MARTINI (UNDER 48 PROOF)", "VODKA MARTINI 48 PROOF UP", "VODKA MARTINI UNDER 48 PROOF", "VODKA SOUR UNDER 48 PROOF"]

##### Select US states only from the main df

In [12]:
us_only = df.loc[df['OriginCode'].isin(states)]
us_only = us_only.loc[df['Status'] == 'APPROVED']  # approved only

mask = ('01/01/2016' <= us_only['recieve_date']) & (us_only['recieve_date'] < '01/01/2017')  #2016 only
us_only = us_only.loc[mask]
us_only['_id'].count()

38143

#### Add general category

We can group more broadly using the following categories

In [13]:
us_only['gen_class'] = us_only['Class/TypeCode']
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(SPIRITS_MISC)] = 'SPIRITS_MISC'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(BEER)] = 'BEER'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(WINE)] = 'WINE'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(BRANDY)] = 'BRANDY'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(GIN)] = 'GIN'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(WHISKY)] = 'WHISKY'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(RUM)] = 'RUM'
us_only['gen_class'].loc[us_only['Class/TypeCode'].isin(VODKA)] = 'VODKA'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



# Toward one-hot encoding

### Merge the color space data with the requisite application data

In [14]:
color_dat_raw = pd.merge(us_only[['TTBID', 'Class/TypeCode', 'gen_class']], df_colors, left_on='TTBID', right_on='TTBID', how='inner')

### Discretize color space

Calculate our bins and the labels we want associated with the bins

In [15]:
bins = np.linspace(0, 255, 10)
labels = (bins[1:] + bins[:-1]) / 2

Apply a `binned` categorical label to each of the rgb values

In [16]:
color_dat_raw[['dr', 'dg', 'db']] = color_dat_raw[['r', 'g', 'b']].apply(lambda x: 
                                                                       pd.cut(x, bins, labels=labels, right=True, 
                                                                              include_lowest=True))

Create tuple strings, so we can more easily count the groupings of colors

In [17]:
color_dat_raw['str_color'] = ['rgb({}, {}, {})'.format(int(r),int(g),int(b)) for (r,g,b)
                             in zip(color_dat_raw['dr'], color_dat_raw['dg'], color_dat_raw['db'])]

In [18]:
color_dat_raw.head()

Unnamed: 0,TTBID,Class/TypeCode,gen_class,b,g,img_num,percentage,r,dr,dg,db,str_color
0,16001001000009,MALT BEVERAGES SPECIALITIES - FLAVORED,BEER,57.854447,56.692722,0,0.5,60.746631,70.833333,70.833333,70.833333,"rgb(70, 70, 70)"
1,16001001000009,MALT BEVERAGES SPECIALITIES - FLAVORED,BEER,239.106518,238.742448,0,0.5,239.810811,240.833333,240.833333,240.833333,"rgb(240, 240, 240)"
2,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,166.517781,174.014225,0,0.5,179.84495,184.166667,184.166667,155.833333,"rgb(184, 184, 155)"
3,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,48.377104,50.838384,0,0.5,53.707071,42.5,42.5,42.5,"rgb(42, 42, 42)"
4,16001001000018,OTHER WHISKY (FLAVORED),WHISKY,232.319809,234.338902,1,0.25,236.873508,240.833333,240.833333,240.833333,"rgb(240, 240, 240)"


Set gen_class to be a category instead of object

Note, you can get _int_ codes for a categorical variable like so: `color_dat_raw['gen_class'].cat.codes`

In [19]:
color_dat_raw['gen_class'] = color_dat_raw['gen_class'].astype('category')  #convert to category

## One hot encoding

##### Unregularized

In [20]:
color_dat_raw.shape

(215890, 12)

In [21]:
one_hot_color = pd.get_dummies(color_dat_raw['str_color'])
one_hot_color.shape

(215890, 715)

In [22]:
one_hot_labels = pd.get_dummies(color_dat_raw['gen_class'])
one_hot_labels.shape

(215890, 8)

##### Mean normalized

Find the number of occurance of each color for a given super class and the proportion

In [23]:
type_colors = pd.DataFrame(color_dat_raw.groupby(['gen_class', 'str_color']).count()['TTBID'])
type_colors.columns = ['counts']
type_colors['prop_freq'] = type_colors['counts']/type_colors['counts'].groupby('gen_class').sum()
type_colors['prop_freq2'] = type_colors['prop_freq']/type_colors['prop_freq'].groupby('gen_class').max() * 255
type_colors.reset_index(inplace=True)

Find the mean proportional frequency for each color

In [24]:
res = pd.DataFrame(type_colors.groupby('str_color').mean()['prop_freq'])
res.reset_index(inplace=True)
res.rename(columns={'prop_freq': 'color_mean_freq'}, inplace=True) 

Merge in our new data then normalize

In [25]:
color_dat_agg = pd.merge(type_colors, res, left_on='str_color', right_on='str_color', how='inner')

In [26]:
color_dat_agg['prop_mean_norm'] = color_dat_agg['prop_freq'].fillna(0) - color_dat_agg['color_mean_freq']

In [27]:
color_dat_agg.shape

(5720, 7)

Assign to old values for convenience

In [30]:
label_encoder = LabelEncoder()
labels_enc = label_encoder.fit_transform(color_dat_agg['gen_class'])

In [44]:
#one_hot_color = np.array(pd.get_dummies(color_dat_agg['prop_mean_norm'].round(4)))  
one_hot_color = np.array(color_dat_agg['prop_mean_norm']).reshape(-1, 1)
one_hot_labels = np.array(labels_enc).reshape(-1,1)

In [45]:
one_hot_color.shape

(5720, 126)

### Train and test split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(np.array(one_hot_color), np.array(one_hot_labels), 
                                                    test_size=0.33, random_state=42)

## Building our TensorFlow model

##### Parameters of our model

In [47]:
N_COLORS = one_hot_color.shape[1]
N_CLASSES = one_hot_labels.shape[1]
BATCH_SIZE = 100
LEARNING_RATE = 0.5

hidden_size = 64

##### Setting up the input for our model

In [48]:
reset_tf()  # reset our tf session

# input to our tensor flow model
x = tf.placeholder(tf.float32, [None, N_COLORS], name="colors")
y_label = tf.placeholder(tf.float32, [None, N_CLASSES], name="labels")
training = tf.placeholder(tf.bool, name="training")

# set up hidden layer
hidden = tf.layers.dense(x, hidden_size, activation=tf.nn.sigmoid, use_bias=True,
    kernel_initializer=tf.truncated_normal_initializer(stddev=N_COLORS**-0.5))

# set up output layer
y = tf.layers.dense(hidden, N_CLASSES, activation=None, use_bias=True,
    kernel_initializer=tf.truncated_normal_initializer(stddev=hidden_size**-0.5))

## Train the model

In [49]:
def train_network(x, y_label, loss, accuracy, train, training, steps_total, steps_print):
    metrics = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': []
    }
    
    reset_vars()
    for i in range(steps_total):
        #batch_x, batch_y = mnist.train.next_batch(BATCH_SIZE)
        
        j = np.random.choice(len(y_train), BATCH_SIZE, replace=False)
        batch_x = X_train[j,:]
        batch_y = y_train[j,:].reshape(-1,N_CLASSES)
                
        sess.run(train,
                 feed_dict={x: batch_x, y_label: batch_y, training: True})
        
        if i % steps_print == 0 or i == steps_total - 1:
            l, a = sess.run([loss, accuracy],
                            feed_dict={x: X_test,
                                       y_label: y_test,
                                       training: False})
            metrics['test_loss'].append(l)
            metrics['test_acc'].append(a)
            print( "Test:  %0.5f, %0.5f" % (l, a))
            l, a = sess.run([loss, accuracy],
                            feed_dict={x: X_train, 
                                       y_label: y_train,
                                       training: False})
            metrics['train_loss'].append(l)
            metrics['train_acc'].append(a)
            print( "Train: %0.5f, %0.5f" % (l, a))
    return metrics

In [50]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_label))

#train = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)  # gradient descent optimizer
train = tf.train.AdamOptimizer().minimize(loss)  # adam optimizer

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(y_label, 1)), tf.float32))

total_steps = 1000
steps_print = 100
train_network(x, y_label, loss, accuracy, train, training, total_steps, steps_print);

Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000
Test:  0.00000, 1.00000
Train: 0.00000, 1.00000


# Visualize with tensorboard

In [None]:
g = tf.get_default_graph()  # get default graph

In [None]:
writer = tf.summary.FileWriter('./tb/intro_graph', g)  #write the graph

Run with:

```
tensorboard --logdir=tb/intro_graph
```

In [None]:
writer.close()

# Treat as NLP Problem with TF-IDF

Encode TTBID's as docs and colors as words

In [29]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

Encode the color as a single number

In [None]:
color_label_enc = LabelEncoder()
color_dat_raw['str_color_enc'] = color_label_enc.fit_transform(color_dat_raw['str_color'])

In [None]:
ttbs = color_dat_raw.groupby(['gen_class', 'TTBID'])

words = [None]*color_dat_raw.shape[0]
labels = [None]*color_dat_raw.shape[0]

for i, (name, group) in enumerate(ttbs):
    #print(name)
    #words[i] = [rgb for rgb in group['str_color']]  # old way
    tmp = [str(ccode) for ccode in group['str_color_enc']]
    words[i] = ' '.join(tmp)
    labels[i] = name[0]


For some reason I need to remove Nones...

In [None]:
clabels = [x for x in labels if x != None]
cwords = [x for x in words if x != None]

In [None]:
cwords[:10]

In [None]:
label_encoder = LabelEncoder()
labels_enc = label_encoder.fit_transform(clabels)

In [None]:
tf_idf = TfidfVectorizer(min_df=1e-9, max_df=0.9, lowercase=False)
cwords_tf_idf = tf_idf.fit_transform(cwords)

In [None]:
tfidf_pipeline = Pipeline([('TfidfVectorizer', TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=1.0)),
                           ('Ridge', Ridge())])

param_grid = {'Ridge__alpha': [0.1, 1.0, 10.0, 100, 500],
              'TfidfVectorizer__min_df': [1e-2, 1e-5, 1e-7, 1e-9],
              'TfidfVectorizer__max_df': [1.0, 0.9, 0.8, 0.75]}

grid = GridSearchCV(tfidf_pipeline, param_grid, n_jobs=4, verbose=3)

est = grid.fit(cwords, labels_enc)

In [None]:
est.best_params_

In [None]:
est.best_score_