# Imports

In [51]:
import datetime
import pymongo
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

from collections import defaultdict

import plotly
import plotly.plotly as py
plotly.offline.init_notebook_mode(connected=True)

# Connect to database

#### Connect to the database

In [100]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # collection for form data
TTB_labels = db.LabelImages # collection for the label image data

#### Load into pandas

In [103]:
df = pd.DataFrame(list(TTB.find()))
df_labels = pd.DataFrame(list(TTB_labels.find()))

#### Clean up data types

In [5]:
df['TTBID'] = df['TTBID'].apply(pd.to_numeric)

#### Helper list to select only domestic

In [54]:
# get list of all US states, convert to uppercase as that is what is used
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
states = [state.upper() for state in states]

us_state_abbrev = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}

# capitalized versions
abbrev_lookup=defaultdict(str)
for k, v in us_state_abbrev.items():
    abbrev_lookup[k.upper()] = v


# Exploration

#### Select domestic only that are approved

In [23]:
us_only = df.loc[df['OriginCode'].isin(states)]
us_only = us_only.loc[df['Status'] == 'APPROVED']
us_only['_id'].count()

38143

# Where are our breweries?

In [25]:
by_state = us_only.groupby('OriginCode')

In [60]:
state_counts = pd.DataFrame(by_state['TTBID'].count())
state_counts['state_abbrev'] = [abbrev_lookup[ind] for ind in state_counts.index]

In [61]:
state_counts

Unnamed: 0_level_0,TTBID,state_abbrev
OriginCode,Unnamed: 1_level_1,Unnamed: 2_level_1
ALABAMA,156,AL
ALASKA,18,AK
ARIZONA,317,AZ
ARKANSAS,38,AR
CALIFORNIA,13746,CA
COLORADO,990,CO
CONNECTICUT,357,CT
DELAWARE,194,DE
FLORIDA,840,FL
GEORGIA,565,GA


In [47]:
state_counts.TTBID.values

array([  156,    18,   317,    38, 13746,   990,   357,   194,   840,
         565,    56,   184,   942,   489,   252,   157,   688,   130,
         209,   425,   535,  1101,   452,    71,  1032,   120,    94,
         101,   252,   483,    98,  1969,   828,    80,   718,   210,
        1905,  1730,    69,   330,    44,   474,   981,   127,   194,
         776,  1824,    35,   736,    21])

In [63]:
plot_data = [ dict(
                type='choropleth',
                autocolorscale = True,
                locations = state_counts['state_abbrev'],
        z = state_counts.TTBID.values.astype(float),
        locationmode = 'USA-states',
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "Number of Applicaitons")
        ) ]

layout = dict(
        title = 'Applications by State (2016)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=plot_data, layout=layout )
plotly.offline.iplot( fig, filename='AppsByState.html' )

### Output the required div (still need to include js!)

In [64]:
plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')

'<div id="be1b23d3-cbd7-4a2b-8da7-d367f7853961" style="height: 100%; width: 100%;" class="plotly-graph-div"></div><script type="text/javascript">window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL="https://plot.ly";Plotly.newPlot("be1b23d3-cbd7-4a2b-8da7-d367f7853961", [{"type": "choropleth", "autocolorscale": true, "locations": ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"], "z": [156.0, 18.0, 317.0, 38.0, 13746.0, 990.0, 357.0, 194.0, 840.0, 565.0, 56.0, 184.0, 942.0, 489.0, 252.0, 157.0, 688.0, 130.0, 209.0, 425.0, 535.0, 1101.0, 452.0, 71.0, 1032.0, 120.0, 94.0, 101.0, 252.0, 483.0, 98.0, 1969.0, 828.0, 80.0, 718.0, 210.0, 1905.0, 1730.0, 69.0, 330.0, 44.0, 474.0, 981.0, 127.0, 194.0, 776.0, 1824.0, 35.0, 736.0, 21.0],

# What are people making?

In [81]:
alch_types = pd.DataFrame(us_only.groupby('Class/TypeCode').count()['TTBID'])
alch_types.columns = ['count']
alch_types = alch_types.reset_index()

In [83]:
alch_types

Unnamed: 0,Class/TypeCode,count
0,ALE,6954
1,AMARETTO,5
2,"ANISETTE, OUZO, OJEN",3
3,APPLE BRANDY,41
4,APRICOT BRANDY,1
5,BEER,1567
6,BLENDED APPLE JACK BRANDY,1
7,BLENDED BOURBON WHISKY,5
8,BLENDED CORN WHISKY,3
9,BLENDED LIGHT WHISKY,2


In [99]:
plot_data = [dict(
                type='bar',
                x=alch_types['Class/TypeCode'],
                y=alch_types['count'])]

layout = dict(
            title = 'Applications by Type',
            xaxis = dict(tickangle = -45),
            margin = dict(b = 200)
             )

# clean version, no labels
layout = dict(
            xaxis = dict(visible = False),
            yaxis = dict(visible = False)
             )

fig = dict( data=plot_data, layout=layout )
plotly.offline.iplot( fig, filename='TypeCounts.html' )

# How much art?

In [104]:
df_labels.head()

Unnamed: 0,_id,img_00_color_frac_00,img_00_color_frac_01,img_00_color_frac_02,img_00_color_frac_03,img_00_color_frac_04,img_00_color_hex_00,img_00_color_hex_01,img_00_color_hex_02,img_00_color_hex_03,...,img_05_color_frac_02,img_05_color_frac_03,img_05_color_frac_04,img_05_color_hex_00,img_05_color_hex_01,img_05_color_hex_02,img_05_color_hex_03,img_05_color_hex_04,img_05_label_type,img_05_label_url
0,16001001000001,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
1,16001001000002,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
2,16001001000003,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
3,16001001000004,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
4,16001001000005,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,


In [114]:
art_work_counts = pd.DataFrame([df_labels['img_00_color_hex_00'].count(),
                                df_labels['img_01_color_hex_00'].count(),
                                df_labels['img_02_color_hex_00'].count(),
                                df_labels['img_03_color_hex_00'].count(),
                                df_labels['img_04_color_hex_00'].count()])
art_work_counts = art_work_counts.reset_index()
art_work_counts.columns = ['num_imgs', 'count']
art_work_counts['num_imgs'] += 1


In [115]:
art_work_counts

Unnamed: 0,num_imgs,count
0,1,5047
1,2,3801
2,3,605
3,4,84
4,5,14


In [121]:
plot_data = [dict(
                type='bar',
                x=art_work_counts['num_imgs'],
                y=art_work_counts['count'])]

layout = dict(
              title = 'Amount of Art',
              xaxis = dict(title='Number of Images'),
              yaxis = dict(title='Count')
             )

# clean version, no labels
#layout = dict(
#            xaxis = dict(visible = False),
#            yaxis = dict(visible = False)
#             )

fig = dict( data=plot_data, layout=layout )
plotly.offline.iplot( fig, filename='AmountOfArt.html' )

# Color?

### Possible ideas

* Look at the Mahalanobis distances of each color in a pallet
* Make a network where the edge weight/length is the proportion
* Plot points in RGB space, point size denotes fraction, find center of mass/clusters

In [116]:
df_labels.head()

Unnamed: 0,_id,img_00_color_frac_00,img_00_color_frac_01,img_00_color_frac_02,img_00_color_frac_03,img_00_color_frac_04,img_00_color_hex_00,img_00_color_hex_01,img_00_color_hex_02,img_00_color_hex_03,...,img_05_color_frac_02,img_05_color_frac_03,img_05_color_frac_04,img_05_color_hex_00,img_05_color_hex_01,img_05_color_hex_02,img_05_color_hex_03,img_05_color_hex_04,img_05_label_type,img_05_label_url
0,16001001000001,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
1,16001001000002,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
2,16001001000003,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
3,16001001000004,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,
4,16001001000005,0.2,0.2,0.2,0.2,0.2,#e2e2e2,#7f7f7f,#fefefe,#a4a4a4,...,,,,,,,,,,


In [150]:
c1 = pd.DataFrame(df_labels['img_00_color_hex_00']).dropna(axis=0, how='any')  # strip out nans
c1 = c1.applymap(matplotlib.colors.hex2color)  # convert to rgb space
c1 = c1['img_00_color_hex_00'].apply(pd.Series)  # split up tuple
c1.columns = ['r', 'g', 'b']
c1 = c1 * 255 # convert to 0-255
colors = c1

In [151]:
colors.head()

Unnamed: 0,r,g,b
0,226.0,226.0,226.0
1,226.0,226.0,226.0
2,226.0,226.0,226.0
3,226.0,226.0,226.0
4,226.0,226.0,226.0


In [155]:
colors['r'].count()

5047

In [159]:
scatter[:3]

[{'marker': {'color': 'rgb(226.0, 226.0, 226.0)'},
  'mode': 'markers',
  'name': 'colors',
  'type': 'Scatter3d',
  'x': 226.0,
  'y': 226.0,
  'z': 226.0},
 {'marker': {'color': 'rgb(226.0, 226.0, 226.0)'},
  'mode': 'markers',
  'name': 'colors',
  'type': 'Scatter3d',
  'x': 226.0,
  'y': 226.0,
  'z': 226.0},
 {'marker': {'color': 'rgb(226.0, 226.0, 226.0)'},
  'mode': 'markers',
  'name': 'colors',
  'type': 'Scatter3d',
  'x': 226.0,
  'y': 226.0,
  'z': 226.0}]

In [181]:
scatter = [None]*colors['r'].count()
for i, (r, g, b) in enumerate(zip(colors['r'], colors['g'], colors['b'])):
    scatter[i] = dict(
        type = 'scatter3d',
        mode = 'markers',
        name = 'colors',
        x = r,
        y = g,
        z = b,
        marker = dict(size=2, color='rgb(125, 125, 125)')
        #marker = dict(color='rgb({r},{g},{b})'.format(r=int(r), g=int(g), b=int(b)))
    )

#clusters = dict(
#    alphahull = 7,
#    name = "y",
#    opacity = 0.1,
#    type = "mesh3d",    
#    x = df['x'], y = df['y'], z = df['z']
#)

layout = dict(
    title = '3d point clustering',
    scene = dict(
        xaxis = dict( zeroline=False ),
        yaxis = dict( zeroline=False ),
        zaxis = dict( zeroline=False ),
    )
)
#fig = dict( data=[scatter, clusters], layout=layout )
fig_s = dict( data=scatter, layout=layout )

plotly.offline.iplot( fig_s, filename='Colors.html' )

In [182]:
fig_s

{'data': [{'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   'y': 226.0,
   'z': 226.0},
  {'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   'y': 226.0,
   'z': 226.0},
  {'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   'y': 226.0,
   'z': 226.0},
  {'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   'y': 226.0,
   'z': 226.0},
  {'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   'y': 226.0,
   'z': 226.0},
  {'marker': {'color': 'rgb(125, 125, 125)', 'size': 2},
   'mode': 'markers',
   'name': 'colors',
   'type': 'scatter3d',
   'x': 226.0,
   

In [163]:
fig

{'data': [{'marker': {'color': 'hsl(0.0,50%,50%)',
    'line': {'width': 1},
    'opacity': 0.3,
    'size': 14},
   'mode': 'markers',
   'name': 2000,
   'text': 0                  Alabama
   1                   Alaska
   2                  Arizona
   3                 Arkansas
   4               California
   5                 Colorado
   6              Connecticut
   7                 Delaware
   8     District of Columbia
   9                  Florida
   10                 Georgia
   11                  Hawaii
   12                   Idaho
   13                Illinois
   14                 Indiana
   15                    Iowa
   16                  Kansas
   17                Kentucky
   18               Louisiana
   19                   Maine
   20                Maryland
   21           Massachusetts
   22                Michigan
   23               Minnesota
   24             Mississippi
   25                Missouri
   26                 Montana
   27                Nebraska

In [162]:
import plotly.plotly as py
import plotly.graph_objs as go
import random
import numpy as np
import pandas as pd

l= []
y= []
data= pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/2014_usa_states.csv")
# Setting colors for plot.
N= 53
c= ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

for i in range(int(N)):
    y.append((2000+i))
    trace0= go.Scatter(
        x= data['rank'],
        y= data['pop']+(i*1000000),
        mode= 'markers',
        marker= dict(size= 14,
                    line= dict(width=1),
                    color= c[i],
                    opacity= 0.3
                   ),name= y[i],
        text= data['state']) # The hover text goes here... 
    l.append(trace0);

layout= go.Layout(
    title= 'Stats of USA States',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Pop',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Rank',
        ticklen= 5,
        gridwidth= 2,
    ),
    showlegend= False
)
fig= go.Figure(data=l, layout=layout)
plotly.offline.iplot(fig)