# Imports

In [1]:
import datetime
import pymongo
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import numpy as np
from collections import defaultdict

import plotly
import plotly.plotly as py
plotly.offline.init_notebook_mode(connected=True)

# Connect to the database

In [2]:
# Open a MongoDB client using pymongo's default connection settings.
client = pymongo.MongoClient()

# Handles are lazy: nothing is fetched until a query is actually issued.
db = client['TTB']                 # database
TTB = db['TTB']                    # collection holding the form data
TTB_labels = db['LabelImages']     # collection holding the label image data

# Load into pandas

In [3]:
def _collection_to_frame(collection):
    """Return a DataFrame with one row per document returned by `collection.find()`."""
    return pd.DataFrame(list(collection.find()))


df = _collection_to_frame(TTB)
df_labels = _collection_to_frame(TTB_labels)

#### Helper code to find only domestics (as that's all we processed for labels)

In [7]:
# Single source of truth for the domestic filter: full state name -> two-letter
# abbreviation for the 50 US states. `states` and `abbrev_lookup` below are
# derived from this mapping so the name list cannot drift out of sync with it.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Upper-cased state names, as that is the form used in the data.
states = [name.upper() for name in us_state_abbrev]

# capitalized versions: upper-cased name -> abbreviation. Kept as a
# defaultdict(str) so an unknown name yields '' instead of raising KeyError.
abbrev_lookup = defaultdict(
    str,
    {name.upper(): abbrev for name, abbrev in us_state_abbrev.items()},
)


# Domestic only

In [8]:
# Build both conditions against the same frame (`df`) and apply them in one
# step. Filtering `us_only` with a mask computed from `df` only works because
# pandas silently aligns the mask on the index; a single combined mask selects
# the same rows without depending on that alignment.
is_domestic = df['OriginCode'].isin(states)
is_approved = df['Status'] == 'APPROVED'
us_only = df.loc[is_domestic & is_approved]

# Number of approved domestic records (non-null `_id` values).
us_only['_id'].count()

38143

In [12]:
# Count the non-null `_id` values in the label-image frame.
df_labels['_id'].count()

5167

Inner join on `_id`

In [14]:
# Index the label frame by `_id` so it can be matched against the `_id`
# column of `us_only`; the inner join keeps only ids present in both frames.
labels_by_id = df_labels.set_index('_id')
in_both = us_only.join(labels_by_id, on='_id', how='inner')

In [16]:
# Count the non-null `_id` values that survived the inner join.
in_both['_id'].count()

2750

<div class="alert alert-danger">
Some image labels that we processed are not in fact in the us_only list!
</div>

# Investigating more deeply

Completely merge both tables together (outer join) and take advantage of the `indicator` option

In [34]:
# Outer merge on the shared `_id` key; `indicator=True` adds a `_merge` column
# recording whether each row came from the left frame, the right frame, or both.
compiled = us_only.merge(
    df_labels,
    on='_id',
    how='outer',
    indicator=True,
)

In [35]:
# `_merge` is the indicator column of the merged frame; tally how many rows
# fall under each of its values.
m = compiled['_merge']
m.value_counts()

left_only     35393
both           2750
right_only     2417
Name: _merge, dtype: int64

Almost half of the label `_id`s (2,417 of 5,167) are not found in the USA set?!?!

#### Find out where they came from

Get only the rows that appear solely in the right table, i.e. `df_labels`

In [39]:
# Rows whose `_merge` value is 'right_only', i.e. present only in the right
# (label) frame of the merge.
label_only_mask = compiled['_merge'] == 'right_only'
problems = compiled.loc[label_only_mask]

# How many such rows there are (non-null `_id` values).
problems['_id'].count()

2417

Find the corresponding `_id`s in the full `df` table

In [43]:
# Look the flagged ids up in the full, unfiltered form table.
problem_ids = problems['_id']
in_problem_set = df['_id'].isin(problem_ids)
problems_full = df.loc[in_problem_set]

# Number of matching records (non-null `_id` values).
problems_full['_id'].count()

2417

Get counts of each origin

In [45]:
# Tally the flagged records by `OriginCode`, most frequent first.
problems_full['OriginCode'].value_counts()

FRANCE                              1022
ITALY                                489
SPAIN                                210
CHILE                                 87
GERMANY                               65
AUSTRALIA                             61
ARGENTINA                             60
SOUTH AFRICA (UNION OF)               56
MEXICO                                48
AMERICAN                              45
SCOTLAND                              33
PORTUGAL                              31
NEW ZEALAND                           24
ISRAEL                                23
BELGIUM                               18
CANADA                                17
BULGARIA                              15
GREECE                                10
ENGLAND                                9
AUSTRIA                                8
POLAND                                 8
NETHERLANDS                            7
BRAZIL                                 7
SWEDEN                                 7
BARBADOS        

__Conclusion__: bad filtering before image processing