# Concept

The goal here is to build a sequential web scraper based off of the TTBID. I think that the simplest thing to do will just be to start with a basic easy date and then just increment until we get an error back. Since things are _supposedly_ sequential, we can interate easily. A good test will be to try for a small test range. It may also be worth trying to get those parallel scraping tools operational

<div class="alert alert-block alert-info">
TTB ID - This is a unique, 14 digit number assigned by TTB to track each COLA.  The first 5 digits represent the calendar year and Julian date the application was received by TTB. The next 3 digits tell how the application was received (001 = e-filed; 002 & 003 = mailed/overnight; 000 = hand delivered). The last 6 digits is a sequential number that resets for each day and for each received code.
</div>

# Imports

In [2]:
import requests
from bs4 import BeautifulSoup
import re

import datetime
import pymongo
import warnings

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

from PIL import Image
from sklearn import cluster
from sklearn.metrics import silhouette_score

from sklearn.utils import shuffle
import numpy as np

import sys
sys.path.append(r'../ScrapingTools')
from TTB_scraping import TTB_Scraper
from time import sleep


### Early prototypes

In [None]:
start_date = '01/30/2016'
stop_date = '01/01/2017'

In [None]:
f = open('logfile.txt', 'w')

# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # the actual collection

# convert dates to datetime format
date_start = datetime.datetime.strptime(start_date, '%m/%d/%Y')
date_stop = datetime.datetime.strptime(stop_date, '%m/%d/%Y')

# iterate over each date
curr_date = date_start
while (curr_date < date_stop):
    print('Now on:  {}'.format(curr_date.strftime('%m/%d/%Y')))
    # iterate over each recieve code
    curr_reccode = 0
    while curr_reccode <= 3:

        # increment each sequence 
        cont_seq = True
        curr_seqnum = 1
        retry_count = 0
        while cont_seq:
            # prep the strings for the ttbid
            jdate='{year}{day}'.format(year=curr_date.strftime('%y'), day=curr_date.strftime('%j'))
            reccode='{:03d}'.format(curr_reccode)
            seqnum='{:06d}'.format(curr_seqnum)

            # prep the query
            ttbid = '{jdate}{reccode}{seqnum}'.format(jdate=jdate, reccode=reccode, seqnum=seqnum)

            query = TTB_Scraper(ttbid)
            parsed_data = query.get_basic_form_data()

            # if we got a valid response
            if parsed_data:
                query_data = {'_id': ttbid,
                         'recieve_date':curr_date.strftime('%m/%d/%Y'),
                         'recieve_code': reccode,
                         'seq_num': seqnum}
                
                # concatinated data we will add to our database
                output = {**query_data, **parsed_data}

                curr_seqnum += 1
                retry_count = 0
                # Insert result into database
                try:
                    TTB.insert_one(output)
                    #print('Successfully added: {}'.format(ttbid))
                    f.write('{},1\n'.format(ttbid))
                except pymongo.errors.DuplicateKeyError:
                    warnings.warn('_id: {ttbid} is already in database, skipping...'.format(ttbid=ttbid))
            else:
                # stick with this sequence
                if retry_count < 3:
                    curr_seqnum += 1
                    retry_count += 1
                else:
                    cont_seq = False
                f.write('{},0\n'.format(ttbid))

            sleep(0.1)
        curr_reccode += 1
    curr_date += datetime.timedelta(days=1)

f.close()

In [None]:
curr_reccode

In [None]:
ttbid

In [None]:
output

In [None]:
assert(output['TTBID'] == output['_id'])

In [None]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # the actual collection

In [None]:
try:
    res = TTB.insert_one(output)
except pymongo.errors.DuplicateKeyError:
    warnings.warn('_id: {ttbid} is already in database, skipping...'.format(ttbid=ttbid))

In [None]:
res.inserted

In [None]:
ttbid = 16004001000014
query = TTB_Scraper(ttbid)
data = query.get_basic_form_data()
data

In [None]:
def build_ttb_database(start_date, stop_date):

# Getting data from the Mongo DB

In [2]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB = db.TTB # the actual collection

In [3]:
TTB.count() # number of elements in the database

85213

In [None]:
a = TTB.distinct('TTBID') # list of distinct TTBID's
len(a)

In [None]:
TTB.find_one('16001001000002')

We can preint out some basic stats like so:

In [None]:
# print collection statistics
#print(db.command("collstats", "TTB"))

# print database statistics
print(db.command({"dbstats": 1,  'scale': 1024}))

Estimate for one year's worth of entries

In [None]:
(208/408) * 147073

In [None]:
# WARNING: deletes database?
#client.drop_database('TTB')

# Mongo into Pandas

The following snippet _should_ turn every element of our mongodb into a list which is then parsed by pandas into a df

In [4]:
df = pd.DataFrame(list(TTB.find()))

In [None]:
df.columns

In [5]:
df['TTBID'] = df['TTBID'].apply(pd.to_numeric)

In [None]:
df['TTBID'].plot()
plt.show()

In [None]:
by_status = df.groupby('Status')

In [None]:
approved_only = df.loc[df['Status'] == 'APPROVED']

In [6]:
# get list of all US states, convert to uppercase as that is what is used
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
states = [state.upper() for state in states]

us_state_abbrev = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}

# capitalized versions
abbrev_lookup=defaultdict(str)
for k, v in us_state_abbrev.items():
    abbrev_lookup[k.upper()] = v


In [25]:
us_only = df.loc[df['OriginCode'].isin(states)]
us_only = us_only.loc[df['Status'] == 'APPROVED']
us_only['_id'].count()

38143

# Img Proc on the subset

In [8]:
def centroid_histogram(clt):
    """From pyimage search, gets us fraction of each color"""
    # grab the number of different clusters and create a histogram
    # based on the number of pixels assigned to each cluster
    numLabels = len(np.unique(clt.labels_))
    (hist, _) = np.histogram(clt.cluster_centers_, bins = numLabels)

    # normalize the histogram, such that it sums to one
    hist = hist.astype("float")
    hist /= hist.sum()

    # return the histogram (percentage described by each cluster)
    return hist.reshape(numLabels,1)

In [22]:
def dominant_colors(img, max_colors=10, n_init=25):
    """Uses k-means to find n_colors dominant colors, expects a PIL Image"""
    thumbnail = Image.Image.copy(img)  # needed b/c thumbnail operates in place
    thumbnail.thumbnail((128,128))  # reduce side to speed up

    thumb = np.array(thumbnail)
    
    try:
        # drop alpha, if it is present
        if thumb.shape[2] == 4:
            thumb = thumb[:,:,:3]
    except IndexError:
        # binary or gray scale image, replicate to make correct size
        o_shape = thumb.shape
        thumb = np.tile(thumb, 3).reshape((*o_shape, -1))
        
        
    w, h, d = original_shape = tuple(thumb.shape)
    assert d == 3
    image_array = np.reshape(img, (w * h, d))

    image_array_sample = shuffle(image_array, random_state=0)[:1000]  # take a random sample of 1000 points

    rand_state = 0  # same randome state is used for repeatability

    bestSilhouette = 0
    for n_colors in range(2, max_colors):
        clt = cluster.KMeans(n_clusters=n_colors, random_state=rand_state, n_init=n_init)
        clt.fit(image_array_sample)
        silhouette = silhouette_score(image_array_sample, clt.labels_, metric='euclidean')

        # Find the best one
        if silhouette > bestSilhouette:
            bestSilhouette = silhouette
            best_nClusters = n_colors;

    clt = cluster.KMeans(n_clusters=best_nClusters, random_state=rand_state, n_init=n_init)
    clusters = clt.fit(image_array_sample)
    hist = centroid_histogram(clusters)

    return [hist, cluster_labels.cluster_centers_]

## Iterate over all domestic ID's, create new table for results

In [10]:
# Set up connection to mongodb
client = pymongo.MongoClient() # Connect to default client
db = client.TTB # Get a database (note: lazy evaluation)
TTB_labels = db.LabelImages # the actual collection

In [24]:
TTB_labels.count()

5167

In [None]:
for curr_id in us_only['TTBID'][77:]:
    query = TTB_Scraper(curr_id)
    [meta, imgs] = query.get_images()
    output = {'_id': str(curr_id)}
    for im_num, (metadata, img) in enumerate(zip(meta, imgs)):
        [percentage, colors] = dominant_colors(img)
        
        colors = colors.clip(min=0, max=255)
        hex_color = [matplotlib.colors.rgb2hex(rgb_color/255) for rgb_color in colors]
        
        output['img_{:02d}_label_type'.format(im_num)] = metadata[0]
        output['img_{:02d}_label_url'.format(im_num)] = metadata[1]
        
        # add histograms using img.histogram()
        
        for num, (percent, color) in enumerate(zip(percentage, hex_color)):
            output['img_{:02d}_color_frac_{:02d}'.format(im_num, num)] = percent[0]
            output['img_{:02d}_color_hex_{:02d}'.format(im_num, num)] = color
            
    # Insert result into database
    try:
        TTB_labels.insert_one(output)
        #print('Successfully added: {}'.format(ttbid))
    except pymongo.errors.DuplicateKeyError:
        #warnings.warn('_id: {ttbid} is already in database, skipping...'.format(ttbid=curr_id))
        print('Entry already present, skipping: {}'.format(curr_id))
    sleep(0.1)

Entry already present, skipping: 16003001000007




In [12]:
curr_id= 16003001000008

In [64]:
output

{'_id': '16001001000052'}

In [61]:
colors.clip(0)

array([[ 254.30569307,  252.82549505,  240.86262376],
       [ 102.        ,  111.44444444,  102.33333333],
       [ 205.81481481,  190.48148148,  171.55555556],
       [ 225.33333333,    0.        ,   59.66666667],
       [ 235.41269841,  235.32539683,  226.34126984]])

In [37]:
hex_color = [matplotlib.colors.rgb2hex(rgb_color/255) for rgb_color in colors]
hex_color

['#e2e2e2', '#7f7f7f', '#fefefe', '#a4a4a4', '#c6c6c6']

In [66]:
colors

array([[ 254.3478803 ,  252.67456359,  241.0074813 ],
       [ 162.55      ,  108.85      ,   91.95      ],
       [ 197.3902439 ,  172.14634146,  147.65853659],
       [  78.42857143,   56.85714286,   56.57142857],
       [ 230.67479675,  219.39837398,  200.16260163]])

In [14]:
tmp = Image.Image.copy(img)
tmp.thumbnail((128,128))  # reduce side to speed up
tmp = np.array(tmp)
tmp.shape

(128, 128)

In [19]:
original_shape = tmp.shape
np.tile(tmp, 3).reshape((*original_shape, -1)).shape

(128, 128, 3)

In [78]:
dominant_colors(img)

[array([[ 0.2       ],
        [ 0.13333333],
        [ 0.06666667],
        [ 0.33333333],
        [ 0.26666667]]), array([[   0.29711752,    0.36141907,    0.37694013],
        [ 178.22072072,  161.26801802,  159.99099099],
        [ 120.89473684,  110.28947368,  108.63157895],
        [  73.92307692,   68.38461538,   67.53846154],
        [ 151.02439024,  136.29268293,  134.82926829]])]

In [None]:
query = TTB_Scraper(curr_id)
[meta, imgs] = query.get_images()

In [None]:
plt.imshow(img)
plt.show()

In [None]:
db.ProductData.find_one_and_update({'_id': curr_id},
                                   {'$set': {'dcolor_frac': dcolor_frac},
                                    '$set': {'dcolor_val': dcolor_val}}, upsert=False)

In [None]:
TTB.find_one({'_id': '16001001000001'})

In [None]:
TTB.find_one_and_update()