# Duplicate Testing

###### Imports

In [1]:
import pandas as pd
import numpy as np
import requests

from __future__ import print_function, division
import matplotlib.pylab as pl
import seaborn as sns
sns.set_style('whitegrid')
# import json

# Spatial
import geopandas as gpd
import fiona
import shapely

import urllib.request
import urllib

#import statsmodels.formula.api as smf
#import statsmodels.api as sm

%pylab inline

Populating the interactive namespace from numpy and matplotlib


###### Functions for Gathering Data  
These functions are taken from the Bureau of Labor Statistics sample code (Python 3.x example): https://data.bls.gov/cew/doc/access/data_access_examples.htm#PYTHON

In [2]:
# *******************************************************************************
# qcewCreateDataRows : This function takes a raw csv string and splits it into
# a two-dimensional array containing the data and the header row of the csv file.
# Both binary (bytes) and char (str) input are handled.
def qcewCreateDataRows(csv):
    """Split raw CSV content into a list of rows, each a list of field strings.

    Parameters
    ----------
    csv : bytes or str
        Raw file content with rows separated by '\\r\\n'. Objects that have a
        ``decode()`` method (e.g. bytes) are decoded first; anything else is
        used as-is.

    Returns
    -------
    list of list of str
        One inner list per '\\r\\n'-separated line, header line included.
        Fields are produced by a plain ``split(',')``: surrounding double
        quotes are kept, and a comma inside a quoted field is still treated
        as a separator. Content ending in '\\r\\n' yields a final ``['']`` row.
    """
    try:
        text = csv.decode()
    except AttributeError:
        # str has no decode(); the content is already text.
        # (The original `except er:` referenced an undefined name, so this
        # fallback raised NameError instead of running.)
        text = csv
    return [line.split(',') for line in text.split('\r\n')]
# *******************************************************************************


# *******************************************************************************
# qcewGetAreaData : This function takes a year, quarter, and area argument and
# returns an array containing the associated area data. Use 'a' for annual
# averages.
# For all area codes and titles see:
# http://www.bls.gov/cew/doc/titles/area/area_titles.htm
#
def qcewGetAreaData(year,qtr,area):
    """Download one area CSV file and return it as a list of rows.

    Parameters
    ----------
    year : str or int
        Year placed in the URL path.
    qtr : str or int
        Quarter placed in the URL path (lower-cased); 'a' for annual averages.
    area : str
        Area code placed in the URL path (upper-cased).

    Returns
    -------
    list of list of str
        The downloaded content as split by ``qcewCreateDataRows``.

    Any error raised by ``urllib.request.urlopen`` or ``read()`` propagates
    to the caller.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/[QTR]/area/[AREA].csv"
    # str() lets callers pass the year/quarter as integers as well as strings.
    urlPath = urlPath.replace("[YEAR]",str(year))
    urlPath = urlPath.replace("[QTR]",str(qtr).lower())
    urlPath = urlPath.replace("[AREA]",area.upper())
    # The context manager closes the stream even when read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)
# *******************************************************************************


# *******************************************************************************
# qcewGetIndustryData : This function takes a year, quarter, and industry code
# and returns an array containing the associated industry data. Use 'a' for
# annual averages. Some industry codes contain hyphens. The CSV files use
# underscores instead of hyphens. So 31-33 becomes 31_33.
# For all industry codes and titles see:
# http://www.bls.gov/cew/doc/titles/industry/industry_titles.htm
#
def qcewGetIndustryData(year,qtr,industry):
    """Download one industry CSV file and return it as a list of rows.

    Parameters
    ----------
    year : str or int
        Year placed in the URL path.
    qtr : str or int
        Quarter placed in the URL path (lower-cased); 'a' for annual averages.
    industry : str or int
        Industry code. Hyphens are converted to underscores, so both
        '31-33' and '31_33' request the same file.

    Returns
    -------
    list of list of str
        The downloaded content as split by ``qcewCreateDataRows``.

    Any error raised by ``urllib.request.urlopen`` or ``read()`` propagates
    to the caller.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/[QTR]/industry/[IND].csv"
    # str() lets callers pass the year/quarter/code as integers as well as strings.
    urlPath = urlPath.replace("[YEAR]",str(year))
    urlPath = urlPath.replace("[QTR]",str(qtr).lower())
    # File names use underscores where the published codes use hyphens.
    urlPath = urlPath.replace("[IND]",str(industry).replace("-","_"))
    # The context manager closes the stream even when read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)
# *******************************************************************************


# *******************************************************************************
# qcewGetSizeData : This function takes a year and establishment size class code
# and returns an array containing the associated size data. Size data
# is only available for the first quarter of each year.
# For all establishment size classes and titles see:
# http://www.bls.gov/cew/doc/titles/size/size_titles.htm
#
def qcewGetSizeData(year,size):
    """Download one establishment-size CSV file and return it as a list of rows.

    Parameters
    ----------
    year : str or int
        Year placed in the URL path. The quarter is fixed to 1 in the URL.
    size : str or int
        Establishment size class code placed in the URL path.

    Returns
    -------
    list of list of str
        The downloaded content as split by ``qcewCreateDataRows``.

    Any error raised by ``urllib.request.urlopen`` or ``read()`` propagates
    to the caller.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/1/size/[SIZE].csv"
    # str() lets callers pass the year/size code as integers as well as strings.
    urlPath = urlPath.replace("[YEAR]",str(year))
    urlPath = urlPath.replace("[SIZE]",str(size))
    # The context manager closes the stream even when read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)

In [12]:
# Annual-average ('A') 2017 records for area 47189, with the CSV header row
# promoted to column labels.
rows_2017 = qcewGetAreaData('2017', 'A', '47189')
frame_2017 = pd.DataFrame(rows_2017)
frame_2017.columns = frame_2017.iloc[0]  # header row becomes the column labels
twenty17 = frame_2017.iloc[1:]  # keep only the records below the header row
# Quote clean-up from the single-county test, left disabled here:
#test.columns = [i.replace('"', '') for i in test.columns] # cleaning data
#test = test.replace({'"':''}, regex=True) # cleaning data

In [13]:
# Show the 2017 annual-average frame; field values still carry the literal
# double quotes from the CSV because no clean-up has been applied.
twenty17

Unnamed: 0,"""area_fips""","""own_code""","""industry_code""","""agglvl_code""","""size_code""","""year""","""qtr""","""disclosure_code""","""annual_avg_estabs""","""annual_avg_emplvl""",...,"""oty_total_annual_wages_chg""","""oty_total_annual_wages_pct_chg""","""oty_taxable_annual_wages_chg""","""oty_taxable_annual_wages_pct_chg""","""oty_annual_contributions_chg""","""oty_annual_contributions_pct_chg""","""oty_annual_avg_wkly_wage_chg""","""oty_annual_avg_wkly_wage_pct_chg""","""oty_avg_annual_pay_chg""","""oty_avg_annual_pay_pct_chg"""
1,"""47189""","""0""","""10""","""70""","""0""","""2017""","""A""","""""",2783,42269,...,121738429,7.3,24598694,6.8,44394,1.1,16,2.0,826,2.0
2,"""47189""","""1""","""10""","""71""","""0""","""2017""","""A""","""""",16,211,...,512624,4.1,0,0.0,0,0.0,30,2.6,1568,2.6
3,"""47189""","""1""","""102""","""72""","""0""","""2017""","""A""","""""",16,211,...,512624,4.1,0,0.0,0,0.0,30,2.6,1568,2.6
4,"""47189""","""1""","""1021""","""73""","""0""","""2017""","""A""","""""",7,170,...,418576,4.5,0,0.0,0,0.0,28,2.6,1413,2.5
5,"""47189""","""1""","""1028""","""73""","""0""","""2017""","""A""","""""",9,40,...,94048,2.9,0,0.0,0,0.0,51,3.3,2668,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360,"""47189""","""5""","""999""","""75""","""0""","""2017""","""A""","""""",2,2,...,47987,3979.0,21987,1823.1,1314,1684.6,381,680.4,19810,684.5
1361,"""47189""","""5""","""9999""","""76""","""0""","""2017""","""A""","""""",2,2,...,47987,3979.0,21987,1823.1,1314,1684.6,381,680.4,19810,684.5
1362,"""47189""","""5""","""99999""","""77""","""0""","""2017""","""A""","""""",2,2,...,47987,3979.0,21987,1823.1,1314,1684.6,381,680.4,19810,684.5
1363,"""47189""","""5""","""999999""","""78""","""0""","""2017""","""A""","""""",2,2,...,47987,3979.0,21987,1823.1,1314,1684.6,381,680.4,19810,684.5


## Range of Years  

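Say you want to get data for a set of areas over a span of years. The cell below requests first-quarter data for every odd-numbered area code from 47001 through 47189, for each year from 2015 through 2020, and stacks the results into one table.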

In [32]:
# Same download as above, repeated in a loop: first-quarter ('1') records for
# every odd-numbered area code 47001-47189, for each year 2015 through 2020.

frames = []
for year in range(2015, 2021):
    for county in range(1, 190, 2):  # odd codes only: 1, 3, ..., 189
        data = pd.DataFrame(qcewGetAreaData(str(year), '1', '47%03d' % county))
        data.columns = data.iloc[0]  # header row becomes the column labels
        data = data.iloc[1:]  # keep only the records below the header row
        frames.append(data)

# Stack every per-request frame; each keeps its own row labels.
appended_data = pd.concat(frames)


In [None]:
#go ahead and get the column names out from any of those API loops

In [33]:
# First five stacked rows (earliest year, lowest area code requested).
appended_data.head()

Unnamed: 0,"""area_fips""","""own_code""","""industry_code""","""agglvl_code""","""size_code""","""year""","""qtr""","""disclosure_code""","""qtrly_estabs""","""month1_emplvl""",...,"""oty_month3_emplvl_chg""","""oty_month3_emplvl_pct_chg""","""oty_total_qtrly_wages_chg""","""oty_total_qtrly_wages_pct_chg""","""oty_taxable_qtrly_wages_chg""","""oty_taxable_qtrly_wages_pct_chg""","""oty_qtrly_contributions_chg""","""oty_qtrly_contributions_pct_chg""","""oty_avg_wkly_wage_chg""","""oty_avg_wkly_wage_pct_chg"""
1,"""47001""","""0""","""10""","""70""","""0""","""2015""","""1""","""""",1650,39099,...,502,1.3,6618154,1.3,-8654850,-3.7,-502662,-12.9,-5,-0.5
2,"""47001""","""1""","""10""","""71""","""0""","""2015""","""1""","""""",18,856,...,-47,-5.2,-903711,-4.3,0,0.0,0,0.0,16,0.9
3,"""47001""","""1""","""102""","""72""","""0""","""2015""","""1""","""""",18,856,...,-47,-5.2,-903711,-4.3,0,0.0,0,0.0,16,0.9
4,"""47001""","""1""","""1021""","""73""","""0""","""2015""","""1""","""""",6,161,...,-10,-5.8,-285551,-7.7,0,0.0,0,0.0,-34,-2.1
5,"""47001""","""1""","""1028""","""73""","""0""","""2015""","""1""","""""",12,695,...,-30,-4.2,-396659,-2.3,0,0.0,0,0.0,34,1.9


In [31]:
# Last five stacked rows.
# NOTE(review): the saved output of this cell carries execution count 31 and
# shows annual columns with qtr "A", while the In [32] loop requests quarter
# '1' -- the output predates that loop; re-run this cell to refresh it.
appended_data.tail()

Unnamed: 0,"""area_fips""","""own_code""","""industry_code""","""agglvl_code""","""size_code""","""year""","""qtr""","""disclosure_code""","""annual_avg_estabs""","""annual_avg_emplvl""",...,"""oty_total_annual_wages_chg""","""oty_total_annual_wages_pct_chg""","""oty_taxable_annual_wages_chg""","""oty_taxable_annual_wages_pct_chg""","""oty_annual_contributions_chg""","""oty_annual_contributions_pct_chg""","""oty_annual_avg_wkly_wage_chg""","""oty_annual_avg_wkly_wage_pct_chg""","""oty_avg_annual_pay_chg""","""oty_avg_annual_pay_pct_chg"""
1369,"""47189""","""5""","""814""","""75""","""0""","""2020""","""A""","""""",37.0,37.0,...,-886865.0,-49.3,-442803.0,-56.4,-13065.0,-64.2,101.0,27.3,5251.0,27.3
1370,"""47189""","""5""","""8141""","""76""","""0""","""2020""","""A""","""""",37.0,37.0,...,-886865.0,-49.3,-442803.0,-56.4,-13065.0,-64.2,101.0,27.3,5251.0,27.3
1371,"""47189""","""5""","""81411""","""77""","""0""","""2020""","""A""","""""",37.0,37.0,...,-886865.0,-49.3,-442803.0,-56.4,-13065.0,-64.2,101.0,27.3,5251.0,27.3
1372,"""47189""","""5""","""814110""","""78""","""0""","""2020""","""A""","""""",37.0,37.0,...,-886865.0,-49.3,-442803.0,-56.4,-13065.0,-64.2,101.0,27.3,5251.0,27.3
1373,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# TEST ONE COUNTY
# this one is a test of one STCO -- will not be stored
# NOTE(review): `ownership` and `industry` are not defined in the part of the
# notebook visible here -- confirm both are set before this cell runs.

test = qcewGetAreaData("2017","A",'06041') # using BLS code (only after 2013!)
test = pd.DataFrame(test) # put it in a pandas table
test.columns = test.iloc[0] # first row to headers
test = test[1:] # same
test.columns = [i.replace('"', '') for i in test.columns] # cleaning data
test = test.replace({'"':''}, regex=True) # cleaning data
test = test[['area_fips', 'own_code', 'industry_code', 'annual_avg_emplvl']] # selecting only relevant columns

# index to relevant row by ownership and industry
# (.copy() so the dtype conversion below acts on an independent frame
# instead of a slice of the unfiltered one)
test = test.loc[(test['own_code'] == ownership) & test['industry_code'].isin(industry['office'])].copy()

# summing all rows to total and create new row
test['annual_avg_emplvl'] = test['annual_avg_emplvl'].astype(int)
total = test.sum(numeric_only=True)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
test = pd.concat([test, total.to_frame().T], ignore_index=True)

# assigning fips, own, industry data to the new row
# (.loc writes into the frame itself; the chained form test[col][-1:] = ...
# assigns through a temporary and is not guaranteed to update the frame)
last = test.index[-1]
test.loc[last, 'area_fips'] = test['area_fips'].iloc[0]
test.loc[last, 'own_code'] = test['own_code'].iloc[0]
test.loc[last, 'industry_code'] = 'office'

test = test.iloc[-1:].copy() # dropping all rows but the sum
test['annual_avg_emplvl'] = test['annual_avg_emplvl'].astype(int)

print(test.shape)
print(test.dtypes)
test.head()