In [1]:
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

In [2]:
# Load cumilative datasets created in step 1
cumllaborcnty = pd.read_csv('cumllaborcnty.csv')
cumllaborvt = pd.read_csv('cumllaborvt.csv')

In [3]:
cumllaborcnty = pd.melt(cumllaborcnty,id_vars=['Year','County','Type'],var_name='Time', value_name='Count')
cumllaborcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Addison County,Total Covered - all ownerships,Jan,13094
1,2017,Addison County,Private ownership,Jan,11078
2,2017,Addison County,Goods Producing domain,Jan,3259
3,2017,Addison County,Natural Resources and Mining supersector,Jan,459
4,2017,Addison County,"Agriculture, forestry, fishing and hunting",Jan,421


In [4]:
# There are missing values or occasions where the occurance is too small to show, each we are replacing with NaNs
cumllaborcnty = cumllaborcnty.replace('(c)', np.nan)
cumllaborcnty = cumllaborcnty.replace('-', np.nan)
cumllaborcnty = cumllaborcnty.replace('na', np.nan)
cumllaborcnty = cumllaborcnty.replace(' ', np.nan)
cumllaborcnty = cumllaborcnty.replace('<1', 1)
cumllaborcnty['Count'] = pd.to_numeric(cumllaborcnty['Count'])

In [5]:
cumllaborcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Addison County,Total Covered - all ownerships,Jan,13094.0
1,2017,Addison County,Private ownership,Jan,11078.0
2,2017,Addison County,Goods Producing domain,Jan,3259.0
3,2017,Addison County,Natural Resources and Mining supersector,Jan,459.0
4,2017,Addison County,"Agriculture, forestry, fishing and hunting",Jan,421.0


In [6]:
# Set up lists of how these are going to be split out into similiar types
monthlist = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

quarterslist = ['Average Employment Qtr 1', 'Average Employment Qtr 2', 'Average Employment Qtr 3', 'Average Employment Qtr 4', 
            'Average Female Employment Qtr 1', 'Average Female Employment Qtr 2','Average Female Employment Qtr 3',
            'Average Female Employment Qtr 4','Total Wages Qtr 1', 'Total Wages 2', 'Total Wages 3', 
            'Total Wages 4','Average Wage Qtr 1', 'Average Wage Qtr 2', 'Average Wage Qtr 3', 
            'Average Wage Qtr 4','Number of Establishments Qtr 1', 'Number of Establishments Qtr 2',
            'Number of Establishments Qtr 3', 'Number of EstablishmentsQtr 4'
           ]

annuallist = ['Average Employment Annual', 'Average Female Employment Annual', 'Total Wages Annual',  'Average Wage Annual',
           'Number of Establishments Annual'
          ]

In [7]:
months = cumllaborcnty[cumllaborcnty['Time'].isin(monthlist)].copy()
quarters = cumllaborcnty[cumllaborcnty['Time'].isin(quarterslist)].copy()
annual = cumllaborcnty[cumllaborcnty['Time'].isin(annuallist)].copy()

In [8]:
quarters.head()

Unnamed: 0,Year,County,Type,Time,Count
114156,2017,Addison County,Total Covered - all ownerships,Average Employment Qtr 1,13162.0
114157,2017,Addison County,Private ownership,Average Employment Qtr 1,11121.0
114158,2017,Addison County,Goods Producing domain,Average Employment Qtr 1,3270.0
114159,2017,Addison County,Natural Resources and Mining supersector,Average Employment Qtr 1,474.0
114160,2017,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment Qtr 1,437.0


In [9]:
quarters['Quarter'] = np.nan

In [10]:
quarters.loc[quarters['Time'].str.endswith("Qtr 1"), "Quarter"] = "1"
quarters.loc[quarters['Time'].str.endswith("Qtr 2"), "Quarter"] = "2"
quarters.loc[quarters['Time'].str.endswith("Qtr 3"), "Quarter"] = "3"
quarters.loc[quarters['Time'].str.endswith("Qtr 4"), "Quarter"] = "4"
quarters['Time'] = quarters['Time'].str[:-6]

In [11]:
pd.unique(quarters['Time'])

array(['Average Employment', 'Average Female Employment', 'Total Wages',
       'Average Wage', 'Number of Establishments'], dtype=object)

In [12]:
quarters.columns

Index(['Year', 'County', 'Type', 'Time', 'Count', 'Quarter'], dtype='object')

In [16]:
quarters.columns = ['Year', 'County', 'Type', 'Stat', 'Count', 'Quarter']
quarters = quarters[['Year', 'Quarter', 'County', 'Type', 'Stat', 'Count', ]]

In [17]:
quarters.head()

Unnamed: 0,Year,Quarter,County,Type,Stat,Count
114156,2017,1,Addison County,Total Covered - all ownerships,Average Employment,13162.0
114157,2017,1,Addison County,Private ownership,Average Employment,11121.0
114158,2017,1,Addison County,Goods Producing domain,Average Employment,3270.0
114159,2017,1,Addison County,Natural Resources and Mining supersector,Average Employment,474.0
114160,2017,1,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment,437.0


In [28]:
test = quarters.pivot(columns='Stat', values='Count')

test2 = pd.pivot_table(quarters, values='Count', index=['Year','Quarter','County','Type'], columns=['Stat'])
test2 = test2.reset_index() 

In [48]:
test2.tail(40)

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages
2244,2017,4,Windham County,released: November 2017,,,,,
2245,2017,4,Windham County,released: September 2016,,,,,
2246,2017,4,Windsor County,Accommodation and food services,2991.235294,1775.375,4814.588235,,
2247,2017,4,Windsor County,Administrative and waste services,693.75,262.571429,8390.25,,
2248,2017,4,Windsor County,"Agriculture, forestry, fishing and hunting",191.647059,60.5,6218.882353,,
2249,2017,4,Windsor County,"Arts, entertainment, and recreation",385.941176,220.625,5489.823529,,
2250,2017,4,Windsor County,Construction supersector,1447.647059,181.4375,10267.764706,,
2251,2017,4,Windsor County,Durable Goods manufacturing,1322.529412,359.25,10822.529412,,
2252,2017,4,Windsor County,Education and Health Services supersector,3056.470588,2447.0625,9749.941176,,
2253,2017,4,Windsor County,Educational services,358.058824,233.1875,10773.058824,,


In [34]:
addison = test2[test2['County'] == "Addison County"]

In [47]:

conaddison = addison[addison['Type'] == 'Construction supersector']

conaddison.head(40)                                         
#sns.jointplot(x='Average Employment', y='Average Wage', data=conaddison)

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages
4,2017,1,Addison County,Construction supersector,636.0,87.8125,9351.444444,174.666667,5953.166667
575,2017,2,Addison County,Construction supersector,741.444444,97.6875,8969.777778,177.555556,
1146,2017,3,Addison County,Construction supersector,803.411765,96.9375,9016.176471,179.764706,
1717,2017,4,Addison County,Construction supersector,750.647059,97.375,9802.411765,,


In [None]:
# Clean up the column order and then sort everything
months.sort_values(by=['County','Year','Time'])
months.shape

## Exploring the Data

At this point what we have is month over month employment counts for 17 years by industry at various levels. First we are going to tackle the supersectors and see what they look like in box plots. Box plots are a standardized way of displaying the distribution of data based on the five number summary: 
* minimum (bottom T of the line)
* first quartile (bottom of the box)
* median (horizontal line bisecting the box)
* third quartile (top of the box)
* maximum. (top T of the line

In [None]:
natres = quarters[quarters['Type'] == 'Natural Resources and Mining supersector']

data = natres
sns.boxplot(x='Time', y='Count',  data=data)
plt.xticks(rotation=90)

In [None]:
data = natres
sns.boxplot(x='Time', y='Count', hue='County',  data=data)
plt.xticks(rotation=45)

In [None]:
const = months[months['Type'] == 'Construction supersector']

data = const
sns.boxplot(x='Month', y='Count',  data=data)
plt.xticks(rotation=90)

In [None]:
manf = months[months['Type'] == 'Manufacturing supersector']

data = manf
sns.boxplot(x='Month', y='Count',  data=data)
plt.xticks(rotation=90)

In [None]:
trade = months[months['Type'] == 'Trade, Transportation, and Utilities supersector']

data = trade
sns.boxplot(x='Month', y='Count',  data=data)
plt.xticks(rotation=90)

In [None]:
info = months[months['Type'] == 'Information supersector']
data = info
sns.boxplot(x='Month', y='Count', data=data)
plt.xticks(rotation=90)

In [None]:
fin = months[months['Type'] == 'Financial Activities supersector']
data = fin
sns.boxplot(x='Month', y='Count',  data=data)
plt.xticks(rotation=90)

In [None]:
busin = months[months['Type'] == 'Professional and Business Services supersector']
data = busin
sns.boxplot(x='Month', y='Count', data=data)
plt.xticks(rotation=90)

In [None]:
edhealth = months[months['Type'] == 'Education and Health Services supersector']
data = edhealth
sns.boxplot(x='Month', y='Count', data=data)
plt.xticks(rotation=90)

In [None]:
leis = months[months['Type'] == 'Leisure and Hospitality supersector']
data = leis
sns.boxplot(x='Month', y='Count', data=data)
plt.xticks(rotation=90)

In [None]:
other = months[months['Type'] == 'Other services, except public administration supersector']
data = other
sns.boxplot(x='Month', y='Count', data=data)
plt.xticks(rotation=90)

In [None]:
art = months[months['Type'] == 'Arts, entertainment, and recreation']
#art = months[months['Type'] == 'Leisure and Hospitality supersector']

art.head()

In [None]:
data = art[art['County'] == 'Chittenden County']
sns.boxplot(x='Month', y='Count',   data=data)
plt.xticks(rotation=90)

In [None]:
data = art[art['County'] == 'Washington County']
sns.boxplot(x='Month', y='Count',   data=data)
plt.xticks(rotation=90)

In [None]:
data = art
data = data[(data['County'] == 'Bennington County') |
          (data['County'] == 'Washington County') | 
          (data['County'] == 'Chittenden County') | 
          (data['County'] == 'Rutland County')]
sns.boxplot(x='Month', y='Count', hue='County',  data=data)
plt.xticks(rotation=90)

In [None]:
health = months[months['Type'] == 'Education and Health Services supersector']

In [None]:
sns.boxplot(x='County', y='Count', data=health)
plt.xticks(rotation=90)

In [None]:
pd.unique(months['Type'])

In [None]:
fin = months[months['Type'] == 'Financial Activities supersector']

In [None]:
sns.boxplot(x='County', y='Count',  data=fin)
plt.xticks(rotation=90)

In [None]:
tech = months[months['Type'] == 'Professional and Business Services supersector']

In [None]:
sns.boxplot(x='County', y='Count',  data=tech)
plt.xticks(rotation=90)

In [None]:
info = months[months['Type'] == 'Information supersector']

In [None]:
sns.boxplot(x='County', y='Count',  data=info)
plt.xticks(rotation=90)

In [None]:
healthed = months[months['Type'] == 'Education and Health Services supersector']

In [None]:
sns.boxplot(x='County', y='Count',  data=healthed)
plt.xticks(rotation=90)

In [None]:
fed = months[months['Type'] == 'Federal Government']

In [None]:
sns.boxplot(x='County', y='Count',  data=fed)
plt.xticks(rotation=90)

In [None]:
state = months[months['Type'] == 'State government']

In [None]:
sns.boxplot(x='County', y='Count',  data=state)
plt.xticks(rotation=90)