In [1]:
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

## Information and Instructions

In this notebook the data from step one is transformed into a tidy-data structure, split into monthly, quarterly and annual observations. Additional data cleaning steps are performed as well, to prepare the data for analysis in the next step.

In [2]:
# Read the cumulative county-level and state-level tables written by step 1
cumllaborcnty, cumllaborvt = (
    pd.read_csv(csv_name) for csv_name in ('cumllaborcnty.csv', 'cumllaborvt.csv')
)

# County

## Melting Data

In order to split this out correctly and reorder the data, I first melt it into a few columns, so that every monthly, quarterly and annual measurement sits in its own row, labelled by a single `Time` column.

In [3]:
# Go from wide to long: Year/County/Type identify a row, every other column
# header becomes a value of 'Time' and its cell becomes 'Count'
cumllaborcnty = cumllaborcnty.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Addison County,Total Covered - all ownerships,Apr,13656
1,2000,Addison County,Private ownership,Apr,11545
2,2000,Addison County,Goods Producing domain,Apr,3366
3,2000,Addison County,Natural Resources and Mining supersector,Apr,533
4,2000,Addison County,"Agriculture, forestry, fishing and hunting",Apr,484


In [4]:
# Placeholder strings that stand for missing values, or for occurrences too
# small to show, are turned into NaN (exact cell matches, any column)
missing_markers = ['(c)', '-', 'na', ' ']
cumllaborcnty = cumllaborcnty.replace(missing_markers, np.nan)

# '<1' is not treated as missing: it is replaced by the number 1
cumllaborcnty = cumllaborcnty.replace('<1', 1)

# The counts were stored as strings, so convert the column to a numeric dtype
cumllaborcnty['Count'] = pd.to_numeric(cumllaborcnty['Count'])

In [5]:
# Labels used to route each melted row into the monthly, quarterly or annual
# table. Rows are selected with an exact-match `isin` test, so every label
# has to equal the corresponding step-1 column header character for character.
monthlist = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Statistics that are reported once per quarter and once per year
statlist = ['Average Employment', 'Average Female Employment', 'Total Wages',
            'Average Wage', 'Number of Establishments']

# Built from the statistic names so that every label follows the same
# "<statistic> Qtr <n>" pattern. The hand-typed list had 'Total Wages 2/3/4'
# (no "Qtr") and 'Number of EstablishmentsQtr 4' (no space), which silently
# dropped those measurements from the quarterly tables.
quarterslist = ['{} Qtr {}'.format(stat, qtr) for stat in statlist for qtr in range(1, 5)]

# "<statistic> Annual" for each statistic
annuallist = ['{} Annual'.format(stat) for stat in statlist]

In [6]:
# Build one independent table per kind of time period, selected by 'Time' label
period_labels = cumllaborcnty['Time']
monthscnty = cumllaborcnty.loc[period_labels.isin(monthlist)].copy()
quarterscnty = cumllaborcnty.loc[period_labels.isin(quarterslist)].copy()
annualcnty = cumllaborcnty.loc[period_labels.isin(annuallist)].copy()

In [7]:
# Preview the first rows of the monthly county subset
monthscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Addison County,Total Covered - all ownerships,Apr,13656.0
1,2000,Addison County,Private ownership,Apr,11545.0
2,2000,Addison County,Goods Producing domain,Apr,3366.0
3,2000,Addison County,Natural Resources and Mining supersector,Apr,533.0
4,2000,Addison County,"Agriculture, forestry, fishing and hunting",Apr,484.0


In [8]:
# Preview the first rows of the quarterly county subset
quarterscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
28350,2000,Addison County,Total Covered - all ownerships,Average Employment Qtr 1,13162.0
28351,2000,Addison County,Private ownership,Average Employment Qtr 1,11121.0
28352,2000,Addison County,Goods Producing domain,Average Employment Qtr 1,3270.0
28353,2000,Addison County,Natural Resources and Mining supersector,Average Employment Qtr 1,474.0
28354,2000,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment Qtr 1,437.0


In [9]:
# Preview the first rows of the annual county subset
annualcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
18900,2000,Addison County,Total Covered - all ownerships,Average Employment Annual,13730.0
18901,2000,Addison County,Private ownership,Average Employment Annual,11752.0
18902,2000,Addison County,Goods Producing domain,Average Employment Annual,3445.0
18903,2000,Addison County,Natural Resources and Mining supersector,Average Employment Annual,538.0
18904,2000,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment Annual,488.0


## Months

There are some little changes that make the monthly section more useful. Renaming columns, changing months to their full name and working with datetime.

In [10]:
# For the monthly table 'Time' holds the month and 'Count' the employment
# figure; rename accordingly, then put the time columns first
monthscnty = monthscnty.rename(columns={'Time': 'Month', 'Count': 'Employment'})
month_column_order = ['Year', 'Month', 'County', 'Type', 'Employment']
monthscnty = monthscnty[month_column_order]
monthscnty.head()

Unnamed: 0,Year,Month,County,Type,Employment
0,2000,Apr,Addison County,Total Covered - all ownerships,13656.0
1,2000,Apr,Addison County,Private ownership,11545.0
2,2000,Apr,Addison County,Goods Producing domain,3366.0
3,2000,Apr,Addison County,Natural Resources and Mining supersector,533.0
4,2000,Apr,Addison County,"Agriculture, forestry, fishing and hunting",484.0


In [11]:
# Maps the three-letter month abbreviations to full month names
months = {'Jan':'January', 'Feb':'February', 'Mar':'March', 'Apr':'April', 'May':'May', 'Jun':'June', 'Jul':'July', 
             'Aug':'August','Sep':'September', 'Oct':'October', 'Nov':'November', 'Dec':'December'}

# Build the frame with the replaced column instead of calling
# `monthscnty['Month'].replace(..., inplace=True)`: the inplace call acts on a
# temporary Series, which warns on pandas 2.x and no longer updates the frame
# once copy-on-write is enabled.
monthscnty = monthscnty.assign(Month=monthscnty['Month'].replace(months))

In [12]:
# List the distinct month labels to confirm every abbreviation was expanded
pd.unique(monthscnty['Month'])

array(['April', 'August', 'December', 'February', 'January', 'July',
       'June', 'March', 'May', 'November', 'October', 'September'], dtype=object)

In [13]:
# Preview the monthly county table after the month names were expanded
monthscnty.head()

Unnamed: 0,Year,Month,County,Type,Employment
0,2000,April,Addison County,Total Covered - all ownerships,13656.0
1,2000,April,Addison County,Private ownership,11545.0
2,2000,April,Addison County,Goods Producing domain,3366.0
3,2000,April,Addison County,Natural Resources and Mining supersector,533.0
4,2000,April,Addison County,"Agriculture, forestry, fishing and hunting",484.0


## Quarters

Originally each quarter's observations were split into columns. Now I want the quarter (1, 2, 3, 4) to be stored as a column, and the type of observation ('Average Employment', 'Average Female Employment', 'Total Wages', 'Average Wage', 'Number of Establishments') to be split into columns. First split out the quarter value, then pivot the observation types into columns.

In [14]:
# Split each 'Time' label of the form "<statistic> Qtr <n>" into its parts.
# The quarter digit is pulled out as a string ("1".."4") in one step, which
# avoids creating a float NaN column and then writing strings into it
# (deprecated in recent pandas). Labels without a " Qtr <n>" suffix get NaN.
quarter_suffix = r' Qtr ([1-4])$'
quarterscnty['Quarter'] = quarterscnty['Time'].str.extract(quarter_suffix, expand=False)

# Remove only a real " Qtr <n>" suffix so 'Time' keeps just the statistic
# name; a label without the suffix is left intact instead of being truncated
quarterscnty['Time'] = quarterscnty['Time'].str.replace(quarter_suffix, '', regex=True)

In [15]:
# Make sure there are not any weird mistakes: after the quarter suffix is
# removed, only the bare statistic names should remain in 'Time'
pd.unique(quarterscnty['Time'])

array(['Average Employment', 'Average Female Employment', 'Average Wage',
       'Number of Establishments', 'Total Wages'], dtype=object)

In [16]:
# 'Time' now holds the statistic name, so call it 'Stat'; then move the
# quarter next to the year
quarterscnty = quarterscnty.rename(columns={'Time': 'Stat'})
quarter_column_order = ['Year', 'Quarter', 'County', 'Type', 'Stat', 'Count']
quarterscnty = quarterscnty[quarter_column_order]

In [17]:
# Spread the statistics into one column each, keyed by year, quarter, county
# and industry type, then turn the key levels back into ordinary columns.
# pivot_table aggregates with its default (mean) if a key occurs more than once.
quarterscnty = (
    quarterscnty
    .pivot_table(values='Count', index=['Year', 'Quarter', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

In [18]:
# Preview the pivoted quarterly county table (one column per statistic)
quarterscnty.head()

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages
0,2000,1,Addison County,Accommodation and food services,869.0,480.0,2901.0,71.0,2521.0
1,2000,1,Addison County,Administrative and waste services,165.0,63.0,5233.0,38.0,863.0
2,2000,1,Addison County,"Agriculture, forestry, fishing and hunting",437.0,107.0,4957.0,61.0,2166.0
3,2000,1,Addison County,"Arts, entertainment, and recreation",91.0,56.0,3601.0,12.0,329.0
4,2000,1,Addison County,Construction supersector,602.0,84.0,8348.0,138.0,5029.0


In [19]:
# Annual county data: cut the trailing " Annual" (7 characters) from each
# label so only the statistic name is left
annualcnty['Time'] = annualcnty['Time'].str.slice(stop=-7)

In [20]:
# 'Time' now holds the statistic name, so call it 'Stat'; the column order
# stays Year, County, Type, Stat, Count
annualcnty = annualcnty.rename(columns={'Time': 'Stat'})
annual_column_order = ['Year', 'County', 'Type', 'Stat', 'Count']
annualcnty = annualcnty[annual_column_order]

In [21]:
# One column per annual statistic, keyed by year, county and industry type;
# the key levels are then restored as ordinary columns
annualcnty = (
    annualcnty
    .pivot_table(values='Count', index=['Year', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

# Vermont Level Data

In [22]:
# Preview the state-level table as loaded, still in wide form
cumllaborvt.head()

Unnamed: 0,Apr,Aug,Average Employment Annual,Average Employment Qtr 1,Average Employment Qtr 2,Average Employment Qtr 3,Average Employment Qtr 4,Average Female Employment Annual,Average Female Employment Qtr 1,Average Female Employment Qtr 2,...,Number of Establishments Qtr 4,Oct,Sep,Total Wages Annual,Total Wages Qtr 1,Total Wages Qtr 2,Total Wages Qtr 3,Total Wages Qtr 4,Type,Year
0,290568,293413,296468,293478,296038,294632,301722,na,na,na,...,23958,301137,299962,8575450,2067514,2128115,2076698,2303123,Total Covered - all ownerships,2000
1,241513,253631,249122,244811,246679,252711,252288,115581,112692,115454,...,22276,251934,251854,7150813,1716961,1738357,1765630,1929865,Private ownership,2000
2,63132,67282,64803,61163,65030,67020,65999,16116,15213,16259,...,4327,66600,66739,2405516,578053,587003,596561,643899,Goods Producing domain,2000
3,3055,3456,3144,2700,3268,3428,3180,852,723,998,...,381,3373,3390,76224,16820,19047,20157,20201,Natural Resources and Mining supersector,2000
4,2117,2448,2206,1910,2297,2433,2185,739,618,882,...,311,2362,2394,43072,9575,10684,11508,11305,"Agriculture, forestry, fishing and hunting",2000


In [23]:
# Go from wide to long: Year/County/Type identify a row, every other column
# header becomes a value of 'Time' and its cell becomes 'Count'
cumllaborvt = cumllaborvt.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborvt.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Vermont,Total Covered - all ownerships,Apr,290568
1,2000,Vermont,Private ownership,Apr,241513
2,2000,Vermont,Goods Producing domain,Apr,63132
3,2000,Vermont,Natural Resources and Mining supersector,Apr,3055
4,2000,Vermont,"Agriculture, forestry, fishing and hunting",Apr,2117


In [24]:
# Placeholder strings that stand for missing values, or for occurrences too
# small to show, are turned into NaN (exact cell matches, any column)
missing_markers = ['(c)', '-', 'na', ' ']
cumllaborvt = cumllaborvt.replace(missing_markers, np.nan)

# '<1' is not treated as missing: it is replaced by the number 1
cumllaborvt = cumllaborvt.replace('<1', 1)

# The counts were stored as strings, so convert the column to a numeric dtype
cumllaborvt['Count'] = pd.to_numeric(cumllaborvt['Count'])

In [25]:
# Build one independent table per kind of time period, selected by 'Time' label
period_labels = cumllaborvt['Time']
monthsvt = cumllaborvt.loc[period_labels.isin(monthlist)].copy()
quartersvt = cumllaborvt.loc[period_labels.isin(quarterslist)].copy()
annualvt = cumllaborvt.loc[period_labels.isin(annuallist)].copy()

In [26]:
# Preview the first rows of the monthly state-level subset
monthsvt.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Vermont,Total Covered - all ownerships,Apr,290568.0
1,2000,Vermont,Private ownership,Apr,241513.0
2,2000,Vermont,Goods Producing domain,Apr,63132.0
3,2000,Vermont,Natural Resources and Mining supersector,Apr,3055.0
4,2000,Vermont,"Agriculture, forestry, fishing and hunting",Apr,2117.0


In [27]:
# Preview the first rows of the quarterly state-level subset
quartersvt.head()

Unnamed: 0,Year,County,Type,Time,Count
1890,2000,Vermont,Total Covered - all ownerships,Average Employment Qtr 1,293478.0
1891,2000,Vermont,Private ownership,Average Employment Qtr 1,244811.0
1892,2000,Vermont,Goods Producing domain,Average Employment Qtr 1,61163.0
1893,2000,Vermont,Natural Resources and Mining supersector,Average Employment Qtr 1,2700.0
1894,2000,Vermont,"Agriculture, forestry, fishing and hunting",Average Employment Qtr 1,1910.0


In [28]:
# Preview the first rows of the annual state-level subset
annualvt.head()

Unnamed: 0,Year,County,Type,Time,Count
1260,2000,Vermont,Total Covered - all ownerships,Average Employment Annual,296468.0
1261,2000,Vermont,Private ownership,Average Employment Annual,249122.0
1262,2000,Vermont,Goods Producing domain,Average Employment Annual,64803.0
1263,2000,Vermont,Natural Resources and Mining supersector,Average Employment Annual,3144.0
1264,2000,Vermont,"Agriculture, forestry, fishing and hunting",Average Employment Annual,2206.0


In [29]:
# Rename the melted columns for the monthly state-level table ('County' is
# called 'Location' here) and put the time columns first
monthsvt.columns = ['Year', 'Location', 'Type', 'Month', 'Employment']
monthsvt = monthsvt[['Year', 'Month','Location', 'Type', 'Employment']]

# Expand the month abbreviations with the `months` mapping defined earlier.
# The result is assigned to a new frame instead of calling
# `monthsvt['Month'].replace(..., inplace=True)`: the inplace call acts on a
# temporary Series, which warns on pandas 2.x and no longer updates the frame
# once copy-on-write is enabled.
monthsvt = monthsvt.assign(Month=monthsvt['Month'].replace(months))

# Preview goes last so the cell actually displays it, with full month names
monthsvt.head()

In [30]:
# Split each 'Time' label of the form "<statistic> Qtr <n>" into its parts.
# The quarter digit is pulled out as a string ("1".."4") in one step, which
# avoids creating a float NaN column and then writing strings into it
# (deprecated in recent pandas). Labels without a " Qtr <n>" suffix get NaN.
quarter_suffix = r' Qtr ([1-4])$'
quartersvt['Quarter'] = quartersvt['Time'].str.extract(quarter_suffix, expand=False)

# Remove only a real " Qtr <n>" suffix so 'Time' keeps just the statistic
# name; a label without the suffix is left intact instead of being truncated
quartersvt['Time'] = quartersvt['Time'].str.replace(quarter_suffix, '', regex=True)

In [31]:
# At state level 'County' becomes 'Location' and 'Time' (now the statistic
# name) becomes 'Stat'; the quarter is then moved next to the year
quartersvt = quartersvt.rename(columns={'County': 'Location', 'Time': 'Stat'})
quarter_column_order = ['Year', 'Quarter', 'Location', 'Type', 'Stat', 'Count']
quartersvt = quartersvt[quarter_column_order]

In [32]:
# Spread the statistics into one column each, keyed by year, quarter, location
# and industry type, then turn the key levels back into ordinary columns
quartersvt = (
    quartersvt
    .pivot_table(values='Count', index=['Year', 'Quarter', 'Location', 'Type'], columns=['Stat'])
    .reset_index()
)

In [33]:
# Cut the trailing " Annual" (7 characters) from each label so only the
# statistic name is left
annualvt['Time'] = annualvt['Time'].str.slice(stop=-7)

In [34]:
# At state level 'County' becomes 'Location' and 'Time' (now the statistic
# name) becomes 'Stat'; the column order is unchanged
annualvt = annualvt.rename(columns={'County': 'Location', 'Time': 'Stat'})
annual_column_order = ['Year', 'Location', 'Type', 'Stat', 'Count']
annualvt = annualvt[annual_column_order]

In [35]:
# One column per annual statistic, keyed by year, location and industry type;
# the key levels are then restored as ordinary columns
annualvt = (
    annualvt
    .pivot_table(values='Count', index=['Year', 'Location', 'Type'], columns=['Stat'])
    .reset_index()
)

In [36]:
# Write the three county-level tables for the next step, without the row index
county_outputs = [
    (monthscnty, "monthlylaborcnty.csv"),
    (quarterscnty, "quarterlylaborcnty.csv"),
    (annualcnty, "annuallaborcnty.csv"),
]
for table, csv_name in county_outputs:
    table.to_csv(csv_name, index=False)

In [37]:
# Write the three state-level tables for the next step, without the row index
state_outputs = [
    (monthsvt, "monthlylaborvt.csv"),
    (quartersvt, "quarterlylaborvt.csv"),
    (annualvt, "annuallaborvt.csv"),
]
for table, csv_name in state_outputs:
    table.to_csv(csv_name, index=False)