In [1]:
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

## Information and Instructions

In this notebook the data from step one is transformed into a tidy-data structure and split into monthly, quarterly and annual observations. Additional data-cleaning steps are performed as well, to prepare the data for analysis in the next step.

In [2]:
# Load the cumulative datasets created in step 1
county_csv, state_csv = 'cumllaborcnty.csv', 'cumllaborvt.csv'
cumllaborcnty = pd.read_csv(county_csv)
cumllaborvt = pd.read_csv(state_csv)

## Melting Data

In order to split this out correctly and reorder the data, I first melt it into a few columns so that each monthly, quarterly and annual measurement sits in its own row.

In [3]:
# Go from wide to long: keep Year/County/Type as identifiers and stack every
# other column into a ('Time', 'Count') pair.
cumllaborcnty = cumllaborcnty.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Addison County,Total Covered - all ownerships,Jan,13094
1,2000,Addison County,Private ownership,Jan,11078
2,2000,Addison County,Goods Producing domain,Jan,3259
3,2000,Addison County,Natural Resources and Mining supersector,Jan,459
4,2000,Addison County,"Agriculture, forestry, fishing and hunting",Jan,421


In [4]:
# Missing or suppressed values show up as the placeholders '(c)', '-', 'na'
# and ' '; all of them become NaN, while '<1' is stored as 1.
cumllaborcnty = (
    cumllaborcnty
    .replace(['(c)', '-', 'na', ' '], np.nan)
    .replace('<1', 1)
)

# The counts were stored as strings, so convert the column to a numeric dtype
cumllaborcnty['Count'] = pd.to_numeric(cumllaborcnty['Count'])

In [5]:
# Labels used to split the melted 'Time' column into monthly, quarterly and
# annual observations.
monthlist = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# The five statistics that are reported per quarter and per year.
statlist = ['Average Employment', 'Average Female Employment', 'Total Wages', 'Average Wage',
            'Number of Establishments']

# Build the labels from the statistic names instead of typing all of them out:
# the hand-written list contained 'Total Wages 2', 'Total Wages 3',
# 'Total Wages 4' and 'Number of EstablishmentsQtr 4', which never match the
# '<Stat> Qtr <n>' column names, so those rows were silently dropped by isin().
quarterslist = ['{} Qtr {}'.format(stat, qtr) for stat in statlist for qtr in range(1, 5)]

annuallist = ['{} Annual'.format(stat) for stat in statlist]

In [6]:
# Partition the long county table into three independent frames according to
# the kind of period named in 'Time'.
period_label = cumllaborcnty['Time']
monthscnty = cumllaborcnty.loc[period_label.isin(monthlist)].copy()
quarterscnty = cumllaborcnty.loc[period_label.isin(quarterslist)].copy()
annualcnty = cumllaborcnty.loc[period_label.isin(annuallist)].copy()

In [7]:
# Preview the first rows of the monthly county subset
monthscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Addison County,Total Covered - all ownerships,Jan,13094.0
1,2000,Addison County,Private ownership,Jan,11078.0
2,2000,Addison County,Goods Producing domain,Jan,3259.0
3,2000,Addison County,Natural Resources and Mining supersector,Jan,459.0
4,2000,Addison County,"Agriculture, forestry, fishing and hunting",Jan,421.0


In [8]:
# Preview the first rows of the quarterly county subset
quarterscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
114156,2000,Addison County,Total Covered - all ownerships,Average Employment Qtr 1,13162.0
114157,2000,Addison County,Private ownership,Average Employment Qtr 1,11121.0
114158,2000,Addison County,Goods Producing domain,Average Employment Qtr 1,3270.0
114159,2000,Addison County,Natural Resources and Mining supersector,Average Employment Qtr 1,474.0
114160,2000,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment Qtr 1,437.0


In [9]:
# Preview the first rows of the annual county subset
annualcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
152208,2000,Addison County,Total Covered - all ownerships,Average Employment Annual,13730.0
152209,2000,Addison County,Private ownership,Average Employment Annual,11752.0
152210,2000,Addison County,Goods Producing domain,Average Employment Annual,3445.0
152211,2000,Addison County,Natural Resources and Mining supersector,Average Employment Annual,538.0
152212,2000,Addison County,"Agriculture, forestry, fishing and hunting",Average Employment Annual,488.0


## Quarters

Originally each quarter's observations were split into columns. Now I want the quarter (1, 2, 3, 4) to be stored as a column, and the type of observation ('Average Employment', 'Average Female Employment', 'Total Wages', 'Average Wage', 'Number of Establishments') to be split into columns. First split out the quarter value, then pivot the observation types into their own columns.

In [10]:
# Pull the quarter digit out of labels such as 'Average Wage Qtr 3' into a
# 'Quarter' column (as the strings "1".."4"; NaN when a label does not end in
# 'Qtr <n>'). Extracting it in one step avoids creating a float NaN column and
# then writing strings into it, which pandas deprecates as an
# incompatible-dtype assignment.
quarterscnty['Quarter'] = quarterscnty['Time'].str.extract(r'Qtr ([1-4])$', expand=False)

# Drop the trailing ' Qtr <n>' (6 characters) so 'Time' keeps only the statistic name
quarterscnty['Time'] = quarterscnty['Time'].str[:-6]

In [13]:
# Sanity check: after stripping the quarter suffix, 'Time' should contain only
# the plain statistic names, with no truncated or leftover labels.
pd.unique(quarterscnty['Time'])

array(['Average Employment', 'Average Female Employment', 'Total Wages',
       'Average Wage', 'Number of Establishments'], dtype=object)

In [14]:
# 'Time' now holds the statistic name, so call it 'Stat', and move 'Quarter'
# next to 'Year'.
quarterscnty = quarterscnty.rename(columns={'Time': 'Stat'})
quarterscnty = quarterscnty.loc[:, ['Year', 'Quarter', 'County', 'Type', 'Stat', 'Count']]

In [15]:
# Spread each statistic into its own column, giving one row per
# Year/Quarter/County/Type, then turn the index levels back into columns.
# (pivot_table aggregates with its default function if a key repeats.)
quarterscnty = (
    quarterscnty
    .pivot_table(values='Count', index=['Year', 'Quarter', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

In [16]:
# Preview the pivoted quarterly table: one column per statistic
quarterscnty.head()

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages
0,2000,1,Addison County,Accommodation and food services,869.0,480.0,2901.0,71.0,2521.0
1,2000,1,Addison County,Administrative and waste services,165.0,63.0,5233.0,38.0,863.0
2,2000,1,Addison County,"Agriculture, forestry, fishing and hunting",437.0,107.0,4957.0,61.0,2166.0
3,2000,1,Addison County,"Arts, entertainment, and recreation",91.0,56.0,3601.0,12.0,329.0
4,2000,1,Addison County,Construction supersector,602.0,84.0,8348.0,138.0,5029.0


In [19]:
# test2 is another name for quarterscnty (not a copy), so the derived columns
# below are added to quarterscnty as well.
test2 = quarterscnty
total_employment = test2['Average Employment']
female_employment = test2['Average Female Employment']

# Male employment is the remainder once female employment is subtracted
test2['Average Male Employment'] = total_employment - female_employment

# Each gender's share of total average employment
test2['Perc Male Employment'] = test2['Average Male Employment'] / total_employment
test2['Perc Female Employment'] = female_employment / total_employment

In [20]:
# Show the first five rows, ordered by female employment share (descending).
# NOTE(review): head() runs before sort_values(), so only those five rows are
# sorted - this is not the top five of the whole frame. Confirm that is intended.
test2.head().sort_values(by=['Perc Female Employment'], ascending=False)

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages,Average Male Employment,Perc Male Employment,Perc Female Employment
3,2000,1,Addison County,"Arts, entertainment, and recreation",91.0,56.0,3601.0,12.0,329.0,35.0,0.384615,0.615385
0,2000,1,Addison County,Accommodation and food services,869.0,480.0,2901.0,71.0,2521.0,389.0,0.447641,0.552359
1,2000,1,Addison County,Administrative and waste services,165.0,63.0,5233.0,38.0,863.0,102.0,0.618182,0.381818
2,2000,1,Addison County,"Agriculture, forestry, fishing and hunting",437.0,107.0,4957.0,61.0,2166.0,330.0,0.755149,0.244851
4,2000,1,Addison County,Construction supersector,602.0,84.0,8348.0,138.0,5029.0,518.0,0.860465,0.139535


In [21]:
# Sort the monthly observations by county, year and calendar month.
# sort_values returns a new frame, so the result must be assigned back -
# otherwise the sort is thrown away. Sorting on the raw 'Time' strings would
# order the months alphabetically (Apr, Aug, Dec, ...), so rank each month by
# its position in monthlist and sort on that temporary column instead.
month_rank = monthscnty['Time'].map({month: pos for pos, month in enumerate(monthlist)})
monthscnty = (
    monthscnty
    .assign(_month_rank=month_rank)
    .sort_values(by=['County', 'Year', '_month_rank'])
    .drop(columns='_month_rank')
)
monthscnty.shape

(114156, 5)

In [22]:
# Preview the statewide table, still in its wide layout (one column per
# month, quarter statistic and annual statistic)
cumllaborvt.head()

Unnamed: 0,Year,County,Type,Jan,Feb,Mar,Apr,May,Jun,Jul,...,Average Wage Qtr 1,Average Wage Qtr 2,Average Wage Qtr 3,Average Wage Qtr 4,Average Wage Annual,Number of Establishments Qtr 1,Number of Establishments Qtr 2,Number of Establishments Qtr 3,Number of Establishments Qtr 4,Number of Establishments Annual
0,2000,Vermont,Total Covered - all ownerships,292696,292997,294742,290568,296122,301424,290522,...,7045,7189,7048,7633,28925,23542,23802,23920,23958,23806
1,2000,Vermont,Private ownership,244877,244645,244912,241513,245675,252850,252648,...,7013,7047,6987,7649,28704,21872,22132,22238,22276,22130
2,2000,Vermont,Goods Producing domain,61219,60802,61468,63132,64940,67019,67040,...,9451,9027,8901,9756,37120,4218,4305,4337,4327,4297
3,2000,Vermont,Natural Resources and Mining supersector,2646,2640,2814,3055,3280,3469,3439,...,6229,5828,5880,6353,24244,382,386,383,381,383
4,2000,Vermont,"Agriculture, forestry, fishing and hunting",1870,1891,1969,2117,2300,2474,2457,...,5013,4651,4730,5175,19523,313,316,314,311,314


In [23]:
# Go from wide to long: keep Year/County/Type as identifiers and stack every
# other column into a ('Time', 'Count') pair.
cumllaborvt = cumllaborvt.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborvt.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2000,Vermont,Total Covered - all ownerships,Jan,292696
1,2000,Vermont,Private ownership,Jan,244877
2,2000,Vermont,Goods Producing domain,Jan,61219
3,2000,Vermont,Natural Resources and Mining supersector,Jan,2646
4,2000,Vermont,"Agriculture, forestry, fishing and hunting",Jan,1870


In [24]:
# Missing or suppressed values show up as the placeholders '(c)', '-', 'na'
# and ' '; all of them become NaN, while '<1' is stored as 1.
cumllaborvt = (
    cumllaborvt
    .replace(['(c)', '-', 'na', ' '], np.nan)
    .replace('<1', 1)
)

# The counts were stored as strings, so convert the column to a numeric dtype
cumllaborvt['Count'] = pd.to_numeric(cumllaborvt['Count'])

In [25]:
# Partition the long statewide table into three independent frames according
# to the kind of period named in 'Time'.
period_label = cumllaborvt['Time']
monthsvt = cumllaborvt.loc[period_label.isin(monthlist)].copy()
quartersvt = cumllaborvt.loc[period_label.isin(quarterslist)].copy()
annualvt = cumllaborvt.loc[period_label.isin(annuallist)].copy()

In [26]:
# Pull the quarter digit out of labels such as 'Average Wage Qtr 3' into a
# 'Quarter' column (as the strings "1".."4"; NaN when a label does not end in
# 'Qtr <n>'). Extracting it in one step avoids creating a float NaN column and
# then writing strings into it, which pandas deprecates as an
# incompatible-dtype assignment.
quartersvt['Quarter'] = quartersvt['Time'].str.extract(r'Qtr ([1-4])$', expand=False)

# Drop the trailing ' Qtr <n>' (6 characters) so 'Time' keeps only the statistic name
quartersvt['Time'] = quartersvt['Time'].str[:-6]

In [27]:
# 'Time' now holds the statistic name, so call it 'Stat', and move 'Quarter'
# next to 'Year'.
quartersvt = quartersvt.rename(columns={'Time': 'Stat'})
quartersvt = quartersvt.loc[:, ['Year', 'Quarter', 'County', 'Type', 'Stat', 'Count']]

In [28]:
# Spread each statistic into its own column, giving one row per
# Year/Quarter/County/Type, then turn the index levels back into columns.
# (pivot_table aggregates with its default function if a key repeats.)
quartersvt = (
    quartersvt
    .pivot_table(values='Count', index=['Year', 'Quarter', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

In [29]:
# Disabled CSV exports of the county-level results.
# NOTE(review): the names months / quarters / annuals are not defined in the
# cells above; the county frames built there are monthscnty, quarterscnty and
# annualcnty. Update the names before re-enabling these lines.
#months.to_csv("monthlylaborcnty.csv", index=False)
#quarters.to_csv("quarterlylaborcnty.csv", index=False)
#annuals.to_csv("annuallaborcnty.csv", index=False)