In [1]:
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

from time import strptime
import datetime

## Information and Instructions

In this notebook the data from step one is transformed into a tidy data structure, split into monthly, quarterly and annual observations. Additional data cleaning steps are performed as well, to prepare the data for analysis in the next step.

In [5]:
# Load the cumulative county-level and Vermont-level datasets created in step 1
cumllaborcnty, cumllaborvt = (
    pd.read_csv(csv_name) for csv_name in ('cumllaborcnty.csv', 'cumllaborvt.csv')
)

# County

## Melting Data

In order to split this out correctly and reorder the data, I first melt it into a few columns so that every monthly, quarterly and annual measurement sits on its own row.

In [6]:
# Unpivot every measurement column into (Time, Count) pairs,
# keeping Year / County / Type as the identifying columns
cumllaborcnty = cumllaborcnty.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Brattleboro Labor Market Area,Brattleboro Labor Market Area,Apr,
1,2017,Brattleboro Labor Market Area,Total Covered - all ownerships,Apr,20507.0
2,2017,Brattleboro Labor Market Area,Private ownership,Apr,17492.0
3,2017,Brattleboro Labor Market Area,Goods Producing domain,Apr,3556.0
4,2017,Brattleboro Labor Market Area,Natural Resources and Mining supersector,Apr,323.0


In [7]:
# Placeholder strings mark values that are missing or too small to show;
# convert all of them to NaN in one pass
cumllaborcnty = cumllaborcnty.replace(['(c)', '-', 'na', ' '], np.nan)
# '<1' is replaced with the number 1
cumllaborcnty = cumllaborcnty.replace('<1', 1)

# Count was stored as strings, so convert it to a numeric dtype
cumllaborcnty['Count'] = pd.to_numeric(cumllaborcnty['Count'])
#cumllaborcnty['County'] = cumllaborcnty['County'].str[:-7]

In [8]:
# Labels found in the melted 'Time' column, grouped by the kind of period they
# describe. They are used with .isin() to split the data into monthly,
# quarterly and annual frames, so every label must match a source column name
# exactly -- a misspelled label silently drops that measurement.
monthlist = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Statistics that are reported both per quarter and per year
statnames = ['Average Employment', 'Average Female Employment', 'Total Wages',
             'Average Wage', 'Number of Establishments']

# Generated as '<stat> Qtr <n>' so that every label carries the same ' Qtr n'
# suffix (the later cells rely on that fixed 6-character suffix)
quarterslist = ['{} Qtr {}'.format(stat, qtr) for stat in statnames for qtr in range(1, 5)]

# Generated as '<stat> Annual'
annuallist = ['{} Annual'.format(stat) for stat in statnames]

In [9]:
# Partition the melted county rows into three independent frames according to
# the kind of period named in 'Time'
period_label = cumllaborcnty['Time']
monthscnty = cumllaborcnty.loc[period_label.isin(monthlist)].copy()
quarterscnty = cumllaborcnty.loc[period_label.isin(quarterslist)].copy()
annualcnty = cumllaborcnty.loc[period_label.isin(annuallist)].copy()

In [10]:
# Preview the first rows of the monthly county subset
monthscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Brattleboro Labor Market Area,Brattleboro Labor Market Area,Apr,
1,2017,Brattleboro Labor Market Area,Total Covered - all ownerships,Apr,20507.0
2,2017,Brattleboro Labor Market Area,Private ownership,Apr,17492.0
3,2017,Brattleboro Labor Market Area,Goods Producing domain,Apr,3556.0
4,2017,Brattleboro Labor Market Area,Natural Resources and Mining supersector,Apr,323.0


In [11]:
# Preview the first rows of the quarterly county subset
quarterscnty.head()

Unnamed: 0,Year,County,Type,Time,Count
42834,2017,Brattleboro Labor Market Area,Brattleboro Labor Market Area,Average Employment Qtr 1,
42835,2017,Brattleboro Labor Market Area,Total Covered - all ownerships,Average Employment Qtr 1,21029.0
42836,2017,Brattleboro Labor Market Area,Private ownership,Average Employment Qtr 1,18111.0
42837,2017,Brattleboro Labor Market Area,Goods Producing domain,Average Employment Qtr 1,3490.0
42838,2017,Brattleboro Labor Market Area,Natural Resources and Mining supersector,Average Employment Qtr 1,311.0


In [12]:
# Preview the first rows of the annual county subset
annualcnty.head()

Unnamed: 0,Year,County,Type,Time,Count
28556,2017,Brattleboro Labor Market Area,Brattleboro Labor Market Area,Average Employment Annual,
28557,2017,Brattleboro Labor Market Area,Total Covered - all ownerships,Average Employment Annual,20665.0
28558,2017,Brattleboro Labor Market Area,Private ownership,Average Employment Annual,17839.0
28559,2017,Brattleboro Labor Market Area,Goods Producing domain,Average Employment Annual,3630.0
28560,2017,Brattleboro Labor Market Area,Natural Resources and Mining supersector,Average Employment Annual,341.0


## Months

There are some little changes that make the monthly section more useful. Renaming columns, changing months to their full name and working with datetime.

In [13]:
# Monthly values are employment counts: give 'Time' and 'Count' descriptive
# names, then put the time columns ahead of the identifiers
monthscnty = monthscnty.rename(columns={'Time': 'Month', 'Count': 'Employment'})
monthscnty = monthscnty.loc[:, ['Year', 'Month', 'County', 'Type', 'Employment']]
monthscnty.head()

Unnamed: 0,Year,Month,County,Type,Employment
0,2017,Apr,Brattleboro Labor Market Area,Brattleboro Labor Market Area,
1,2017,Apr,Brattleboro Labor Market Area,Total Covered - all ownerships,20507.0
2,2017,Apr,Brattleboro Labor Market Area,Private ownership,17492.0
3,2017,Apr,Brattleboro Labor Market Area,Goods Producing domain,3556.0
4,2017,Apr,Brattleboro Labor Market Area,Natural Resources and Mining supersector,323.0


In [14]:
# Translate each abbreviated month name (e.g. 'Apr') into its calendar month number
monthscnty['Month_Number'] = [
    strptime(abbrev, '%b').tm_mon for abbrev in monthscnty['Month']
]

In [15]:
# Build the first-of-month date for every row from its year and month number.
# pd.to_datetime assembles dates from columns named year / month / day.
# rename() and assign() each return a new frame, so nothing is written into a
# slice of monthscnty and no SettingWithCopyWarning is raised.
month_start = pd.to_datetime(
    monthscnty[['Year', 'Month_Number']]
    .rename(columns={'Year': 'year', 'Month_Number': 'month'})
    .assign(day=1)
)
monthscnty['Start'] = month_start

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
# Check the first ten rows now that Month_Number and Start have been added
monthscnty.head(10)

Unnamed: 0,Year,Month,County,Type,Employment,Month_Number,Start
0,2017,Apr,Brattleboro Labor Market Area,Brattleboro Labor Market Area,,4,2017-04-01
1,2017,Apr,Brattleboro Labor Market Area,Total Covered - all ownerships,20507.0,4,2017-04-01
2,2017,Apr,Brattleboro Labor Market Area,Private ownership,17492.0,4,2017-04-01
3,2017,Apr,Brattleboro Labor Market Area,Goods Producing domain,3556.0,4,2017-04-01
4,2017,Apr,Brattleboro Labor Market Area,Natural Resources and Mining supersector,323.0,4,2017-04-01
5,2017,Apr,Brattleboro Labor Market Area,"Agriculture, forestry, fishing and hunting",,4,2017-04-01
6,2017,Apr,Brattleboro Labor Market Area,Mining,,4,2017-04-01
7,2017,Apr,Brattleboro Labor Market Area,Construction supersector,959.0,4,2017-04-01
8,2017,Apr,Brattleboro Labor Market Area,Manufacturing supersector,2274.0,4,2017-04-01
9,2017,Apr,Brattleboro Labor Market Area,Durable Goods manufacturing,1378.0,4,2017-04-01


In [17]:
# List the distinct month labels present in the monthly data
monthscnty['Month'].unique()

array(['Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May',
       'Nov', 'Oct', 'Sep'], dtype=object)

In [18]:
# Final column order: time fields first, then location, type and the measurement
monthscnty = monthscnty.loc[
    :, ['Year', 'Month', 'Month_Number', 'Start', 'County', 'Type', 'Employment']
]

In [19]:
# Preview the reordered monthly county frame
monthscnty.head()

Unnamed: 0,Year,Month,Month_Number,Start,County,Type,Employment
0,2017,Apr,4,2017-04-01,Brattleboro Labor Market Area,Brattleboro Labor Market Area,
1,2017,Apr,4,2017-04-01,Brattleboro Labor Market Area,Total Covered - all ownerships,20507.0
2,2017,Apr,4,2017-04-01,Brattleboro Labor Market Area,Private ownership,17492.0
3,2017,Apr,4,2017-04-01,Brattleboro Labor Market Area,Goods Producing domain,3556.0
4,2017,Apr,4,2017-04-01,Brattleboro Labor Market Area,Natural Resources and Mining supersector,323.0


## Quarters

Originally each quarter's observations were split into columns. Now I want the quarter (1, 2, 3, 4) to be stored as a column, and the type of observation ('Average Employment', 'Average Female Employment', 'Total Wages', 'Average Wage', 'Number of Establishments') to be split into columns. First split out the quarter value, then pivot the observation types into their own columns.

In [20]:
# Split each 'Time' label (e.g. 'Average Wage Qtr 3') into the quarter digit
# and the statistic name.
# The digit is extracted straight from the label rather than pre-filling the
# column with float NaN and writing strings into it afterwards; that pattern is
# an incompatible-dtype assignment which recent pandas versions deprecate.
# Labels that do not end in 'Qtr 1'..'Qtr 4' get NaN, as before.
quarter_label = quarterscnty['Time']
quarterscnty['Quarter'] = quarter_label.str.extract(r'Qtr ([1-4])$', expand=False)
# Drop the trailing ' Qtr n' (6 characters) so only the statistic name remains
quarterscnty['Time'] = quarter_label.str[:-6]

In [21]:
# Sanity check: only clean statistic names should remain after trimming the suffix
quarterscnty['Time'].unique()

array(['Average Employment', 'Average Female Employment', 'Average Wage',
       'Number of Establishments', 'Total Wages'], dtype=object)

In [22]:
# 'Time' now holds the statistic name, so call it 'Stat'; move Quarter next to Year
quarterscnty = quarterscnty.rename(columns={'Time': 'Stat'})
quarterscnty = quarterscnty.loc[:, ['Year', 'Quarter', 'County', 'Type', 'Stat', 'Count']]

In [23]:
# Spread the statistics into one column each (one row per year, quarter,
# county and type), then turn the index levels back into ordinary columns
quarterscnty = (
    quarterscnty
    .pivot_table(values='Count', index=['Year', 'Quarter', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

In [24]:
# Preview the pivoted quarterly county frame
quarterscnty.head()

Stat,Year,Quarter,County,Type,Average Employment,Average Female Employment,Average Wage,Number of Establishments,Total Wages
0,2000,1,All Other Labor Market Area,Accommodation and food services,,,,7.0,
1,2000,1,All Other Labor Market Area,Administrative and waste services,433.0,127.0,5543.0,75.0,2398.0
2,2000,1,All Other Labor Market Area,"Agriculture, forestry, fishing and hunting",,,,2.0,
3,2000,1,All Other Labor Market Area,All Other Labor Market Area,,,,,
4,2000,1,All Other Labor Market Area,"Arts, entertainment, and recreation",,,,5.0,


In [25]:
# Drop the trailing ' Annual' (7 characters) so only the statistic name remains
annualcnty['Time'] = annualcnty['Time'].str.slice(stop=-7)

In [26]:
# 'Time' now holds the statistic name, so call it 'Stat'
annualcnty = annualcnty.rename(columns={'Time': 'Stat'})
annualcnty = annualcnty.loc[:, ['Year', 'County', 'Type', 'Stat', 'Count']]

In [27]:
# Spread the statistics into one column each (one row per year, county and
# type), then turn the index levels back into ordinary columns
annualcnty = (
    annualcnty
    .pivot_table(values='Count', index=['Year', 'County', 'Type'], columns=['Stat'])
    .reset_index()
)

# Vermont Level Data

In [28]:
# Preview the Vermont-level data as loaded, before melting
cumllaborvt.head()

Unnamed: 0,Apr,Aug,Average Employment Annual,Average Employment Qtr 1,Average Employment Qtr 2,Average Employment Qtr 3,Average Employment Qtr 4,Average Female Employment Annual,Average Female Employment Qtr 1,Average Female Employment Qtr 2,...,Number of Establishments Qtr 4,Oct,Sep,Total Wages Annual,Total Wages Qtr 1,Total Wages Qtr 2,Total Wages Qtr 3,Total Wages Qtr 4,Type,Year
0,304895,310528,309326,305860,309129,310104,312210,,,,...,25666,311584,310871,14264598,3536580,3495776,3502935,3729307,Total Covered - all ownerships,2017
1,250471,261522,256295,252197,254911,260313,257760,,,,...,24179,257081,257480,11568969,2847786,2791423,2873155,3056605,Private ownership,2017
2,47261,50527,48617,45978,48921,50323,49245,,,,...,4561,49905,49883,2618235,625176,646155,658927,687976,Goods Producing domain,2017
3,3783,4201,3900,3518,3985,4210,3888,,,,...,553,4071,4148,145193,32223,36386,37829,38755,Natural Resources and Mining supersector,2017
4,3170,3539,3285,2981,3346,3551,3260,,,,...,495,3422,3494,108446,24152,27134,28323,28838,"Agriculture, forestry, fishing and hunting",2017


In [29]:
# Unpivot every measurement column into (Time, Count) pairs,
# keeping Year / County / Type as the identifying columns
cumllaborvt = cumllaborvt.melt(
    id_vars=['Year', 'County', 'Type'],
    var_name='Time',
    value_name='Count',
)
cumllaborvt.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Vermont,Total Covered - all ownerships,Apr,304895
1,2017,Vermont,Private ownership,Apr,250471
2,2017,Vermont,Goods Producing domain,Apr,47261
3,2017,Vermont,Natural Resources and Mining supersector,Apr,3783
4,2017,Vermont,"Agriculture, forestry, fishing and hunting",Apr,3170


In [30]:
# Placeholder strings mark values that are missing or too small to show;
# convert all of them to NaN in one pass
cumllaborvt = cumllaborvt.replace(['(c)', '-', 'na', ' '], np.nan)
# '<1' is replaced with the number 1
cumllaborvt = cumllaborvt.replace('<1', 1)

# Count was stored as strings, so convert it to a numeric dtype
cumllaborvt['Count'] = pd.to_numeric(cumllaborvt['Count'])

In [31]:
# Partition the melted Vermont rows into three independent frames according to
# the kind of period named in 'Time'
period_label = cumllaborvt['Time']
monthsvt = cumllaborvt.loc[period_label.isin(monthlist)].copy()
quartersvt = cumllaborvt.loc[period_label.isin(quarterslist)].copy()
annualvt = cumllaborvt.loc[period_label.isin(annuallist)].copy()

In [32]:
# Preview the first rows of the monthly Vermont subset
monthsvt.head()

Unnamed: 0,Year,County,Type,Time,Count
0,2017,Vermont,Total Covered - all ownerships,Apr,304895.0
1,2017,Vermont,Private ownership,Apr,250471.0
2,2017,Vermont,Goods Producing domain,Apr,47261.0
3,2017,Vermont,Natural Resources and Mining supersector,Apr,3783.0
4,2017,Vermont,"Agriculture, forestry, fishing and hunting",Apr,3170.0


In [33]:
# Preview the first rows of the quarterly Vermont subset
quartersvt.head()

Unnamed: 0,Year,County,Type,Time,Count
1893,2017,Vermont,Total Covered - all ownerships,Average Employment Qtr 1,305860.0
1894,2017,Vermont,Private ownership,Average Employment Qtr 1,252197.0
1895,2017,Vermont,Goods Producing domain,Average Employment Qtr 1,45978.0
1896,2017,Vermont,Natural Resources and Mining supersector,Average Employment Qtr 1,3518.0
1897,2017,Vermont,"Agriculture, forestry, fishing and hunting",Average Employment Qtr 1,2981.0


In [34]:
# Preview the first rows of the annual Vermont subset
annualvt.head()

Unnamed: 0,Year,County,Type,Time,Count
1262,2017,Vermont,Total Covered - all ownerships,Average Employment Annual,309326.0
1263,2017,Vermont,Private ownership,Average Employment Annual,256295.0
1264,2017,Vermont,Goods Producing domain,Average Employment Annual,48617.0
1265,2017,Vermont,Natural Resources and Mining supersector,Average Employment Annual,3900.0
1266,2017,Vermont,"Agriculture, forestry, fishing and hunting",Average Employment Annual,3285.0


In [35]:
# Monthly values are employment counts: rename 'County' to 'Location' and give
# 'Time' / 'Count' descriptive names, then put the time columns first
monthsvt = monthsvt.rename(
    columns={'County': 'Location', 'Time': 'Month', 'Count': 'Employment'}
)
monthsvt = monthsvt.loc[:, ['Year', 'Month', 'Location', 'Type', 'Employment']]
monthsvt.head()

Unnamed: 0,Year,Month,Location,Type,Employment
0,2017,Apr,Vermont,Total Covered - all ownerships,304895.0
1,2017,Apr,Vermont,Private ownership,250471.0
2,2017,Apr,Vermont,Goods Producing domain,47261.0
3,2017,Apr,Vermont,Natural Resources and Mining supersector,3783.0
4,2017,Apr,Vermont,"Agriculture, forestry, fishing and hunting",3170.0


In [36]:

# Translate each abbreviated month name (e.g. 'Apr') into its calendar month number
monthsvt['Month_Number'] = [
    strptime(abbrev, '%b').tm_mon for abbrev in monthsvt['Month']
]

# Build the first-of-month date for every Vermont row.
# pd.to_datetime assembles dates from columns named year / month / day.
# rename() and assign() each return a new frame, so nothing is written into a
# slice of monthsvt and no SettingWithCopyWarning is raised.
vt_month_start = pd.to_datetime(
    monthsvt[['Year', 'Month_Number']]
    .rename(columns={'Year': 'year', 'Month_Number': 'month'})
    .assign(day=1)
)

# Store the dates on the Vermont frame. Writing them to monthscnty would
# overwrite the county 'Start' column with index-aligned Vermont dates and
# leave monthsvt without a 'Start' column.
monthsvt['Start'] = vt_month_start

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [37]:
# Put the Vermont monthly frame into its final column order. This cell sits in
# the Vermont section, so it targets monthsvt with its 'Location' column; the
# county frame was already reordered in the county section.
# Only columns actually present are selected, so the cell runs whether or not
# 'Start' has been added to this frame.
vt_month_order = ['Year', 'Month', 'Month_Number', 'Start', 'Location', 'Type', 'Employment']
monthsvt = monthsvt[[col for col in vt_month_order if col in monthsvt.columns]]

In [38]:
# Split each 'Time' label (e.g. 'Average Wage Qtr 3') into the quarter digit
# and the statistic name.
# The digit is extracted straight from the label rather than pre-filling the
# column with float NaN and writing strings into it afterwards; that pattern is
# an incompatible-dtype assignment which recent pandas versions deprecate.
# Labels that do not end in 'Qtr 1'..'Qtr 4' get NaN, as before.
quarter_label = quartersvt['Time']
quartersvt['Quarter'] = quarter_label.str.extract(r'Qtr ([1-4])$', expand=False)
# Drop the trailing ' Qtr n' (6 characters) so only the statistic name remains
quartersvt['Time'] = quarter_label.str[:-6]

In [39]:
# Rename 'County' to 'Location' and 'Time' (now the statistic name) to 'Stat';
# move Quarter next to Year
quartersvt = quartersvt.rename(columns={'County': 'Location', 'Time': 'Stat'})
quartersvt = quartersvt.loc[:, ['Year', 'Quarter', 'Location', 'Type', 'Stat', 'Count']]

In [40]:
# Spread the statistics into one column each (one row per year, quarter,
# location and type), then turn the index levels back into ordinary columns
quartersvt = (
    quartersvt
    .pivot_table(values='Count', index=['Year', 'Quarter', 'Location', 'Type'], columns=['Stat'])
    .reset_index()
)

In [41]:
# Drop the trailing ' Annual' (7 characters) so only the statistic name remains
annualvt['Time'] = annualvt['Time'].str.slice(stop=-7)

In [42]:
# Rename 'County' to 'Location' and 'Time' (now the statistic name) to 'Stat'
annualvt = annualvt.rename(columns={'County': 'Location', 'Time': 'Stat'})
annualvt = annualvt.loc[:, ['Year', 'Location', 'Type', 'Stat', 'Count']]

In [43]:
# Spread the statistics into one column each (one row per year, location and
# type), then turn the index levels back into ordinary columns
annualvt = (
    annualvt
    .pivot_table(values='Count', index=['Year', 'Location', 'Type'], columns=['Stat'])
    .reset_index()
)

In [44]:
# Write the three county-level tables to CSV without the index column
for county_frame, csv_path in (
    (monthscnty, "monthlylaborcnty.csv"),
    (quarterscnty, "quarterlylaborcnty.csv"),
    (annualcnty, "annuallaborcnty.csv"),
):
    county_frame.to_csv(csv_path, index=False)

In [45]:
# Write the three Vermont-level tables to CSV without the index column
for vt_frame, csv_path in (
    (monthsvt, "monthlylaborvt.csv"),
    (quartersvt, "quarterlylaborvt.csv"),
    (annualvt, "annuallaborvt.csv"),
):
    vt_frame.to_csv(csv_path, index=False)