# North Carolina IRS Individual Income Tax Statistics by Zip Code 2014
* ZIP Code data shows selected income and tax items classified by State, ZIP Code, and size of adjusted gross income. 
* Data are based on individual income tax returns filed with the IRS and are available for Tax Years 1998, 2001, and 2004 through 2016.
* This data is available at: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-zip-code-data-soi
* We suggest mapping IRS Income Tax data for the tax year that covers a majority of a particular school year. 
* For example, you will find the tax data for 2014 in the 2014-2015 school year folder. This is why the files for NCDPI and IRS data may appear one year off.  
* However, the data is keyed by zip code, so users may merge years however they see fit!  

In [1]:
#import required Libraries
import pandas as pd
import numpy as np
import urllib

#**********************************************************************************
# Set the following variables before running this code!!!
#**********************************************************************************

#Tax year this notebook processes (it is appended to the output csv file name)
taxYear = 2014

#Folder where the downloaded raw data file and the generated csv file are stored.
#Keep the trailing slash: file names are appended to this string directly.
dataDir = 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2015/Raw Datasets/'

### Save Original Copy of the Data

In [2]:
#Download and save an original copy of the raw data
#North Carolina - SOI Tax Stats - Individual Income Tax Statistics - 2014 ZIP Code Data (SOI)
#urllib.URLopener only exists in Python 2, so import urlretrieve from whichever
#location the running interpreter provides; it returns the same
#(local file name, response headers) tuple that URLopener.retrieve did.
try:
    from urllib.request import urlretrieve  #Python 3
except ImportError:
    from urllib import urlretrieve  #Python 2
url = "https://www.irs.gov/pub/irs-soi/14zp34nc.xls"
urlretrieve(url, dataDir + '14zp34nc.xls')

('C:/Users/Jake/Documents/GitHub/EducationDataNC/2015/Raw Datasets/14zp34nc.xls',
 <httplib.HTTPMessage instance at 0x000000000830A908>)

### Clean up the Column Names

In [3]:
#Load the locally saved workbook; all further processing works from this copy.
#The column header spans two rows (header=[0,1]); the 3 rows above it and the
#17 rows at the bottom of the sheet are skipped.
path = dataDir + '14zp34nc.xls'
incomeTaxData = pd.read_excel(
    path,
    header=[0, 1],
    skiprows=3,
    skipfooter=17,
    index_col=None,
)

In [4]:
#Ordered (old text, new text) substitutions applied to every column name.
#The order matters: the long volunteer-return phrases at the end are written
#the way they look AFTER the earlier, shorter substitutions have run.
_COLUMN_NAME_FIXES = (
    #Footnote markers
    ('[2]', ''),
    ('[3]', ''),
    ('[4]', ''),
    ('[5]', ''),
    ('[6]', ''),
    ('[7]', ''),
    ('[8]', ''),
    ('[9]', ''),
    ('[10]', ''),
    ('[11]', ''),
    ('[12]', ''),
    #Line breaks embedded in the header cells
    ('\r', ''),
    ('\n', ''),
    #Abbreviations
    (' Number of returns', ' Ct'),
    ('Amount', 'Amt'),
    ('Total', 'Tot'),
    ('Additional', 'Add'),
    ('additional', 'Add'),
    ('miscellaneous', 'misc'),
    ('education', 'edu'),
    ('  ', ' '),
    #Volunteer prepared return columns
    ('Number of volunteer prepared returns Number of volunteer income tax assistance (VITA) prepared returns',
     '(VITA) prepared returns Ct'),
    ('Number of volunteer prepared returns Number of tax counseling for the elderly (TCE) prepared returns',
     '(TCE) prepared returns Ct'),
    ('Number of volunteer prepared returns Number of volunteer prepared returns withEarned Income Credit',
     'volunteer prepared w Earned Income Credit'),
)


def _shortenColumnName(name):
    """Return name after applying every substitution in _COLUMN_NAME_FIXES, in order."""
    for oldText, newText in _COLUMN_NAME_FIXES:
        name = name.replace(oldText, newText)
    return name


#Flatten the two-level column header into single names joined by a space
incomeTaxData.columns = [' '.join(levels).strip() for levels in incomeTaxData.columns.values]

#Turn the index into a regular column and call it 'Zip Code'
incomeTaxData.reset_index(inplace=True)
incomeTaxData.rename(columns={'index': 'Zip Code'}, inplace=True)

#Cut each name off at the first "Unnamed:" placeholder left over from the
#second header level (names without the placeholder are kept whole)
incomeTaxData.columns = [name.partition("Unnamed:")[0] for name in incomeTaxData.columns.values]

#Shorten the remaining names and trim surrounding whitespace
incomeTaxData.columns = [_shortenColumnName(name).strip() for name in incomeTaxData.columns.values]

### Remove Blank Rows, Non-Zip Code Summary Data, and Masking

In [5]:
#Keep only rows for real zip codes. A row is dropped when its zip code is
#  - missing (blank row),
#  - 0 (state-wide totals), or
#  - 99999 (the "other" category the IRS uses to obfuscate zip codes).
zipCodes = incomeTaxData['Zip Code']
isRealZipCode = zipCodes.notnull() & (zipCodes != 0) & (zipCodes != 99999)
incomeTaxData = incomeTaxData[isRealZipCode]

#Masked values are published as "**"; store them as 0
incomeTaxData = incomeTaxData.replace({"**": 0})

### Clean up Adjusted Gross Income Category Names Before Table Pivot

In [6]:
#Shorten all "Size of adjusted gross income" field values, since
#these will become part of the column names after the table pivot
incomeTaxData.rename(columns={'Size of adjusted gross income' : 'AGI'}, inplace=True)

#Long IRS range label -> short label used in the pivoted column names
agiLabels = {
    '$1 under $25,000': 'LT25K',
    '$25,000 under $50,000': '25KLT50K',
    '$50,000 under $75,000': '50KLT75K',
    '$75,000 under $100,000': '75KLT100K',
    '$100,000 under $200,000': '100KLT200K',
    '$200,000 or more': 'GE200K',
}

#Rows without an AGI range are labelled 'All'; every other label is shortened.
#The column is rebuilt and assigned back in one step instead of being updated
#through chained indexing (incomeTaxData['AGI'][mask] = ...), which writes to a
#temporary object and is not guaranteed to change the DataFrame. Because no
#chained assignment is performed, the copy warning no longer has to be turned off.
#Series.replace with a dict matches whole values literally (no regex), so the
#"$" characters in the labels are safe.
incomeTaxData['AGI'] = incomeTaxData['AGI'].fillna('All').replace(agiLabels)

In [7]:
#Print the full column summary (verbose=True) to check the column names before the pivot
incomeTaxData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5061 entries, 9 to 5791
Data columns (total 125 columns):
Zip Code                                                      float64
AGI                                                           object
Number of returns                                             float64
Number of single returns                                      float64
Number of joint returns                                       float64
Number of head of household returns                           float64
Number with paid preparer's signature                         float64
Number of exemptions                                          float64
Number of dependents                                          float64
Number of volunteer prepared returns Tot                      float64
(VITA) prepared returns Ct                                    float64
(TCE) prepared returns Ct                                     float64
Adjusted gross income (AGI)                          

In [8]:
#Display the cleaned data before the pivot (one row per zip code and AGI range)
incomeTaxData

Unnamed: 0,Zip Code,AGI,Number of returns,Number of single returns,Number of joint returns,Number of head of household returns,Number with paid preparer's signature,Number of exemptions,Number of dependents,Number of volunteer prepared returns Tot,...,Tot tax liability Ct,Tot tax liability Amt,Add Medicare tax Ct,Add Medicare tax Amt,Net investment income tax Ct,Net investment income tax Amt,Tax due at time of filing Ct,Tax due at time of filing Amt,Overpayments refunded Ct,Overpayments refunded Amt
9,27006.0,All,6530.0,2460.0,3470.0,480.0,3790.0,13080.0,3590.0,140.0,...,5330.0,83149.0,250.0,489.0,310.0,805.0,1780.0,11210.0,4280.0,11522.0
10,27006.0,LT25K,1950.0,1400.0,330.0,190.0,1000.0,2230.0,460.0,80.0,...,960.0,801.0,0.0,0.0,0.0,0.0,240.0,178.0,1500.0,2180.0
11,27006.0,25KLT50K,1170.0,540.0,430.0,150.0,630.0,2140.0,540.0,60.0,...,1020.0,2565.0,0.0,0.0,0.0,0.0,230.0,285.0,890.0,2066.0
12,27006.0,50KLT75K,900.0,240.0,560.0,70.0,540.0,1990.0,530.0,0.0,...,860.0,4489.0,0.0,0.0,0.0,0.0,270.0,500.0,590.0,1617.0
13,27006.0,75KLT100K,740.0,120.0,590.0,30.0,460.0,1830.0,500.0,0.0,...,730.0,5981.0,0.0,0.0,0.0,0.0,250.0,671.0,470.0,1417.0
14,27006.0,100KLT200K,1250.0,110.0,1100.0,40.0,760.0,3460.0,1110.0,0.0,...,1240.0,21656.0,0.0,0.0,0.0,0.0,520.0,2339.0,660.0,2551.0
15,27006.0,GE200K,520.0,50.0,460.0,0.0,400.0,1430.0,450.0,0.0,...,520.0,47657.0,250.0,489.0,310.0,805.0,270.0,7237.0,170.0,1691.0
17,27007.0,All,860.0,280.0,500.0,90.0,570.0,1930.0,610.0,30.0,...,670.0,3272.0,0.0,0.0,0.0,0.0,140.0,365.0,710.0,1799.0
18,27007.0,LT25K,320.0,170.0,100.0,50.0,190.0,540.0,170.0,30.0,...,160.0,118.0,0.0,0.0,0.0,0.0,30.0,15.0,270.0,566.0
19,27007.0,25KLT50K,220.0,80.0,120.0,40.0,140.0,490.0,150.0,0.0,...,190.0,444.0,0.0,0.0,0.0,0.0,30.0,50.0,190.0,524.0


### Create Table Pivot Dataset and Adjust Pivot Column Names 
* We create a new view of the IRS data consolidating to one record per zip code. 
* New fields are created for each Adjusted Gross Income ("AGI") range.  
* This creates a total of 862 fields, duplicating each original field in the dataset one time for each individual 
  adjusted gross income range.
* This data is saved as IncomeTaxDataByZipCode_*(taxYear)*.csv 

In [9]:
#Every column except the two pivot keys ('Zip Code' and 'AGI') holds values to pivot
allCols = incomeTaxData.columns.values
isKeyCol = (allCols == 'AGI') | (allCols == 'Zip Code')
valCols = allCols[~isKeyCol]

#One row per zip code; each value column is repeated once per AGI range
incomeTaxData = incomeTaxData.pivot_table(values=valCols, index=['Zip Code'], columns=['AGI'])

#Flatten the resulting (field, AGI range) column pairs into "field AGI" names
incomeTaxData.columns = [' '.join(pair).strip() for pair in incomeTaxData.columns.values]

#Move 'Zip Code' out of the index into a regular column for merges later
incomeTaxData = incomeTaxData.reset_index(level=0)

#Print the full column summary to check the pivoted field names
incomeTaxData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 862 columns):
Zip Code                                                                 float64
(TCE) prepared returns Ct 100KLT200K                                     float64
(TCE) prepared returns Ct 25KLT50K                                       float64
(TCE) prepared returns Ct 50KLT75K                                       float64
(TCE) prepared returns Ct 75KLT100K                                      float64
(TCE) prepared returns Ct All                                            float64
(TCE) prepared returns Ct GE200K                                         float64
(TCE) prepared returns Ct LT25K                                          float64
(VITA) prepared returns Ct 100KLT200K                                    float64
(VITA) prepared returns Ct 25KLT50K                                      float64
(VITA) prepared returns Ct 50KLT75K                                      float64
(VI

## Summarize IRS Tax Data by School District 
**Income tax data counts and amounts are organized by Adjusted Gross Income Ranges within each Zip Code**
* **All** - Income tax data represents the entire zip code 
* **LT25K** - Income tax data represents adjusted gross income from \$1 under \$25,000 within a zip code.
* **25KLT50K** - Income tax data represents adjusted gross income >= \$25,000 and < \$50,000 within a zip code.
* **50KLT75K** - Income tax data represents adjusted gross income >= \$50,000 and < \$75,000 within a zip code.
* **75KLT100K** - Income tax data represents adjusted gross income >= \$75,000 and < \$100,000 within a zip code.
* **100KLT200K** - Income tax data represents adjusted gross income >= \$100,000 and < \$200,000 within a zip code.
* **GE200K** - Income tax data represents adjusted gross income >= \$200,000 within a zip code.

In [10]:
#Display the pivoted data (one row per zip code, one column per field and AGI range)
incomeTaxData

Unnamed: 0,Zip Code,(TCE) prepared returns Ct 100KLT200K,(TCE) prepared returns Ct 25KLT50K,(TCE) prepared returns Ct 50KLT75K,(TCE) prepared returns Ct 75KLT100K,(TCE) prepared returns Ct All,(TCE) prepared returns Ct GE200K,(TCE) prepared returns Ct LT25K,(VITA) prepared returns Ct 100KLT200K,(VITA) prepared returns Ct 25KLT50K,...,Unemployment compensation Amt All,Unemployment compensation Amt GE200K,Unemployment compensation Amt LT25K,Unemployment compensation Ct 100KLT200K,Unemployment compensation Ct 25KLT50K,Unemployment compensation Ct 50KLT75K,Unemployment compensation Ct 75KLT100K,Unemployment compensation Ct All,Unemployment compensation Ct GE200K,Unemployment compensation Ct LT25K
0,27006.0,0.0,20.0,0.0,0.0,50.0,0.0,30.0,0.0,40.0,...,577.0,0.0,230.0,20.0,40.0,40.0,0.0,150.0,0.0,50.0
1,27007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.0,0.0,52.0,0.0,0.0,0.0,0.0,20.0,0.0,20.0
2,27009.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,165.0,0.0,0.0,0.0,40.0,0.0,0.0,40.0,0.0,0.0
3,27011.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,...,144.0,0.0,73.0,0.0,30.0,0.0,0.0,60.0,0.0,30.0
4,27012.0,0.0,30.0,0.0,0.0,80.0,0.0,50.0,0.0,80.0,...,1332.0,0.0,397.0,60.0,90.0,50.0,30.0,340.0,0.0,110.0
5,27013.0,0.0,0.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,...,488.0,0.0,92.0,0.0,40.0,40.0,0.0,120.0,0.0,40.0
6,27016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,27017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,...,271.0,0.0,113.0,0.0,40.0,30.0,0.0,110.0,0.0,40.0
8,27018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,...,333.0,0.0,136.0,0.0,20.0,40.0,0.0,110.0,0.0,50.0
9,27019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,140.0,0.0,77.0,0.0,20.0,0.0,0.0,50.0,0.0,30.0


In [11]:
#Write the pivoted table to <dataDir>IncomeTaxDataByZipCode_<taxYear>.csv
#as comma separated text, without the row index
outputPath = ''.join([dataDir, 'IncomeTaxDataByZipCode_', str(taxYear), '.csv'])
incomeTaxData.to_csv(outputPath, sep=',', index=False)