## Term Project Milestone 3
### Course: DSC 540
### Author: Holly Figueroa
### Objective: Transformations of website data source (State Spending sourced by Kaiser Family Foundation)

In [1]:
# Load libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
import numpy as np

In [2]:
# Create url variable for webpage (print view of the KFF state-spending indicator, sorted by location)
url = 'https://www.kff.org/other/state-indicator/distribution-of-state-spending/?dataView=0&currentTimeframe=0&print=true&sortModel={"colId":"Location","sort":"asc"}'

# Request data from the url
# A timeout keeps the notebook from hanging indefinitely if the server never responds
page = requests.get(url, timeout=30)

# Fail here with a clear HTTP error instead of parsing an error page in later cells
page.raise_for_status()

# Create bs4 object to better view and identify page contents
soup = BeautifulSoup(page.content, 'html.parser')

Using the browser inspector, I found the table items to be spread across a number of deeply nested tags, and my attempts to reach them failed. By chance, I noticed a dictionary containing all the table data nested within a script tag, so I will use this to create my table.

In [3]:
# Collect every <script> tag on the page; one of them embeds the table data
soup1 = soup.find_all('script')

# Create a print output that will help identify the index of the desired script tag html
# enumerate() pairs each tag with its list index, so no hand-maintained counter is needed
for count, item in enumerate(soup1):
    # Separator line and index first, then the full contents of the tag
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n", count)
    print(item)

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 0
<script type="text/javascript"> if (!window.console) console = {log: function() {}}; </script>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 1
<script type="text/javascript">
	var kff_context = { id: 32150, type: "post", post_type: "state-indicator" };</script>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 2
<script type="text/javascript">
	var kff_twitter_handle = "@kff";</script>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 3
<script type="text/javascript">var ajaxurl = 'https://www.kff.org/wp-admin/admin-ajax.php';</script>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 4
<script type="text/javascript">
window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/www.kff.org\/wp-includes\/js\/wp-e

In [4]:
# The dictionary I want is in the script tag at list index 31. Check output
soup1[31]

<script type="text/javascript">
	var appJs = appJs || {};
	appJs = jQuery.extend(appJs, {"post_type":"state-indicator","itype":"state","table_viewer_css":"https:\/\/www.kff.org\/wp-content\/themes\/kaiser-foundation-2016\/static\/stylesheets\/table_viewer.css","geo_url":"https:\/\/www.kff.org\/wp-content\/themes\/kaiser-foundation-2016\/static\/js\/maps\/medicare_geos.json","is_preview":false,"base":"\/other\/state-indicator\/distribution-of-state-spending\/","gdocs_key":"1rVYkmCxQec_chOcJ4JCNDswdc5ltyyF3N2hAO4g6nAE","gdocsObject":[["SFY 2020",[["","Elementary & Secondary Education","Higher Education","Public Assistance","Medicaid","Corrections","Transportation","All Other","Total","Elementary & Secondary Education","Higher Education","Public Assistance","Medicaid","Corrections","Transportation","All Other","Total"],["","Currency","Currency","Currency","Currency","Currency","Currency","Currency","Currency","Percent","Percent","Percent","Percent","Percent","Percent","Percent","Percent"]

In [5]:
# The dictionary is surrounded by unnecessary JavaScript: it is passed as the
# second argument of a jQuery.extend(appJs, {...}); call.
# Locate the dictionary by its delimiters rather than by fixed character counts,
# so the extraction keeps working if the surrounding script text changes length.
script_text = soup1[31].string.strip()

# The first '{' after the jQuery.extend( call opens the dictionary
# (str.index raises ValueError if the expected call is not present)
json_start = script_text.index('{', script_text.index('jQuery.extend('))

# The last '}' in the script closes the dictionary; only ');' follows it
json_end = script_text.rindex('}') + 1

soup2 = script_text[json_start:json_end]

In [6]:
# Check that the string starts with '{'
# The assertion stops the notebook if the extraction is off; the preview is kept for a visual check
assert soup2.startswith('{'), "extracted text should begin with '{'"
soup2[:500]

'{"post_type":"state-indicator","itype":"state","table_viewer_css":"https:\\/\\/www.kff.org\\/wp-content\\/themes\\/kaiser-foundation-2016\\/static\\/stylesheets\\/table_viewer.css","geo_url":"https:\\/\\/www.kff.org\\/wp-content\\/themes\\/kaiser-foundation-2016\\/static\\/js\\/maps\\/medicare_geos.json","is_preview":false,"base":"\\/other\\/state-indicator\\/distribution-of-state-spending\\/","gdocs_key":"1rVYkmCxQec_chOcJ4JCNDswdc5ltyyF3N2hAO4g6nAE","gdocsObject":[["SFY 2020",[["","Elementary & Secondary Education'

In [7]:
# Check that the string ends with '}'
# The assertion stops the notebook if the extraction is off; the preview is kept for a visual check
assert soup2.endswith('}'), "extracted text should end with '}'"
soup2[-500:]

'ms (i.e., environmental projects, housing). States with lotteries were asked to exclude prizes paid to lottery winners. States were also asked to exclude expenditures for state-owned utilities and liquor stores. States were also asked to separately detail debt service spending. \\n\\n*N\\/A*: Data not available."],["any","any","Percent","any","Precision","1"],["any","","","","Locked","no"],["any","Region","","United States","Footnote","US totals exclude the District of Columbia."]]]],"postBody":""}'

In [8]:
# Parse the extracted JSON string into Python objects (a dictionary at the top level)
data = json.loads(soup2)

# Check output
data

{'post_type': 'state-indicator',
 'itype': 'state',
 'table_viewer_css': 'https://www.kff.org/wp-content/themes/kaiser-foundation-2016/static/stylesheets/table_viewer.css',
 'geo_url': 'https://www.kff.org/wp-content/themes/kaiser-foundation-2016/static/js/maps/medicare_geos.json',
 'is_preview': False,
 'base': '/other/state-indicator/distribution-of-state-spending/',
 'gdocs_key': '1rVYkmCxQec_chOcJ4JCNDswdc5ltyyF3N2hAO4g6nAE',
 'gdocsObject': [['SFY 2020',
   [['',
     'Elementary & Secondary Education',
     'Higher Education',
     'Public Assistance',
     'Medicaid',
     'Corrections',
     'Transportation',
     'All Other',
     'Total',
     'Elementary & Secondary Education',
     'Higher Education',
     'Public Assistance',
     'Medicaid',
     'Corrections',
     'Transportation',
     'All Other',
     'Total'],
    ['',
     'Currency',
     'Currency',
     'Currency',
     'Currency',
     'Currency',
     'Currency',
     'Currency',
     'Currency',
     'Percent

In [9]:
# The table data I want is stored under the key 'gdocsObject'
library = data['gdocsObject']
# Check result
library

[['SFY 2020',
  [['',
    'Elementary & Secondary Education',
    'Higher Education',
    'Public Assistance',
    'Medicaid',
    'Corrections',
    'Transportation',
    'All Other',
    'Total',
    'Elementary & Secondary Education',
    'Higher Education',
    'Public Assistance',
    'Medicaid',
    'Corrections',
    'Transportation',
    'All Other',
    'Total'],
   ['',
    'Currency',
    'Currency',
    'Currency',
    'Currency',
    'Currency',
    'Currency',
    'Currency',
    'Currency',
    'Percent',
    'Percent',
    'Percent',
    'Percent',
    'Percent',
    'Percent',
    'Percent',
    'Percent'],
   ['United States',
    '365817',
    '184859',
    '10106',
    '225889',
    '62461',
    '112510',
    '489209',
    '1450854',
    '0.252',
    '0.127',
    '0.007',
    '0.156',
    '0.043',
    '0.078',
    '0.337',
    '1.00'],
   ['Alabama',
    '5094',
    '4985',
    '0',
    '1730',
    '690',
    '749',
    '5890',
    '19138',
    '0.266',
    '0.260',

In [10]:
# The result is a list of lists. The first inner list [0] starts with the label 'SFY 2020';
# the table I want is its second item [1].
table = library[0][1]
# Check results
table

[['',
  'Elementary & Secondary Education',
  'Higher Education',
  'Public Assistance',
  'Medicaid',
  'Corrections',
  'Transportation',
  'All Other',
  'Total',
  'Elementary & Secondary Education',
  'Higher Education',
  'Public Assistance',
  'Medicaid',
  'Corrections',
  'Transportation',
  'All Other',
  'Total'],
 ['',
  'Currency',
  'Currency',
  'Currency',
  'Currency',
  'Currency',
  'Currency',
  'Currency',
  'Currency',
  'Percent',
  'Percent',
  'Percent',
  'Percent',
  'Percent',
  'Percent',
  'Percent',
  'Percent'],
 ['United States',
  '365817',
  '184859',
  '10106',
  '225889',
  '62461',
  '112510',
  '489209',
  '1450854',
  '0.252',
  '0.127',
  '0.007',
  '0.156',
  '0.043',
  '0.078',
  '0.337',
  '1.00'],
 ['Alabama',
  '5094',
  '4985',
  '0',
  '1730',
  '690',
  '749',
  '5890',
  '19138',
  '0.266',
  '0.260',
  '0.000',
  '0.090',
  '0.036',
  '0.039',
  '0.308',
  '1.00'],
 ['Alaska',
  '1417',
  '611',
  '59',
  '644',
  '352',
  '514',
  '35

In [11]:
# Define headers - the column names are the list at index 0 of the table
# Each spending category appears twice: once for the currency columns and once for the percent columns
headers = table[0]
headers

['',
 'Elementary & Secondary Education',
 'Higher Education',
 'Public Assistance',
 'Medicaid',
 'Corrections',
 'Transportation',
 'All Other',
 'Total',
 'Elementary & Secondary Education',
 'Higher Education',
 'Public Assistance',
 'Medicaid',
 'Corrections',
 'Transportation',
 'All Other',
 'Total']

In [12]:
# Get the row data, which starts at index 2 (index 0 holds the headers, index 1 the unit labels)
rows = table[2:]

In [13]:
# Build the dataframe from the row lists, labelling the columns with the header list
spending_df = pd.DataFrame(rows, columns=headers)
spending_df.head()

Unnamed: 0,Unnamed: 1,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education.1,Higher Education.1,Public Assistance.1,Medicaid.1,Corrections.1,Transportation.1,All Other.1,Total.1
0,United States,365817,184859,10106,225889,62461,112510,489209,1450854,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
1,Alabama,5094,4985,0,1730,690,749,5890,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
2,Alaska,1417,611,59,644,352,514,3527,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
3,Arizona,5607,5773,0,3357,1322,2142,5200,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
4,Arkansas,3072,3795,200,1629,532,1078,7915,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0


Now that the dataframe is created, I will clean the headers and set the index to location

In [14]:
# Transformation [1]
# Give the blank first header the name 'Location'
# (rename returns a new dataframe, which is assigned back to the same name)
spending_df = spending_df.rename(columns={'': 'Location'})
spending_df.head()

Unnamed: 0,Location,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education.1,Higher Education.1,Public Assistance.1,Medicaid.1,Corrections.1,Transportation.1,All Other.1,Total.1
0,United States,365817,184859,10106,225889,62461,112510,489209,1450854,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
1,Alabama,5094,4985,0,1730,690,749,5890,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
2,Alaska,1417,611,59,644,352,514,3527,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
3,Arizona,5607,5773,0,3357,1322,2142,5200,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
4,Arkansas,3072,3795,200,1629,532,1078,7915,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0


In [15]:
# Transformation [2]
# Use the 'Location' column as the dataframe index
spending_df = spending_df.set_index('Location')

In [16]:
# Check results - show 10 rows so the District of Columbia row, which has no data, is visible
spending_df.head(10)

Unnamed: 0_level_0,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
United States,365817.0,184859.0,10106.0,225889.0,62461.0,112510.0,489209.0,1450854.0,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,5094.0,4985.0,0.0,1730.0,690.0,749.0,5890.0,19138.0,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,1417.0,611.0,59.0,644.0,352.0,514.0,3527.0,7123.0,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,5607.0,5773.0,0.0,3357.0,1322.0,2142.0,5200.0,23401.0,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,3072.0,3795.0,200.0,1629.0,532.0,1078.0,7915.0,18221.0,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0
California,52419.0,16518.0,5465.0,35347.0,15753.0,14340.0,61530.0,201372.0,0.26,0.082,0.027,0.176,0.078,0.071,0.306,1.0
Colorado,5526.0,4084.0,45.0,4402.0,950.0,1484.0,7154.0,23645.0,0.234,0.173,0.002,0.186,0.04,0.063,0.303,1.0
Connecticut,3240.0,3322.0,381.0,4474.0,672.0,1702.0,11756.0,25547.0,0.127,0.13,0.015,0.175,0.026,0.067,0.46,1.0
Delaware,2419.0,378.0,19.0,738.0,373.0,759.0,4230.0,8916.0,0.271,0.042,0.002,0.083,0.042,0.085,0.474,1.0
District of Columbia,,,,,,,,,,,,,,,,


Next I will replace missing values with NaN and remove the rows in which every value is missing

In [17]:
# Transformation [3]
# Missing data is stored as the string "N/A" (only one row appears to be affected);
# convert it to a real NaN so pandas recognises it as missing
spending_df = spending_df.replace('N/A', np.nan)
spending_df.head(10)

Unnamed: 0_level_0,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
United States,365817.0,184859.0,10106.0,225889.0,62461.0,112510.0,489209.0,1450854.0,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,5094.0,4985.0,0.0,1730.0,690.0,749.0,5890.0,19138.0,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,1417.0,611.0,59.0,644.0,352.0,514.0,3527.0,7123.0,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,5607.0,5773.0,0.0,3357.0,1322.0,2142.0,5200.0,23401.0,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,3072.0,3795.0,200.0,1629.0,532.0,1078.0,7915.0,18221.0,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0
California,52419.0,16518.0,5465.0,35347.0,15753.0,14340.0,61530.0,201372.0,0.26,0.082,0.027,0.176,0.078,0.071,0.306,1.0
Colorado,5526.0,4084.0,45.0,4402.0,950.0,1484.0,7154.0,23645.0,0.234,0.173,0.002,0.186,0.04,0.063,0.303,1.0
Connecticut,3240.0,3322.0,381.0,4474.0,672.0,1702.0,11756.0,25547.0,0.127,0.13,0.015,0.175,0.026,0.067,0.46,1.0
Delaware,2419.0,378.0,19.0,738.0,373.0,759.0,4230.0,8916.0,0.271,0.042,0.002,0.083,0.042,0.085,0.474,1.0
District of Columbia,,,,,,,,,,,,,,,,


In [18]:
# Transformation [4]
# Remove the rows in which every value is missing
spending_df = spending_df.dropna(axis=0, how='all')
spending_df.head(10)

Unnamed: 0_level_0,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
United States,365817,184859,10106,225889,62461,112510,489209,1450854,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,5094,4985,0,1730,690,749,5890,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,1417,611,59,644,352,514,3527,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,5607,5773,0,3357,1322,2142,5200,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,3072,3795,200,1629,532,1078,7915,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0
California,52419,16518,5465,35347,15753,14340,61530,201372,0.26,0.082,0.027,0.176,0.078,0.071,0.306,1.0
Colorado,5526,4084,45,4402,950,1484,7154,23645,0.234,0.173,0.002,0.186,0.04,0.063,0.303,1.0
Connecticut,3240,3322,381,4474,672,1702,11756,25547,0.127,0.13,0.015,0.175,0.026,0.067,0.46,1.0
Delaware,2419,378,19,738,373,759,4230,8916,0.271,0.042,0.002,0.083,0.042,0.085,0.474,1.0
Florida,13404,8004,120,10361,3066,7035,10279,52269,0.256,0.153,0.002,0.198,0.059,0.135,0.197,1.0


Finally, I will create a multi-index column header to distinguish between currency and percentage values given for each state

In [19]:
# First I must get the list from the table data describing each column as currency or percent
# Take all except the first value, which belongs to the location column that is now the dataframe's index
units = table[1][1:]
units

['Currency',
 'Currency',
 'Currency',
 'Currency',
 'Currency',
 'Currency',
 'Currency',
 'Currency',
 'Percent',
 'Percent',
 'Percent',
 'Percent',
 'Percent',
 'Percent',
 'Percent',
 'Percent']

In [20]:
# Will use arrays to create the multi-index for the columns
# Build a list of two arrays: the unit labels (top level) and the current dataframe columns (second level)
arrays =[units, spending_df.columns]

In [21]:
# Create the multi-level index from the list of arrays (one array per level)
mulindx = pd.MultiIndex.from_arrays(arrays)

In [22]:
# Create an independent copy of the dataframe.
# Plain assignment would only give the same dataframe a second name, so changing
# the columns of spending_df2 would change spending_df as well.
spending_df2 = spending_df.copy()

In [23]:
# Transformation [5] Create a multi-index column header
# Replace the dataframe's columns with the multi-index just created
spending_df2.columns = mulindx

In [24]:
# Check results - the columns should now show a Currency/Percent level above the category names
spending_df2.head()

Unnamed: 0_level_0,Currency,Currency,Currency,Currency,Currency,Currency,Currency,Currency,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
Unnamed: 0_level_1,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total
Location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
United States,365817,184859,10106,225889,62461,112510,489209,1450854,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,5094,4985,0,1730,690,749,5890,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,1417,611,59,644,352,514,3527,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,5607,5773,0,3357,1322,2142,5200,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,3072,3795,200,1629,532,1078,7915,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0


In [25]:
# Select the top-level 'Percent' label of the multi-index to view only the percentage columns
spending_df2['Percent'].head()

Unnamed: 0_level_0,Elementary & Secondary Education,Higher Education,Public Assistance,Medicaid,Corrections,Transportation,All Other,Total
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
United States,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0


My final dataframe now has state spending summarized in percentages and dollar amounts. The multi-index columns will allow me to view and access the data I want more easily. I am not sure, however, how the multi-index will impact my efforts to combine this dataframe with another, or others. I anticipate changes will have to be made to address that these data sets use full state names versus abbreviations.

CHANGES TO ALLOW FOR DATABASE ENTRY AND TABLE MERGING

In [26]:
# Give the columns flat, unique names: the dollar columns keep the short category
# name and the matching percent columns get the same name with a 'PC_' prefix
dollar_cols = ['Basic_Ed', 'Higher_Ed', 'Public_Assist', 'Medicaid',
               'Corrections', 'Transportation', 'Other', 'Total']
percent_cols = ['PC_' + name for name in dollar_cols]
spending_df.columns = dollar_cols + percent_cols

In [27]:
# Check that the new flat column names were applied
spending_df.head()

Unnamed: 0_level_0,Basic_Ed,Higher_Ed,Public_Assist,Medicaid,Corrections,Transportation,Other,Total,PC_Basic_Ed,PC_Higher_Ed,PC_Public_Assist,PC_Medicaid,PC_Corrections,PC_Transportation,PC_Other,PC_Total
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
United States,365817,184859,10106,225889,62461,112510,489209,1450854,0.252,0.127,0.007,0.156,0.043,0.078,0.337,1.0
Alabama,5094,4985,0,1730,690,749,5890,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
Alaska,1417,611,59,644,352,514,3527,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
Arizona,5607,5773,0,3357,1322,2142,5200,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
Arkansas,3072,3795,200,1629,532,1078,7915,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0


In [28]:
# Move 'Location' from the index back into a regular column so it can be selected and exported
# NOTE: run this cell only once - resetting the index a second time adds an extra 'index' column
spending_df = spending_df.reset_index()

In [29]:
# Keep the location, the total dollar amount and the percentage columns.
# Selecting by name (rather than by position) documents which columns are kept and
# still picks the right ones if the column layout ever shifts.
state_spending = spending_df[['Location', 'Total',
                              'PC_Basic_Ed', 'PC_Higher_Ed', 'PC_Public_Assist',
                              'PC_Medicaid', 'PC_Corrections', 'PC_Transportation',
                              'PC_Other', 'PC_Total']]

In [30]:
# Remove the national 'United States' aggregate so only individual states remain.
# Filtering on the name does not depend on that row sitting at index label 0.
state_spending = state_spending[state_spending['Location'] != 'United States']

# Rename for the database/merge step: 'Location' becomes 'State' and 'Total' becomes 'Millions'
# (presumably the unit of the dollar totals - confirm against the KFF source notes)
state_spending = state_spending.rename(columns = {'Total': 'Millions', 'Location': 'State'})

In [31]:
# Save the state-level table to a CSV file, leaving out the dataframe's index
state_spending.to_csv('state_spending.csv', index = False)

In [32]:
# Check the final table that was written to the CSV file
state_spending.head()

Unnamed: 0,State,Millions,PC_Basic_Ed,PC_Higher_Ed,PC_Public_Assist,PC_Medicaid,PC_Corrections,PC_Transportation,PC_Other,PC_Total
1,Alabama,19138,0.266,0.26,0.0,0.09,0.036,0.039,0.308,1.0
2,Alaska,7123,0.199,0.086,0.008,0.09,0.049,0.072,0.495,1.0
3,Arizona,23401,0.24,0.247,0.0,0.143,0.056,0.092,0.222,1.0
4,Arkansas,18221,0.169,0.208,0.011,0.089,0.029,0.059,0.434,1.0
5,California,201372,0.26,0.082,0.027,0.176,0.078,0.071,0.306,1.0
