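# Exploration and Analysis
- Cleans the Kansas City, MO (KCMO) 311 pothole complaint history and pulls American Community Survey (ACS 5-year) census figures for each ZIP code that appears in it. Both results are written to `./Resources` as CSV files.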

In [1]:
# Dependencies and Imports
import json
import os

import gmaps
import matplotlib as mlt
import numpy as np
import pandas as pd
import requests
from census import Census
from us import states

# Census API key
# The key is read from the CENSUS_API_KEY environment variable so the secret is
# never stored in the notebook or committed to version control.
# Keys are issued at https://api.census.gov/data/key_signup.html
# NOTE(review): a key was previously hardcoded in this cell; treat that key as
# compromised and rotate it.
census_API = os.environ.get('CENSUS_API_KEY')
if not census_API:
    raise RuntimeError(
        "CENSUS_API_KEY environment variable is not set; "
        "export it before starting Jupyter."
    )

# Shared Census client used by the ACS queries further down
c = Census(census_API)


# KCMO 311 dataset (pothole complaint history)
path = './Resources/311_Pothole_History.csv'

# pd.read_csv returns a DataFrame directly, so no pd.DataFrame(...) wrapper is needed
df = pd.read_csv(path)

# Preview the dataset: total row count, then the first five rows
print(f"Total # of complaints in the dataset: {len(df)}")
df.head()

Total # of complaints in the dataset: 50073


Unnamed: 0,CASE ID,SOURCE,DEPARTMENT,WORK GROUP,REQUEST TYPE,CREATION MONTH,CREATION YEAR,STATUS,EXCEEDED EST TIMEFRAME,CLOSED DATE,...,CLOSED YEAR,DAYS TO CLOSE,STREET ADDRESS,ADDRESS WITH GEOCODE,ZIP CODE,NEIGHBORHOOD,COUNCIL DISTRICT,PARCEL ID NO,LATITUDE,LONGITUDE
0,2019169222,PHONE,Public Works,Public Works-Street and Traffic-District 3,Streets / Roadways / Alleys-Pothole-District 3,9,2019,RESOL,Y,11/7/2019,...,2019.0,48.0,13110 E 57th St,"13110 E 57th St64133\n(39.018605, -94.426046)",64133.0,Blue Vue Hills,5.0,63787,39.018605,-94.426046
1,2019126105,PHONE,Public Works,Public Works-Street and Traffic-District 3,Streets / Roadways / Alleys-Pothole-District 3,7,2019,RESOL,Y,10/24/2019,...,2019.0,113.0,1420 E 75th Ter,"1420 E 75th Ter64131\n(38.990601, -94.570141)",64131.0,East Meyer 6,5.0,115894,38.990601,-94.570141
2,2019126468,WEB,Public Works,Public Works-Street and Traffic-District 3,Streets / Roadways / Alleys-Pothole-District 3,7,2019,RESOL,Y,10/24/2019,...,2019.0,113.0,5500 Michigan Ave,"5500 Michigan Ave64130\n(39.026622, -94.563667)",64130.0,Blue Hills,5.0,147750,39.026622,-94.563667
3,2019183619,PHONE,Public Works,Public Works-Street and Traffic-District 1,Streets / Roadways / Alleys-Pothole-District 1,10,2019,RESOL,Y,11/22/2019,...,2019.0,30.0,6344 NE Pleasant Valley Rd,"6344 NE Pleasant Valley Rd64119\n(39.216929, -...",64119.0,Shoal Creek,1.0,89139,39.216929,-94.504961
4,2019190590,PHONE,Public Works,Public Works-Street and Traffic-District 3,Streets / Roadways / Alleys-Pothole-District 3,11,2019,RESOL,Y,11/18/2019,...,2019.0,10.0,9500 Blue Ridge Blvd,"9500 Blue Ridge Blvd64134\n(38.952042, -94.508...",64134.0,Fairlane,5.0,56861,38.952042,-94.508463


In [2]:
# Investigating values for non-standard columns
# Each pair is (label used in the printed line, column name in df).
categorical_columns = (
    ("Source", 'SOURCE'),
    ("Work group", 'WORK GROUP'),
    ("Request type", 'REQUEST TYPE'),
    ("Status", 'STATUS'),
)

# One line per column listing every distinct value it contains
for label, column in categorical_columns:
    print(f"{label} values: {df[column].unique()}")

Source values: ['PHONE' 'WEB' 'EMAIL' 'BOT' 'TWIR' 'WALK' 'EDC' 'VOICE' 'FAX' 'INSPE'
 'SYS' 'MAIL' 'EIP' 'SPNSH']
Work group values: ['Public Works-Street and Traffic-District 3'
 'Public Works-Street and Traffic-District 1'
 'Public Works-Street and Traffic-District 2'
 'City Managers Office-311 Call Center-Support'
 'Public Works-Capital Projects-Sidewalks' 'Parks and Rec-South Region-'
 'Public Works-Capital Projects-Traffic Permits'
 'Parks and Rec-Central Region-' 'Parks and Rec-Administration-']
Request type values: ['Streets / Roadways / Alleys-Pothole-District 3'
 'Streets / Roadways / Alleys-Pothole-District 1'
 'Streets / Roadways / Alleys-Pothole-District 2'
 'Streets / Roadways / Alleys-Pothole-Bridge' 'Pothole (North of River)'
 'Pothole (South of 47th Street and West Of Blue PKWY)'
 'Pothole (South of 47th Street)'
 'Pothole (South of River to 47th Street)'
 'Pothole (River south to 47th Street)'
 'Pothole (River south to 47th Street and East of Blue PKWY)'
 'Streets / R

In [3]:
# Identifying data ranges
# nunique() ignores missing values, so blank ZIP codes / neighborhoods are not counted.
year_count = df['CREATION YEAR'].nunique()
zip_count = df['ZIP CODE'].nunique()
neighborhood_count = df['NEIGHBORHOOD'].nunique()

print(f"Dataset covers {year_count} years, {zip_count} zipcodes, and {neighborhood_count} neighborhoods")

Dataset covers 14 years, 55 zipcodes, and 249 neighborhoods


In [4]:
# Begin cleaning data. Only resolved complaints are kept: open/canceled complaints
# lack some of the data we'd need if we want to use closing dates.
is_resolved = df['STATUS'] == 'RESOL'
resolved_df = df.loc[is_resolved]
print(f"Current number of rows: {len(resolved_df)}")

# Cut down the number of columns to main identifying data. We'll primarily be using
# zip codes to pull census data; extra location data and "closed" data are left out.
kept_columns = [
    'CASE ID',
    'SOURCE',
    'CREATION MONTH',
    'CREATION YEAR',
    'STATUS',
    'EXCEEDED EST TIMEFRAME',
    'DAYS TO CLOSE',
    'ZIP CODE',
    'NEIGHBORHOOD',
]
reduced_resolved_df = resolved_df[kept_columns]

# Drop any rows that still have missing data.
# reset_index() keeps the original row labels in a new 'index' column, which is
# therefore also written to the CSV below.
clean_df = reduced_resolved_df.dropna(how='any').reset_index()
print(f"Clean number of rows: {len(clean_df)}")

# Format columns to drop decimal places (safe now that no NaN values remain)
clean_df = clean_df.astype({'ZIP CODE': int, 'DAYS TO CLOSE': int})

# Output the dataframe to a csv and preview the final cleaned dataframe
clean_df.to_csv('./Resources/PotholeData.csv', index=False)
clean_df

Current number of rows: 49499
Clean number of rows: 48070


Unnamed: 0,index,CASE ID,SOURCE,CREATION MONTH,CREATION YEAR,STATUS,EXCEEDED EST TIMEFRAME,DAYS TO CLOSE,ZIP CODE,NEIGHBORHOOD
0,0,2019169222,PHONE,9,2019,RESOL,Y,48,64133,Blue Vue Hills
1,1,2019126105,PHONE,7,2019,RESOL,Y,113,64131,East Meyer 6
2,2,2019126468,WEB,7,2019,RESOL,Y,113,64130,Blue Hills
3,3,2019183619,PHONE,10,2019,RESOL,Y,30,64119,Shoal Creek
4,4,2019190590,PHONE,11,2019,RESOL,Y,10,64134,Fairlane
...,...,...,...,...,...,...,...,...,...,...
48065,50068,2019170484,PHONE,9,2019,RESOL,N,1,64126,East Blue Valley
48066,50069,2019170942,PHONE,9,2019,RESOL,Y,52,64110,Western 49-63
48067,50070,2019171791,PHONE,9,2019,RESOL,Y,50,64134,Hickman Mills South
48068,50071,2019170397,TWIR,9,2019,RESOL,N,2,64108,Wendell Phillips


In [5]:
# Assembling a clean list of the zip codes in the data set.
# Built from the full dataset (df), not only the resolved/cleaned rows;
# missing values are dropped before the float ZIP codes are cast to int.
zip_codes = df['ZIP CODE'].dropna().unique().astype(int)
zip_codes

array([64133, 64131, 64130, 64119, 64134, 64108, 64113, 64132, 64112,
       64146, 64127, 64125, 64149, 64138, 64114, 64145, 64105, 64123,
       64110, 64137, 64155, 64109, 64154, 64151, 64124, 64116, 64120,
       64139, 64152, 64129, 64158, 64106, 64111, 64128, 64118, 64117,
       64136, 64126, 64157, 64156, 64101, 64163, 64153, 64161, 64147,
       64167, 64102, 64166, 64165, 64164, 64160, 64068, 64012, 64052,
       64030])

In [None]:
# ACS 5-year variables requested for every zip code. The codes are mapped to
# readable column names in the renaming cell that follows.
census_fields = (
    "NAME",
    "B19013_001E",
    "B01003_001E",
    "B01002_001E",
    "B19301_001E",
    "B17001_002E",
    "C02003_003E",
    "B01002A_002E",
    "B01002A_003E",
    "B01001A_023E",
    "B01001A_024E",
    "B01001A_025E",
    "B01001A_026E",
    "B01001A_027E",
)

# generate an empty list to fill with the census data (one record per zip code)
data = []

# iterate through the list of zip codes to pull the census data
for ea in zip_codes:
    census_data = c.acs5.get(
        census_fields,
        {'for': 'zip code tabulation area:' + str(ea)},
    )
    # Skip zip codes for which the API returns nothing (presumably zip codes
    # with no matching tabulation area). An explicit check replaces the bare
    # `except`, which also hid unrelated errors and keyboard interrupts.
    if census_data:
        data.append(census_data[0])

In [27]:
# Column Reordering & Renaming

# Value the ACS API reports in place of an estimate it could not compute.
# NOTE(review): per the Census Bureau's notes on ACS annotation values - confirm.
MISSING_ESTIMATE = -666666666.0

# The fetch cell leaves `data` as a list of records; build the DataFrame here so
# this cell runs on a fresh kernel (wrapping an existing DataFrame is harmless).
# NOTE(review): the B01002A_* / B01001A_* tables look like "White alone" breakdowns,
# so the age columns below may describe that subgroup only - confirm.
data = pd.DataFrame(data).rename(
    columns={
        "B01003_001E": "Population",
        "B01002_001E": "Median Age",
        "B19013_001E": "Household Income",
        "B19301_001E": "Per Capita Income",
        "B17001_002E": "Poverty Count",
        "C02003_003E": "White Population",
        "B01002A_002E": "Male Median Age",
        "B01002A_003E": "Female Median Age",
        "B01001A_023E": "F 20-24",
        "B01001A_024E": "F 25-29",
        "B01001A_025E": "F 30-34",
        "B01001A_026E": "F 35-44",
        "B01001A_027E": "F 45-54",
        # The selection below expects 'Zipcode'; it carries the NAME label
        # (e.g. "ZCTA5 64133"), matching the saved output of this cell.
        "NAME": "Zipcode",
        "zip code tabulation area": "ZIP CODE"
    }
)

# Keep only the columns used downstream, in display order
data = data[[
    'Zipcode',
    'Household Income',
    'Population',
    'Median Age',
    'Per Capita Income',
    'Poverty Count',
    'White Population',
    'Male Median Age',
    'Female Median Age',
    'F 20-24',
    'F 25-29',
    'F 30-34',
    'F 35-44',
    'F 45-54'
]]

# Drop unpopulated areas (they would divide by zero below) and rows whose income
# estimate is the missing-value sentinel. .copy() gives an independent frame so
# the column assignments below do not trigger SettingWithCopyWarning.
has_population = data['Population'] != 0
has_income = data['Household Income'] != MISSING_ESTIMATE
data = data.loc[has_population & has_income].copy()

# Express each count column as a share of the total population
share_columns = {
    '% White': 'White Population',
    '% F20-24': 'F 20-24',
    '% F25-29': 'F 25-29',
    '% F30-34': 'F 30-34',
    '% F35-44': 'F 35-44',
    '% F45-54': 'F 45-54',
}
for share_name, count_name in share_columns.items():
    data[share_name] = data[count_name] / data['Population']

data.head()

Unnamed: 0,Zipcode,Household Income,Population,Median Age,Per Capita Income,Poverty Count,White Population,Male Median Age,Female Median Age,F 20-24,F 25-29,F 30-34,F 35-44,F 45-54,% White,% F20-24,% F25-29,% F30-34,% F35-44,% F45-54
0,ZCTA5 64133,50681.0,35163.0,40.8,26720.0,4690.0,20850.0,44.1,49.4,539.0,630.0,627.0,1280.0,1598.0,0.592953,0.015329,0.017917,0.017831,0.036402,0.045445
1,ZCTA5 64131,45688.0,22932.0,37.2,30411.0,4021.0,11811.0,38.9,43.2,428.0,461.0,777.0,573.0,748.0,0.515044,0.018664,0.020103,0.033883,0.024987,0.032618
2,ZCTA5 64130,31166.0,19996.0,39.3,21925.0,5238.0,1474.0,29.6,29.5,79.0,139.0,46.0,153.0,35.0,0.073715,0.003951,0.006951,0.0023,0.007652,0.00175
3,ZCTA5 64119,68841.0,27848.0,38.3,33224.0,1785.0,23323.0,39.7,43.2,712.0,982.0,959.0,1433.0,1713.0,0.837511,0.025567,0.035263,0.034437,0.051458,0.061512
4,ZCTA5 64134,41543.0,23874.0,31.3,20078.0,5507.0,6793.0,47.1,45.5,147.0,283.0,308.0,386.0,384.0,0.284535,0.006157,0.011854,0.012901,0.016168,0.016084


In [28]:
# Persist the cleaned census table; index=False leaves the row labels out of the file
census_csv_path = './Resources/CensusData_Rev1.csv'
data.to_csv(census_csv_path, index=False)