# Premise:
We have a list of survey responses where each row is a unique survey response tagged with a zip code. The goal is to aggregate these responses and create a visualization tool with which someone can easily identify the zip code with the most responses.

In [216]:
import os
import pandas as pd
import numpy as np
import pgeocode
import matplotlib.pyplot as plt
from matplotlib import cm

from bokeh.plotting import figure
from bokeh.io import output_file, show
from bokeh.models import HoverTool, ColumnDataSource

# Load and clean datasets
Specifically, we will normalize all zip codes to their 5-digit form (ZIP-5) and then count the responses within each zip code.

In [152]:
# Load the raw survey responses. The zip code column is read as text so that
# leading zeros (e.g. "02134") are not lost to numeric parsing.
df = pd.read_csv('Dataset_3.csv', dtype={'Response_Zip_Code': str})

# Normalize every zip code to its 5-digit form:
#   1. keep the first run of digits ("12345-6789" -> "12345", "2134.0" -> "2134")
#   2. truncate an undelimited ZIP+4 to five digits ("123456789" -> "12345")
#   3. left-pad with zeros in case the file stored the code as a number ("2134" -> "02134")
# Values with no digits (including missing values) stay NaN and are therefore
# left out of the groupby below instead of forming a bogus "nan" zip code.
df['Response_Zip_Code'] = (
    df['Response_Zip_Code']
    .str.extract(r'(\d+)', expand=False)
    .str[:5]
    .str.zfill(5)
)

# Count responses per zip code. Each remaining column (including ResponseID)
# becomes the number of non-null entries for that zip code; this assumes one
# ResponseID corresponds to one individual response.
df = df.groupby('Response_Zip_Code').aggregate('count').reset_index()

Here we will use the pgeocode python package to pull 'longitude' and 'latitude' values for each unique zip code

In [153]:
# get latitude/longitude of zip codes
nomi = pgeocode.Nominatim('us')


def get_lat_long(zipcode):
    """Look up a single zip code with pgeocode.

    Parameters
    ----------
    zipcode : str
        The postal code to look up in the US dataset.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the pgeocode query result.
        Per the pgeocode documentation these are NaN for codes it does not know.
    """
    vals = nomi.query_postal_code(zipcode)
    return vals['latitude'], vals['longitude']


# Query every zip code in one call instead of one call per row:
# query_postal_code accepts a list of codes and returns a DataFrame with one
# row per requested code, in the same order as the input.
geo = nomi.query_postal_code(df['Response_Zip_Code'].astype(str).tolist())

# Assign positionally (via numpy) so the result does not depend on index alignment.
df['latitude'] = geo['latitude'].to_numpy()
df['longitude'] = geo['longitude'].to_numpy()

# Show results via an interactive plot
First, we map each zip code's response count to a color: darker colors = more responses. We choose a sequential colormap here because our values are linear (ordered from few to many responses).

In [212]:
# set up color maps into df
# Build a discrete 'Reds' colormap with one shade per possible response count
# (1..max). plt.get_cmap is used because cm.get_cmap has been deprecated and
# removed in recent matplotlib releases.
counts = df['ResponseID'].to_numpy()
cmap = plt.get_cmap('Reds', max(int(counts.max()), 1))

# Integer inputs index the colormap's lookup table directly, so shift the
# counts by one: 1 response -> lightest shade (index 0), max -> darkest shade.
rgba = cmap(counts - 1)

# Convert the 0-1 float channels to 0-255 integers and emit '#rrggbb' hex
# strings, which Bokeh accepts as color values in a data column. The alpha
# channel is dropped (colors are fully opaque).
rgb = np.round(rgba[:, :3] * 255).astype(int)
df['color'] = ['#{:02x}{:02x}{:02x}'.format(r, g, b) for r, g, b in rgb.tolist()]

Second, we will plot the data in an interactive chart and save the result as an HTML file.

In [237]:
# collect data into a ColumnDataSource
df = df.sort_values('ResponseID') # sort so that larger values will be on the surface when plotting
source = ColumnDataSource(df)

# figure set up
p = figure(x_axis_label = 'Longitude',
           y_axis_label = 'Latitude',
           title = 'Responses by Zip Code')

# One marker per zip code, colored by the precomputed 'color' column.
# scatter() draws circles by default; circle() with a screen-unit `size`
# is deprecated in current Bokeh releases.
p.scatter('longitude', 'latitude', source=source, size = 3,
          fill_color='color', line_color = 'color')

# Show the zip code and its response count when hovering over a marker.
hover = HoverTool(tooltips = [
                ('ZipCode', '@Response_Zip_Code'),
                ('Responses', '@ResponseID')])
p.add_tools(hover)

# Make sure the output directory exists; writing the HTML file fails otherwise.
os.makedirs('./products', exist_ok=True)
output_file('./products/assignment_3.html')
show(p)

In [236]:
# output sorted values df with zip and response
# Make sure the output directory exists; to_csv fails on a missing directory.
os.makedirs('./products', exist_ok=True)

# Zip codes ranked by number of responses, most responses first. The fresh
# 0-based index is written as the first CSV column.
results = (
    df[['Response_Zip_Code', 'ResponseID']]
    .sort_values('ResponseID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ResponseID': 'NumResponses'})
)
results.to_csv('./products/assignment_3_results.csv')