In [1]:
# render matplotlib figures inline, directly below the cell that draws them
%matplotlib inline

# load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local modules under ../src are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import sys

#figure making packages
import bokeh
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.models import ColumnDataSource, HoverTool
import seaborn as sns
import matplotlib.pyplot as plt

#import customized modules
# Make the sibling ../src directory importable. It is placed at the FRONT of
# sys.path rather than assigned to sys.path[0]: overwriting slot 0 silently
# drops whatever entry was there (possibly the notebook's own directory).
# Removing any earlier occurrence first keeps the cell idempotent on re-runs.
src_dir = os.path.abspath(os.path.join(os.pardir, 'src'))
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)
from modules import *

# Challenge Questions

Please choose a specific business and tell us which business you chose; any kind of business will do. Imagine you’re helping that business owner in Chicago and s/he is looking to open a new location. In the form of writing, potentially supplemented by sketches (computer-drawn or hand-drawn) and links, we want to see your response to these questions:

- What questions could you potentially explore/answer with this data?
- Ideally, what other data would you gather or combine to learn even more?
- How would you want to see data presented, to make it actionable by you or others?
- How could an algorithm or model be used? How might your client interact with that algorithm or model?

Furthermore, we want to see the results of 2–3 hours of work, using the real data, towards making those ideas a reality. The results could include findings from the data, code, Python/R notebooks, a visualization, results of a statistical model you built, etc. Try not to hide things or throw them away — we want to see your work!

# Some additional guidelines

- We're not expecting perfection here; this is intended to be something you spend an afternoon or so on. Send us whatever you used to tackle the problem, even if it’s not pretty.
- You're not required to use any specific tools — pick your favorites. Colored pencils are just as valid as d3. Think of this as an opportunity to showcase your strengths.
- Feel free to aggregate or filter the data however you see fit — if you want to focus on a particular train line, time period, season, stop, neighborhood, etc., go for it. "Big Data" isn't necessarily going to impress us more than a thoughtful approach or interesting findings from a small slice, especially if that aligns with the story you’re telling.

# Why are we doing this?

This challenge is not all that different from what happens at the beginning of a project at IDEO. We have to go from large amounts of ambiguity to valuable work in short periods of time, and we do it by empathizing with the needs of our client, imagining what’s possible, selecting the most promising ideas, and working swiftly and iteratively to share our work early and build towards the bigger ideas. If you find this fun, you’re going to like it here.

This also gives you an opportunity to show us what you’re capable of; this is what we do instead of unrealistic and unreliable whiteboard coding interviews. And afterwards, you’ll have a little data project to add to your portfolio that you can build on further if you want to.

In [3]:
# locations of the two CTA csv exports, relative to the notebook directory
DATA_DIR = '../data'
cta_entry_fname = DATA_DIR + '/' + 'CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv'
cta_station_fname = DATA_DIR + '/' + 'CTA_-_System_Information_-_List_of__L__Stops.csv'

In [4]:
# load the station list as-is
df_cta_station = pd.read_csv(cta_station_fname)

# load the daily entries and convert the raw 'date' strings with str_to_date
df_cta_entry = pd.read_csv(cta_entry_fname).assign(
    date=lambda entries: entries['date'].apply(str_to_date)
)

In [5]:
# normalise the station column labels (lower-case, spaces -> underscores)
# so they can be used as plain identifiers later on
df_cta_station.columns = [
    label.lower().replace(' ', '_') for label in df_cta_station.columns
]

In [6]:
# assign colors to each station: find_color is evaluated once per row
# (presumably it returns a list of line colors, since later cells index into
# the result -- confirm against src/modules)
df_cta_station['color'] = df_cta_station.apply(find_color, axis=1)

In [7]:
# substitute for missing color: the rows labelled 263 and 264 fall back to red
# NOTE(review): these index labels are hard-coded against the current csv
# export -- confirm they still point at the intended stations if the file changes
for row_label in (263, 264):
    df_cta_station.at[row_label, 'color'] = ['red']

In [8]:
# get the plotting coordinates of every station from its 'location' value.
# merc is now called once per station and both components are read from the
# same result; previously every location was converted twice, once per
# coordinate. (presumably merc returns an (x, y) pair in Web Mercator, since
# the map below uses mercator axes -- confirm against src/modules)
projected_points = [merc(location) for location in df_cta_station['location']]
df_cta_station['coords_x'] = [point[0] for point in projected_points]
df_cta_station['coords_y'] = [point[1] for point in projected_points]

In [9]:
# average the 'rides' column per station_id across the ridership table
df_mean_ridership = df_cta_entry.groupby('station_id')['rides'].mean().to_frame()

# attach that average to the station metadata (map_id matches station_id),
# and derive a marker size by scaling the average down by a factor of 500
df_merge = df_cta_station.merge(
    df_mean_ridership, left_on='map_id', right_index=True
).assign(circle_size=lambda stations: stations['rides'] / 500)

In [10]:
# a station where several lines cross carries several colors; keep only the
# first entry as the single color used to draw that station
df_merge['station_color'] = df_merge['color'].map(lambda colors: colors[0])

In [11]:
# Interactive map of the CTA train stations: one circle per station, sized by
# its mean ridership and coloured by its line colour.

# start from empty columns; each colour group is streamed in below
source = ColumnDataSource(
    data=dict(x=[],
              y=[],
              ridership=[],
              sizes=[],
              stationname=[],
              line_color=[],
              fill_color=[]))

for c, group in df_merge.groupby('station_color'):
    # keep a single row per station name within this colour group
    group = group.drop_duplicates('station_name')
    data = dict(
        x=list(group['coords_x']),
        y=list(group['coords_y']),
        ridership=list(group['rides']),
        sizes=list(group['circle_size']),
        stationname=list(group['station_descriptive_name']),
        line_color=[c] * len(group),
        fill_color=[c] * len(group))
    # append this group's rows to the shared data source
    source.stream(data)

# The hover tool does not depend on the colour group, so it is built once,
# after the loop. It used to sit inside the loop body, which rebuilt it on
# every iteration and left `hover` undefined (NameError in figure(...) below)
# whenever df_merge produced no groups.
hover = HoverTool(tooltips=[
    ("station", "@stationname"),
    ("ridership", "@ridership"),
])

# mercator axes to match the tile layer; the fixed ranges presumably frame the
# Chicago area in Web Mercator units -- confirm if the view looks off
p = figure(x_range=(-9780000, -9745000), y_range=(5130000, 5160000),
           x_axis_type="mercator", y_axis_type="mercator",
           tools=[hover, 'wheel_zoom', 'save'])

# background map tiles
p.add_tile(CARTODBPOSITRON)

# hide the grid lines so they do not clutter the map
p.grid.grid_line_color = None

# NOTE(review): recent Bokeh releases deprecate circle(size=...) in favour of
# scatter(); confirm against the installed Bokeh version before changing it
p.circle(x='x', y='y',
         source=source,
         size='sizes',
         line_color='line_color',
         fill_color='fill_color',
         fill_alpha=0.05)

# render the figure inside the notebook
output_notebook()
show(p)