In [12]:
# Default Carto SQL API endpoint; queryCartoDB uses it when no `source` is given.
SQL_SOURCE = 'https://yc3300.carto.com/api/v2/sql?q='

import ast
import json
import StringIO
import urllib
import urllib2

import pandas as pd

def queryCartoDB(query, format='CSV', source=SQL_SOURCE):
    '''Run a SQL query against a Carto SQL API endpoint.

    Arguments:
    query - string: a valid sql query string
    format - output format OPTIONAL (default CSV)
    source - a valid sql api endpoint OPTIONAL (default SQL_SOURCE)
    Returns:
    the body of the API response AS A STRING
    Raises:
    ValueError - when the API answers with an HTTP error. The message is
        the error text reported by the API, or the HTTP status plus the raw
        response body when that body cannot be decoded.
    NOTES:
    designed for the carto API, tested only with CSV return format'''

    # Supplying a data payload makes urllib2 send the request as a POST.
    data = urllib.urlencode({'format': format, 'q': query})
    try:
        response = urllib2.urlopen(source, data)
    except urllib2.HTTPError as e:
        body = e.read()
        try:
            # The API is expected to report failures as JSON of the form
            # {"error": [...]}; anything else falls back to the raw body so
            # the original HTTP failure is never masked by a parsing error.
            messages = json.loads(body)['error']
            if not isinstance(messages, list):
                messages = [messages]
            detail = '\n'.join(messages)
        except (ValueError, KeyError, TypeError):
            detail = 'HTTP %s: %s' % (e.code, body)
        raise ValueError(detail)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

## Task 1 — Familiarize with SQL Clauses
- Sort data by `start_station_id`, `tripduration`
  - Only consider trips with a duration of at most 3 hours
- Only show the top/last 10 records (the SQL equivalent of `head` and `tail`)
- List all unique `start_station_id` values
- Aggregation functions:
  - Count the number of trips (the SQL equivalent of `wc -l`)
  - Find the average/min/max trip duration

In [52]:
# Task 1: per-station trip count and average/min/max duration for trips of
# at most 3 hours (10800, assuming tripduration is stored in seconds),
# limited to the 10 stations with the highest ids.
# GROUP BY already yields exactly one row per start_station_id, so no
# DISTINCT is needed to list each station once.
task1 = '''
SELECT start_station_id,
COUNT(start_station_id) AS trip_counts,
AVG(tripduration) AS ave_tripduration,
MIN(tripduration) AS min_tripduration,
MAX(tripduration) AS max_tripduration
FROM citibike
WHERE tripduration<=10800
GROUP BY start_station_id
ORDER BY start_station_id DESC, ave_tripduration DESC
LIMIT 10
'''

In [53]:
# Fetch the Task 1 result as CSV text and parse it into a DataFrame; being
# the cell's last expression, the frame is displayed below.
pd.read_csv(StringIO.StringIO(queryCartoDB(task1)), sep=',')

Unnamed: 0,start_station_id,trip_counts,ave_tripduration,min_tripduration,max_tripduration
0,3002,184,714.646739,79,7225
1,2023,91,771.978022,108,2454
2,2022,96,979.0625,107,2495
3,2021,141,738.815603,90,6592
4,2017,86,763.383721,109,2476
5,2012,256,714.519531,79,2026
6,2010,76,763.184211,167,2610
7,2009,107,777.794393,60,2765
8,2008,86,832.22093,94,3476
9,2006,79,1130.531646,90,3459


## Task 2 — Working with date/time
- Select only trips started on Feb 2, 2015
- Select trips started on weekends
  - What is the average trip duration on weekends?
- Can we do the same for weekdays?

In [70]:
# Task 2.1: the 10 latest-starting trips among those that began on 2015-02-02.
# The half-open range [2015-02-02 00:00, 2015-02-03 00:00) covers every start
# time on that day without having to spell out its last instant.
task2_1 = '''
SELECT gender, birth_year, end_station_id, start_station_id,tripduration,usertype, stoptime, starttime FROM citibike 
WHERE starttime >= '2015-02-02 00:00'
AND starttime < '2015-02-03 00:00'
ORDER BY starttime DESC
LIMIT 10
'''

In [71]:
# Fetch the Task 2.1 result as CSV text and parse it into a DataFrame; being
# the cell's last expression, the frame is displayed below.
pd.read_csv(StringIO.StringIO(queryCartoDB(task2_1)), sep=',')

Unnamed: 0,gender,birth_year,end_station_id,start_station_id,tripduration,usertype,stoptime,starttime
0,2,1976,302,285,733,Subscriber,2015-02-03 00:10:00+00,2015-02-02 23:58:00+00
1,1,1961,537,453,634,Subscriber,2015-02-03 00:05:00+00,2015-02-02 23:55:00+00
2,1,1988,394,345,689,Subscriber,2015-02-03 00:02:00+00,2015-02-02 23:50:00+00
3,1,1988,417,224,188,Subscriber,2015-02-02 23:50:00+00,2015-02-02 23:47:00+00
4,1,1977,440,484,371,Subscriber,2015-02-02 23:52:00+00,2015-02-02 23:45:00+00
5,1,1979,397,496,2111,Subscriber,2015-02-03 00:18:00+00,2015-02-02 23:43:00+00
6,1,1983,439,280,518,Subscriber,2015-02-02 23:52:00+00,2015-02-02 23:43:00+00
7,2,1982,489,505,813,Subscriber,2015-02-02 23:56:00+00,2015-02-02 23:42:00+00
8,1,1987,507,509,764,Subscriber,2015-02-02 23:55:00+00,2015-02-02 23:42:00+00
9,1,1986,368,402,560,Subscriber,2015-02-02 23:46:00+00,2015-02-02 23:37:00+00


In [98]:
# Task 2.2: average trip duration for each weekend day.
# PostgreSQL's EXTRACT(DOW ...) numbers days 0 (Sunday) through 6 (Saturday),
# so IN (0,6) keeps Sunday and Saturday only.
# The result has one row per weekend day (not a single combined weekend
# average), so LIMIT 10 never actually truncates it.
task2_2 = '''
SELECT EXTRACT(DOW FROM starttime) AS date_of_week,
AVG(tripduration) AS ave_tripduration 
FROM citibike 
WHERE extract(DOW FROM starttime) IN (0,6)
GROUP BY date_of_week
ORDER BY ave_tripduration DESC
LIMIT 10
'''

In [99]:
# Fetch the Task 2.2 result as CSV text and parse it into a DataFrame; being
# the cell's last expression, the frame is displayed below.
pd.read_csv(StringIO.StringIO(queryCartoDB(task2_2)), sep=',')

Unnamed: 0,date_of_week,ave_tripduration
0,6,686.460825
1,0,638.1492


In [102]:
# Task 2.3: average trip duration for each weekday.
# Same query as the weekend one with the day filter inverted: with
# PostgreSQL's DOW numbering (0 = Sunday ... 6 = Saturday), NOT IN (0,6)
# keeps Monday through Friday.
# At most five rows can come back, so LIMIT 10 never actually truncates.
task2_3 = '''
SELECT EXTRACT(DOW FROM starttime) AS date_of_week,
AVG(tripduration) AS ave_tripduration 
FROM citibike 
WHERE extract(DOW FROM starttime) NOT IN (0,6)
GROUP BY date_of_week
ORDER BY ave_tripduration DESC
LIMIT 10
'''

In [103]:
# Fetch the Task 2.3 result as CSV text and parse it into a DataFrame; being
# the cell's last expression, the frame is displayed below.
pd.read_csv(StringIO.StringIO(queryCartoDB(task2_3)), sep=',')

Unnamed: 0,date_of_week,ave_tripduration
0,1,920.862234
1,2,767.224443
2,3,697.556559
3,5,637.116968
4,4,623.39672


## Task 3 — Working with Space
- Show the list of start station locations
  - Using `GROUP BY`
- Show the number of trips started per station
- … but only for stations within 500 m of Times Square!
  - The coordinates of Times Square are (40.7577, -73.9857)

In [105]:
# Task 3.1: one row per distinct start-station coordinate pair.
# Grouping by latitude/longitude collapses all trips from the same location;
# MIN(cartodb_id) picks one representative id for each group.
# NOTE(review): CDB_LatLng / CDB_TransformToWebmercator are Carto helpers that
# presumably build a point from lat/lng and reproject it to Web Mercator.
# Confirm against the Carto documentation.
task3_1 = '''
SELECT CDB_TransformToWebmercator(CDB_LatLng( start_station_latitude,start_station_longitude)) as the_geom_webmercator,
MIN(cartodb_id) as cartodb_id
FROM citibike
GROUP BY start_station_latitude, start_station_longitude
'''