In this notebook
- get raw data from CityBikes
- make a table of trips between stations
- use co-ordinates to get best routes between stations
- connect to a database

In [153]:
!pip install -q geojson geoalchemy2 geopandas requests shapely pandas sqlalchemy psycopg2-binary tqdm

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
^C
[31mERROR: Operation cancelled by user[0m


In [210]:
import csv 
import json
import os
import requests
import sys

import geojson
from geoalchemy2 import Geometry, WKTElement
from geojson import Feature, Point, FeatureCollection
import geopandas as gpd
from geopandas import GeoDataFrame
import pandas as pd
from sqlalchemy import *
from shapely.geometry import Point
import psycopg2
import datetime
from datetime import datetime

### 1. Import from source ###

In [251]:
data = pd.read_csv("data/201908-citibike-tripdata.csv")

In [252]:
data.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,393,2019-08-01 00:00:01.4680,2019-08-01 00:06:35.3780,531.0,Forsyth St & Broome St,40.718939,-73.992663,408.0,Market St & Cherry St,40.710762,-73.994004,35305,Subscriber,1996,2
1,627,2019-08-01 00:00:01.9290,2019-08-01 00:10:29.7840,274.0,Lafayette Ave & Fort Greene Pl,40.686919,-73.976682,3409.0,Bergen St & Smith St,40.686744,-73.990632,38822,Subscriber,1998,2
2,1132,2019-08-01 00:00:04.0480,2019-08-01 00:18:56.1650,2000.0,Front St & Washington St,40.702551,-73.989402,3388.0,President St & Henry St,40.6828,-73.999904,18373,Subscriber,1988,1
3,1780,2019-08-01 00:00:04.1630,2019-08-01 00:29:44.7940,479.0,9 Ave & W 45 St,40.760193,-73.991255,473.0,Rivington St & Chrystie St,40.721101,-73.991925,25002,Subscriber,1988,1
4,1517,2019-08-01 00:00:05.4580,2019-08-01 00:25:23.4550,3312.0,1 Ave & E 94 St,40.781721,-73.94594,3312.0,1 Ave & E 94 St,40.781721,-73.94594,31198,Subscriber,1965,2


In [253]:
list(data)

['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station name',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station name',
 'end station latitude',
 'end station longitude',
 'bikeid',
 'usertype',
 'birth year',
 'gender']

### 2. Subset by date  ###


In [254]:
starttimes = data['starttime'] 
data['starttime'] = [datetime.strptime(time, "%Y-%m-%d %H:%M:%S.%f") for time in starttimes]

In [231]:
mydate = datetime.strptime('2019-08-31', '%Y-%m-%d')

In [257]:
 data = data[data.starttime.dt.date  == pd.to_datetime('2019-08-31').date()]
#mydate.date()

In [258]:
data.shape

(64237, 15)

In [259]:
od = data.drop(['tripduration', 'starttime', 'stoptime',  'usertype', 'birth year', 'gender'], axis  =1 )

In [260]:
od.head()

Unnamed: 0,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid
2279987,501.0,FDR Drive & E 35 St,40.744219,-73.971212,3733.0,Avenue C & E 18 St,40.730563,-73.973984,21551
2279988,128.0,MacDougal St & Prince St,40.727103,-74.002971,3016.0,Kent Ave & N 7 St,40.720368,-73.961651,32313
2279989,3285.0,W 87 St & Amsterdam Ave,40.78839,-73.9747,3285.0,W 87 St & Amsterdam Ave,40.78839,-73.9747,30825
2279990,3087.0,Metropolitan Ave & Meeker Ave,40.714133,-73.952344,460.0,S 4 St & Wythe Ave,40.712859,-73.965903,35046
2279991,485.0,W 37 St & 5 Ave,40.75038,-73.98339,3136.0,5 Ave & E 63 St,40.766368,-73.971518,33038


In [261]:
pairs = od.groupby(['start station id', 'start station latitude', 'start station longitude' , 
                'end station id', 'end station latitude', 'end station longitude']).size().reset_index(name='count')

In [263]:
pairs.shape

(34656, 7)

In [72]:
ends = od[['end station name']].rename(columns={'end station name': 'station'})

stations = pd.concat([starts, ends])
stations = stations.drop_duplicates().reset_index(drop=True)

stations.shape

stations.head()

Unnamed: 0,station
0,Forsyth St & Broome St
1,Lafayette Ave & Fort Greene Pl
2,Front St & Washington St
3,9 Ave & W 45 St
4,1 Ave & E 94 St


In [186]:
# Creating SQLAlchemy's engine to use
engine = create_engine('postgresql://postgres:postgres@gisdata.cpu2z0a5bugq.us-east-2.rds.amazonaws.com:5432/postgres')


In [89]:
r = requests.get('https://gbfs.citibikenyc.com/gbfs/es/station_information.json')

bikeshare_stations = pd.DataFrame(json.loads(r.content)['data']['stations'])[['station_id', 'name', 'lat', 'lon']].astype({
    'station_id': 'float64',
})



In [91]:
bikeshare_stations

Unnamed: 0,station_id,name,lat,lon
0,237.0,E 11 St & 2 Ave,40.730473,-73.986724
1,301.0,E 2 St & Avenue B,40.722174,-73.983688
2,307.0,Canal St & Rutgers St,40.714275,-73.989900
3,350.0,Clinton St & Grand St,40.715595,-73.987030
4,358.0,Christopher St & Greenwich St,40.732916,-74.007114
...,...,...,...,...
858,3853.0,Harrison Pl & Porter Ave,40.706860,-73.928510
859,3854.0,Morgan Ave & Maspeth Ave,40.716657,-73.936370
860,3855.0,Frost St & Debevoise Ave,40.718820,-73.939480
861,3857.0,Engert Ave & McGuinness Blvd,40.721580,-73.945460


In [194]:
df = bikeshare_stations

## geometry 
gdf = GeoDataFrame(
    df.drop(['lon', 'lat'], axis=1),
    crs={'init': 'epsg:4326'},
    geometry=[Point(xy) for xy in zip(df.lon, df.lat)]
)

def create_wkt_element(geom):
    return WKTElement(geom.wkt, srid=4326)

In [195]:
gdf.head()

Unnamed: 0,station_id,name,geometry
0,237.0,E 11 St & 2 Ave,POINT (-73.98672378000001 40.73047309)
1,301.0,E 2 St & Avenue B,POINT (-73.98368779 40.72217444)
2,307.0,Canal St & Rutgers St,POINT (-73.98990025000001 40.71427487)
3,350.0,Clinton St & Grand St,POINT (-73.98702950000001 40.71559509)
4,358.0,Christopher St & Greenwich St,POINT (-74.00711384 40.73291553)


In [196]:
gdf['geometry'] = gdf['geometry'].apply(create_wkt_element)
gdf.head()

Unnamed: 0,station_id,name,geometry
0,237.0,E 11 St & 2 Ave,POINT (-73.98672378000001 40.73047309)
1,301.0,E 2 St & Avenue B,POINT (-73.98368779 40.72217444)
2,307.0,Canal St & Rutgers St,POINT (-73.98990025000001 40.71427487)
3,350.0,Clinton St & Grand St,POINT (-73.98702950000001 40.71559509)
4,358.0,Christopher St & Greenwich St,POINT (-74.00711384 40.73291553)


In [197]:
 #   if_exists = replace: If table exists, drop it, recreate it, and insert data.
 #   if_exists = fail: If table exists, do nothing.
 #   if_exists = append: If table exists, insert data. Create if does not exist.

Create a table in PostGIS from a geodataframe

In [198]:
gdf.to_sql("stations", engine, if_exists='replace', index=False, dtype={'geometry': Geometry('POINT', 4326)})

SRID 4326 = WGS84 

World Geodetic System. 

![WGS84](images/WGS84.png)

Global Positioning System uses the World Geodetic System (WGS84) as its reference coordinate system.

PostGIS opens up the ability to store your data in a single coordinate system such as WGS84 (SRID 4326), and when you need something like Area, Distance, or Length, you use a function to create that column from your datain a projected coordinate system that will give you a local interpretation of your data in units that you want.

So for example, I could store students and schools in PostGIS both in WGS84/SRID:4326. When I want to calculate the distance between students and the schools they attend, I call a distance function on my geometry column, but also wrap a ST_Transform function around the geometry column first to 'project' the data into State Plane CO Central (SRID: 2877). This gives me a column for the distance of each student to their closest school in feet because SRID:2877 is a projected coordinate system that stores data in Feet.

## Fun fact ! ##

Well-known text (WKT) is a text markup language for representing vector geometry objects on a map. 


In [264]:

features = []

number_of_elements = pairs.shape[0]

item = 0

for index, pair in pairs.iterrows():
    
    source_coordinates = str(pair['start station longitude']) + ',' + str(pair['start station latitude']) + ';' 
    #print(type(source_coordinates))
    
    dest_coordinates = str(pair['end station longitude']) + ',' + str(pair['end station latitude']) 

    item += 1
    
    url =  'http://router.project-osrm.org/route/v1/driving/'+source_coordinates+dest_coordinates

    payload = {"steps":"true","geometries":"geojson"}

    try:
        response = requests.get(url,params=payload)
        data = response.json()
        if data:
            if 'routes' in data:
                d = data['routes'][0]['geometry']
                if d: 
                    d.update( {'count' : pair['count']} )
                    features.append(d)

    
    except requests.exceptions.RequestException as e:  # This is the correct syntax
        print(e)
   
    print(index)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


KeyboardInterrupt: 

Please iterate for all trips


In [268]:
features


[]

In [118]:
from shapely.geometry import Point, Polygon, MultiPolygon, LineString
new_features = []


for feature in features:
  line = LineString(feature['coordinates'])
  cnt = feature['count']
  
  feature = Feature(
    geometry=line,
    properties=cnt
  )

  new_features.append(feature)

In [119]:
len(new_features)

28

In [120]:

feature_collection = FeatureCollection(new_features)

with open('paths.geojson', 'w') as f:
  f.write(geojson.dumps(feature_collection))


In [None]:
query = "SELECT  \
n.area_name, \
SUM(ST_Length( \
    ST_Intersection(p.wkb_geometry::geography, \
                n.wkb_geometry::geography))) as length \
\
FROM neighborhoods n \
INNER JOIN paths p ON ST_Intersects(n.wkb_geometry, p.wkb_geometry) \
GROUP BY 1 "

con = engine.connect()
output = pd.read_sql_query(query, con)


### PostGIS exercises

In [None]:

postgres://ubuntu:nyc@localhost/nyc 

In [None]:
engine = create_engine('postgres://ubuntu:nyc@localhost/nyc ')


In [None]:
query  =  "SELECT * FROM pg_catalog.pg_tables where schemaname = 'public'"

In [None]:

ogr2ogr -f PostgreSQL PG:host='database-1.cpu2z0a5bugq.us-east-2.rds.amazonaws.com' port='5432' dbname='postgres' password ='postgres' user='postgres' 

In [None]:
engine = create_engine('postgresql://postgres:postgres@database-1.cpu2z0a5bugq.us-east-2.rds.amazonaws.com:5432/postgres')


In [None]:
con = engine.connect()

In [None]:
query = "SELECT * FROM pg_catalog.pg_tables where schemaname = 'public'"

pd.read_sql_query(query, con)