In [None]:
# !sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic tqdm

In [None]:
import requests
from sqlalchemy import text

In [None]:
# Endpoint for the station information feed: per-station attributes that
# remain stable over time.
url_stations = (
    "https://gbfs.citibikenyc.com/gbfs/en/station_information.json"
)

In [None]:
# Endpoint for the station status feed: the live, time-varying state of every
# station (e.g., how many bikes are currently available).
url_status = (
    "https://gbfs.citibikenyc.com/gbfs/en/station_status.json"
)

In [None]:
# Download the time-invariant station information feed.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP error (4xx/5xx) right
# here instead of as a confusing JSON or KeyError failure in a later cell.
response = requests.get(url_stations, timeout=30)
response.raise_for_status()
results = response.json()

In [None]:
# The JSON returned by the Citibike API carries more than we need; keep only
# the list of per-station records found under data -> stations.
feed_data = results["data"]
stations = feed_data["stations"]

In [None]:
# The DataFrame below exists only to make the station records easy to inspect;
# the insertion task itself does not use dataframes.
# (See the A6 notebook if you want to use Pandas for the insertion.)
import pandas as pd

df_stations = pd.DataFrame(data=stations)
# head() shows the first 5 rows by default.
df_stations.head()

In [None]:
import uuid

import sqlalchemy
from sqlalchemy import create_engine, text

# Server-level connection URL (no database selected yet).
# NOTE(review): the credentials are hardcoded in the notebook; consider reading
# them from environment variables if this is ever pointed at a private server.
conn_string = "mysql+pymysql://{user}:{password}@{host}/".format(
    host="db.ipeirotis.org", user="student", password="dwdstudent2015"
)

db_name = "public"
create_db_query = (
    f"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'"
)

# Create the database through a short-lived, server-level engine.
# Engine.execute() with a raw SQL string was removed in SQLAlchemy 2.0;
# engine.begin() + text() works on both the 1.x and the 2.x series.
server_engine = create_engine(conn_string)
with server_engine.begin() as conn:
    conn.execute(text(create_db_query))
server_engine.dispose()

# Select the database through the connection URL instead of issuing a
# one-off "USE" statement: "USE" only affects the single pooled connection it
# ran on, so any new connection opened by the pool would have no default
# database. With the database in the URL, every connection starts in it.
engine = create_engine(conn_string + db_name)

# To avoid conflicts between people writing in the same database, we add a
# random suffix to the table names. The suffix is created only once per kernel
# session, so re-running this cell keeps pointing at the same tables.
if "suffix" not in globals():
    suffix = str(uuid.uuid4())[:8]
print(suffix)

In [None]:
# Create the two tables: this one stores the time-invariant station data,
# the next cell creates the table for the time-varying station status data.
stations_table = f"Stations_{suffix}"

# The table name is qualified with the database name, like every other
# statement in this notebook, so the statement does not depend on which
# default database the connection happens to have.
sql = f"""CREATE TABLE IF NOT EXISTS {db_name}.{stations_table}
                                (station_id int,
                                name varchar(250),
                                capacity int,
                                lat float,
                                lon float,
                                region_id int,
                                short_name varchar(250),
                                rental_url varchar(250),
                                eightd_has_key_dispenser bool,
                                PRIMARY KEY(station_id)
                 )"""

# engine.begin() + text() works on SQLAlchemy 1.x and 2.x
# (Engine.execute() with a raw string was removed in 2.0).
with engine.begin() as conn:
    conn.execute(text(sql))

In [None]:
# Create the time-varying table. A station can appear many times, once per
# reported timestamp, hence the composite primary key.
status_table = f"Status_{suffix}"

# The table name is qualified with the database name, like every other
# statement in this notebook, so the statement does not depend on which
# default database the connection happens to have.
sql = f"""CREATE TABLE IF NOT EXISTS {db_name}.{status_table}
                                (station_id int,
                                last_reported datetime,
                                num_bikes_available int,
                                num_ebikes_available int,
                                num_bikes_disabled int,
                                num_docks_available int,
                                num_docks_disabled int,
                                is_installed bool,
                                is_renting bool,
                                is_returning bool,
                                PRIMARY KEY(station_id, last_reported)
                                )"""

# engine.begin() + text() works on SQLAlchemy 1.x and 2.x
# (Engine.execute() with a raw string was removed in 2.0).
with engine.begin() as conn:
    conn.execute(text(sql))

In [None]:
# Peek at the first station record to see which fields each entry carries.
stations[0]

In [None]:
# Insert the time-invariant station data into the stations table.
# Notice that we use INSERT IGNORE so that even when we add the same entry
# again, we do not get an error that the row already exists. We do get
# warnings, but this is expected.

from sqlalchemy.sql import text
from tqdm.autonotebook import tqdm

query_template = text(
    f"""INSERT IGNORE INTO {db_name}.{stations_table}
            (station_id, name, capacity, lat, lon,
            region_id, short_name, rental_url, eightd_has_key_dispenser)
        VALUES (:station_id, :name, :capacity, :lat, :lon, :region_id,
                    :short_name, :rental_url, :eightd_has_key_dispenser)"""
)

# One connection and one transaction for the whole load, instead of checking
# out a connection per row. Parameters are passed as a dict, which works on
# SQLAlchemy 1.x and 2.x (Engine.execute(stmt, **kwargs) was removed in 2.0).
# The transaction is committed when the "with" block exits without an error.
with engine.begin() as conn:
    # The tqdm(stations) shows a progress bar
    for entry in tqdm(stations):
        query_parameters = {
            # station_id is the primary key, so it must be present.
            "station_id": int(entry["station_id"]),
            # Remaining fields use .get(): a missing field is stored as NULL.
            "name": entry.get("name"),
            "capacity": entry.get("capacity"),
            "lat": entry.get("lat"),
            "lon": entry.get("lon"),
            "region_id": entry.get("region_id"),
            "short_name": entry.get("short_name"),
            "rental_url": entry.get("rental_url"),
            "eightd_has_key_dispenser": entry.get("eightd_has_key_dispenser"),
        }

        conn.execute(query_template, query_parameters)

In [None]:
# Read the stations table back from the database to verify the insert.
select_stations_query = f"SELECT * FROM {db_name}.{stations_table}"
check = pd.read_sql(select_stations_query, con=engine)
check

In [None]:
%matplotlib inline
check.plot(kind="scatter", x="lon", y="lat", s=1, figsize=(10, 10))

In [None]:
# Download the live station status feed.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP error (4xx/5xx) right
# here instead of as a confusing JSON or KeyError failure further down.
response = requests.get(url_status, timeout=30)
response.raise_for_status()
results = response.json()
status = results["data"]["stations"]
# Peek at the first status record to see which fields each entry carries.
status[0]

In [None]:
# Now we insert the data about the time-varying elements of the citibike stations
from datetime import datetime

query_template = text(
    f"""INSERT IGNORE INTO {db_name}.{status_table}(station_id,
                                            num_bikes_available,
                                            num_ebikes_available,
                                            num_bikes_disabled,
                                            num_docks_available,
                                            num_docks_disabled,
                                            is_installed,
                                            is_renting,
                                            is_returning,
                                            last_reported)
                    VALUES (:station_id, :num_bikes_available, :num_ebikes_available, :num_bikes_disabled,
                    :num_docks_available, :num_docks_disabled, :is_installed, :is_renting, :is_returning, :last_reported)"""
)

# One connection and one transaction for the whole load, instead of checking
# out a connection per row. Parameters are passed as a dict, which works on
# SQLAlchemy 1.x and 2.x (Engine.execute(stmt, **kwargs) was removed in 2.0).
# The transaction is committed when the "with" block exits without an error.
with engine.begin() as conn:
    for entry in tqdm(status):
        query_parameters = {
            # station_id and last_reported form the primary key, so both are
            # required and accessed directly.
            "station_id": int(entry["station_id"]),
            # NOTE: fromtimestamp() converts the epoch value to a naive
            # datetime in the local timezone of the machine running this.
            "last_reported": datetime.fromtimestamp(entry["last_reported"]),
            # Non-key fields use .get(), matching the station information
            # insert: a record that lacks one of them is stored with NULL in
            # that column instead of aborting the whole load with a KeyError.
            "num_bikes_available": entry.get("num_bikes_available"),
            "num_bikes_disabled": entry.get("num_bikes_disabled"),
            "num_ebikes_available": entry.get("num_ebikes_available"),
            "num_docks_available": entry.get("num_docks_available"),
            "num_docks_disabled": entry.get("num_docks_disabled"),
            "is_installed": entry.get("is_installed"),
            "is_renting": entry.get("is_renting"),
            "is_returning": entry.get("is_returning"),
        }

        conn.execute(query_template, query_parameters)

In [None]:
# Read the status table back from the database to verify the insert.
select_status_query = f"SELECT * FROM {db_name}.{status_table}"
check = pd.read_sql(select_status_query, con=engine)
check

In [None]:
# Clean up: remove the status table created by this notebook run.
drop_table_query = f"DROP TABLE IF EXISTS {db_name}.{status_table}"
print(drop_table_query)
# engine.begin() + text() works on SQLAlchemy 1.x and 2.x
# (Engine.execute() with a raw string was removed in 2.0).
with engine.begin() as conn:
    conn.execute(text(drop_table_query))

In [None]:
# Clean up: remove the stations table created by this notebook run.
drop_table_query = f"DROP TABLE IF EXISTS {db_name}.{stations_table}"
print(drop_table_query)
# engine.begin() + text() works on SQLAlchemy 1.x and 2.x
# (Engine.execute() with a raw string was removed in 2.0).
with engine.begin() as conn:
    conn.execute(text(drop_table_query))