In [8]:
import requests
import pandas as pd
from arango import ArangoClient
import subprocess
import docker
import os
import tarfile

In [9]:
def _fetch_json(url):
    """Download `url` and return the decoded JSON body, raising on HTTP errors."""
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()


def refresh_data(container_id='5aac08e9d503', arango_ip='172.17.0.2'):
    """Reload the regions, stations and station_status collections from GBFS.

    Steps:
      1. Discover the feed URLs from the Blue Bikes GBFS index and download
         the system_regions, station_information and station_status feeds.
      2. Write each feed to a CSV file in the current working directory
         (region_id / station_id are renamed to '_key' for the first two).
      3. Drop and recreate the three collections in the 'BlueBikesProject'
         database.
      4. Pack each CSV into a .tar.gz, copy it into '/Data/' of the Docker
         container and run `arangoimport` inside that container, printing
         the tool's output.

    Parameters
    ----------
    container_id : str
        ID (or name) of the Docker container in which arangoimport is run.
    arango_ip : str
        IP address used for the arangoimport `--server.endpoint` option.

    Raises
    ------
    RuntimeError
        If one of the required feeds is not listed in the GBFS index.
    requests.HTTPError
        If any of the GBFS downloads returns an error status.
    """
    # (collection, GBFS feed name, key inside 'data', id column -> '_key', file stem)
    datasets = [
        ('regions', 'system_regions', 'regions', 'region_id', 'RegionsData'),
        ('stations', 'station_information', 'stations', 'station_id', 'StationsData'),
        ('station_status', 'station_status', 'stations', None, 'StationStatusData'),
    ]

    # --- 1. Download everything first, so a network failure cannot leave the
    #        database with freshly dropped (empty) collections.
    gbfs_url = 'https://gbfs.bluebikes.com/gbfs/gbfs.json'
    gbfs_data = _fetch_json(gbfs_url)
    feed_urls = {feed['name']: feed['url'] for feed in gbfs_data['data']['en']['feeds']}

    missing_feeds = [feed_name for _, feed_name, _, _, _ in datasets if feed_name not in feed_urls]
    if missing_feeds:
        raise RuntimeError('GBFS index does not list feed(s): ' + ', '.join(missing_feeds))

    # --- 2. Parse each feed into a dataframe and save it as CSV.
    for _, feed_name, payload_key, id_column, file_stem in datasets:
        # Each feed is read from its OWN response; station_status previously
        # reused the station_information payload by mistake.
        records = _fetch_json(feed_urls[feed_name])['data'][payload_key]
        frame = pd.DataFrame(records)
        if id_column is not None:
            # '_key' makes the id act as the document key in ArangoDB.
            frame = frame.rename(columns={id_column: '_key'})
        frame.to_csv(file_stem + '.csv')

    # --- 3. Reset the target collections.
    # NOTE(review): credentials are hard-coded here and in the arangoimport
    # command below; consider moving them to configuration.
    arango_client = ArangoClient(hosts='http://localhost:8529')
    db = arango_client.db('BlueBikesProject', username='root', password='root')
    for collection_name, _, _, _, _ in datasets:
        if db.has_collection(collection_name):
            db.delete_collection(collection_name)
    for collection_name, _, _, _, _ in datasets:
        db.create_collection(collection_name)

    # --- 4. ArangoDB runs in Docker, so arangoimport is executed in the container.
    docker_client = docker.from_env()
    container = docker_client.containers.get(container_id)

    # put_archive expects a tar stream, so each CSV is wrapped in a tar archive.
    for _, _, _, _, file_stem in datasets:
        archive_name = file_stem + '.tar.gz'
        with tarfile.open(archive_name, 'w:gz') as archive:
            archive.add(file_stem + '.csv')
        with open(archive_name, 'rb') as archive_file:
            container.put_archive('/Data/', archive_file.read())

    import_command = (
        "arangoimport --file {csv} --type csv --collection {collection} "
        "--server.endpoint tcp://{ip}:8529 --server.username root "
        "--server.password root --server.database BlueBikesProject"
    )
    for collection_name, _, _, _, file_stem in datasets:
        result = container.exec_run(
            import_command.format(csv=file_stem + '.csv', collection=collection_name, ip=arango_ip)
        )
        print(result.output.decode())

In [10]:
# Run the refresh: re-download the GBFS feeds and re-import the regions,
# stations and station_status collections; arangoimport's output is printed below.

refresh_data()

Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             regions
overwrite coll. prefix: no
create:                 no
create database:        no
source filename:        RegionsData.csv
file type:              csv
quote:                  "
separator:              
headers file:           
threads:                4
on duplicate:           error
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting CSV import...

created:          20
updated/replaced: 0
ignored:          0
lines read:       22

Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             stations
overwrite coll. prefix: no
create:                 no
cre