In [41]:
from arango import ArangoClient

In [42]:
import requests

In [43]:
# Connect to the ArangoDB server that holds the BlueBikes data.
# Connection settings are read from environment variables so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the values this notebook originally hard-coded.
import os

client = ArangoClient(hosts=os.environ.get('ARANGO_HOST', 'http://localhost:8529'))
db = client.db(
    os.environ.get('ARANGO_DB', 'BlueBikesProject'),
    username=os.environ.get('ARANGO_USERNAME', 'root'),
    password=os.environ.get('ARANGO_PASSWORD', 'root'),
)

In [None]:
# Define the collections used by this project.
# `create_collection` fails when the collection already exists, so each
# collection is created only when it is missing and re-opened otherwise; this
# keeps the cell safe to re-run against a database that is already set up.
def _get_or_create_collection(name, edge=False):
    """Return the collection `name`, creating it (as an edge collection if `edge`) when absent."""
    if db.has_collection(name):
        return db.collection(name)
    return db.create_collection(name, edge=edge)


region_col = _get_or_create_collection('regions')
station_col = _get_or_create_collection('stations')
status_col = _get_or_create_collection('station_status')
trip_col = _get_or_create_collection('trips', edge=True)  # trips are stored as edges

In [45]:
# Download the GBFS discovery document, whose feed list is searched for
# individual feed URLs in the cells below.
gbfs_url = 'https://gbfs.bluebikes.com/gbfs/gbfs.json'
gbfs_response = requests.get(gbfs_url, timeout=30)  # do not hang forever on a stalled connection
gbfs_response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
gbfs_data = gbfs_response.json()

In [46]:
# Look up the URL of the 'system_regions' feed in the GBFS discovery document.
region_url = next(
    (feed['url'] for feed in gbfs_data['data']['en']['feeds'] if feed['name'] == 'system_regions'),
    None,
)
if region_url is None:
    # Fail here with a clear message rather than later inside requests.get(None).
    raise LookupError("GBFS discovery document has no 'system_regions' feed")

In [47]:
# Fetch the regions feed. The timeout prevents an indefinite hang, and
# raise_for_status() keeps an HTTP error body from being written to disk.
response = requests.get(region_url, timeout=30)
response.raise_for_status()

In [48]:
# Write the raw bytes of the downloaded response to regions.json.

regions_payload = response.content
with open('regions.json', mode='wb') as regions_file:
    regions_file.write(regions_payload)

In [49]:
# ArangoDB runs in a Docker container here, so the Docker SDK is used to run the arangoimport command inside that container

import docker
import os

In [50]:
# Create a Docker SDK client via docker.from_env().
# NOTE(review): this rebinds `client`, which the first cell bound to the
# ArangoClient. `db` was already obtained from it, but the ArangoClient is no
# longer reachable under this name; consider a distinct name such as
# `docker_client` (the next cell would need the same rename).
client = docker.from_env()

In [51]:
# Look up the ArangoDB container. A container ID is specific to one machine,
# so it can be overridden through ARANGO_CONTAINER; the fallback is the ID this
# notebook was originally run against.
container = client.containers.get(os.environ.get('ARANGO_CONTAINER', '5aac08e9d503'))


In [57]:
# Run arangoimport inside the container to load the regions data.
cmd = "arangoimport --file regions.json --type json --collection regions --server.endpoint tcp://{ip}:8529 --server.username root --server.password root --server.database BlueBikesProject".format(ip="172.17.0.2")
result = container.exec_run(cmd)
print(result.output.decode())
# exec_run does not raise when the command fails, so check the exit status
# explicitly instead of silently carrying on after a failed import.
if result.exit_code != 0:
    raise RuntimeError("arangoimport of regions.json failed with exit code {}".format(result.exit_code))


Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             regions
overwrite coll. prefix: no
create:                 no
create database:        no
source filename:        regions.json
file type:              json
threads:                4
on duplicate:           error
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
2023-04-06T02:59:54Z [305] INFO [9ddf3] {general} processed 1.3 KB (3%) of input file

created:          20
updated/replaced: 0
ignored:          0



In [58]:
# Look up the URL of the 'station_information' feed in the GBFS discovery document.
station_url = next(
    (feed['url'] for feed in gbfs_data['data']['en']['feeds'] if feed['name'] == 'station_information'),
    None,
)
if station_url is None:
    # Fail here with a clear message rather than later inside requests.get(None).
    raise LookupError("GBFS discovery document has no 'station_information' feed")

In [59]:
# Fetch the station information feed. The timeout prevents an indefinite hang,
# and raise_for_status() keeps an HTTP error body from being written to disk.
response = requests.get(station_url, timeout=30)
response.raise_for_status()

In [60]:
# Write the raw bytes of the downloaded response to stations.json.

stations_payload = response.content
with open('stations.json', mode='wb') as stations_file:
    stations_file.write(stations_payload)

In [61]:
# Run arangoimport inside the container to load the stations data.
cmd = "arangoimport --file stations.json --type json --collection stations --server.endpoint tcp://{ip}:8529 --server.username root --server.password root --server.database BlueBikesProject".format(ip="172.17.0.2")
result = container.exec_run(cmd)
print(result.output.decode())
# exec_run does not raise when the command fails, so check the exit status
# explicitly instead of silently carrying on after a failed import.
if result.exit_code != 0:
    raise RuntimeError("arangoimport of stations.json failed with exit code {}".format(result.exit_code))

Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             stations
overwrite coll. prefix: no
create:                 no
create database:        no
source filename:        stations.json
file type:              json
threads:                4
on duplicate:           error
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
2023-04-06T03:11:31Z [317] INFO [9ddf3] {general} processed 288.8 KB (3%) of input file

created:          453
updated/replaced: 0
ignored:          0



In [66]:
# Look up the URL of the 'station_status' feed in the GBFS discovery document.
stationstatus_url = next(
    (feed['url'] for feed in gbfs_data['data']['en']['feeds'] if feed['name'] == 'station_status'),
    None,
)
if stationstatus_url is None:
    # Fail here with a clear message rather than later inside requests.get(None).
    raise LookupError("GBFS discovery document has no 'station_status' feed")

In [67]:
# Fetch the station status feed. The timeout prevents an indefinite hang, and
# raise_for_status() keeps an HTTP error body from being written to disk.
response = requests.get(stationstatus_url, timeout=30)
response.raise_for_status()

In [68]:
# Write the raw bytes of the downloaded response to station_status.json.

status_payload = response.content
with open('station_status.json', mode='wb') as status_file:
    status_file.write(status_payload)

In [69]:
# Run arangoimport inside the container to load the station_status data.
cmd = "arangoimport --file station_status.json --type json --collection station_status --server.endpoint tcp://{ip}:8529 --server.username root --server.password root --server.database BlueBikesProject".format(ip="172.17.0.2")
result = container.exec_run(cmd)
print(result.output.decode())
# exec_run does not raise when the command fails, so check the exit status
# explicitly instead of silently carrying on after a failed import.
if result.exit_code != 0:
    raise RuntimeError("arangoimport of station_status.json failed with exit code {}".format(result.exit_code))

Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             station_status
overwrite coll. prefix: no
create:                 no
create database:        no
source filename:        station_status.json
file type:              json
threads:                4
on duplicate:           error
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
2023-04-06T03:18:33Z [329] INFO [9ddf3] {general} processed 199.5 KB (3%) of input file

created:          453
updated/replaced: 0
ignored:          0



In [70]:
# Clean the trip data, which is provided as a CSV file

import pandas as pd

In [71]:
# Load the raw trip export. Postal codes are identifiers rather than numbers,
# so the column is pinned to str: this preserves leading zeros (e.g. 02138) and
# avoids mixed-type inference when the file is parsed in chunks.
df = pd.read_csv('tripdata022023.csv', dtype={'postal code': str})

In [72]:
# Count the missing values in each column.

df.isna().sum()

tripduration                  0
starttime                     0
stoptime                      0
start station id              0
start station name            0
start station latitude        0
start station longitude       0
end station id                0
end station name              0
end station latitude          0
end station longitude         0
bikeid                        0
usertype                      0
postal code                9696
dtype: int64

In [78]:
# Replace missing postal codes with 0 (rows are kept; only the NaN entries change).

df['postal code'] = df['postal code'].fillna(value=0)

In [79]:
# 'trips' is an edge collection, so the start and end station id columns are
# renamed to _from and _to respectively.

df = df.rename(columns={'start station id': '_from', 'end station id': '_to'})

In [82]:
# Convert the station ids to strings and prefix them with 'stations/' so they
# take the form of document ids in the stations collection.
# NOTE(review): assumes station documents are keyed by station id — confirm.

for endpoint in ('_from', '_to'):
    df[endpoint] = 'stations/' + df[endpoint].astype(str)

In [83]:
# Display the transformed frame to check the _from/_to values (the notebook
# renders a truncated preview of the first and last rows).
df

Unnamed: 0,tripduration,starttime,stoptime,_from,start station name,start station latitude,start station longitude,_to,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,263,2023-02-01 00:00:50.0410,2023-02-01 00:05:13.7090,stations/330,30 Dane St,42.381001,-71.104025,stations/110,Harvard University Gund Hall at Quincy St / Ki...,42.376369,-71.114025,7334,Subscriber,02138
1,447,2023-02-01 00:02:50.1390,2023-02-01 00:10:17.4090,stations/413,Kennedy-Longfellow School 158 Spring St,42.369553,-71.085790,stations/386,Sennott Park Broadway at Norfolk Street,42.368605,-71.099302,3257,Subscriber,02139
2,302,2023-02-01 00:09:00.7270,2023-02-01 00:14:03.2560,stations/554,Forsyth St at Huntington Ave,42.339202,-71.090511,stations/27,Roxbury Crossing T Stop - Columbus Ave at Trem...,42.331184,-71.095171,7824,Subscriber,02120
3,689,2023-02-01 00:16:00.6020,2023-02-01 00:27:30.4070,stations/87,Harvard University Housing - 115 Putnam Ave at...,42.366621,-71.114214,stations/178,MIT Pacific St at Purrington St,42.359573,-71.101295,8266,Subscriber,02139
4,468,2023-02-01 00:18:10.3820,2023-02-01 00:25:59.3730,stations/55,Boylston St at Massachusetts Ave,42.347406,-71.086784,stations/32,Landmark Center - Brookline Ave at Park Dr,42.343691,-71.102353,7431,Subscriber,02215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152970,417,2023-02-28 23:57:36.9220,2023-03-01 00:04:34.5830,stations/46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,stations/331,Huntington Ave at Mass Art,42.336586,-71.098870,3442,Subscriber,02120
152971,872,2023-02-28 23:58:06.8950,2023-03-01 00:12:39.3410,stations/11,Longwood Ave at Binney St,42.338629,-71.106500,stations/491,Harrison Ave at E. Dedham St,42.339194,-71.069750,2876,Subscriber,02446
152972,384,2023-02-28 23:58:20.9490,2023-03-01 00:04:45.2180,stations/553,Cambridge Crossing at North First Street,42.371141,-71.076198,stations/413,Kennedy-Longfellow School 158 Spring St,42.369553,-71.085790,6349,Subscriber,02141
152973,593,2023-02-28 23:58:36.5470,2023-03-01 00:08:29.6560,stations/36,Copley Square - Dartmouth St at Boylston St,42.349928,-71.077392,stations/152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,8277,Subscriber,33332


In [84]:
# Write the cleaned trips to a new CSV file, leaving out the DataFrame index.

df.to_csv(path_or_buf='tripdata022023_clean.csv', index=False)

In [85]:
# Run arangoimport inside the container to load the trips data.
cmd = "arangoimport --file tripdata022023_clean.csv --type csv --collection trips --server.endpoint tcp://{ip}:8529 --server.username root --server.password root --server.database BlueBikesProject".format(ip="172.17.0.2")
result = container.exec_run(cmd)
print(result.output.decode())
# exec_run does not raise when the command fails, so check the exit status
# explicitly instead of silently carrying on after a failed import.
if result.exit_code != 0:
    raise RuntimeError("arangoimport of tripdata022023_clean.csv failed with exit code {}".format(result.exit_code))

Connected to ArangoDB 'http+tcp://172.17.0.2:8529, version: 3.10.4, database: 'BlueBikesProject', username: 'root'
----------------------------------------
database:               BlueBikesProject
collection:             trips
overwrite coll. prefix: no
create:                 no
create database:        no
source filename:        tripdata022023_clean.csv
file type:              csv
quote:                  "
separator:              
headers file:           
threads:                4
on duplicate:           error
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting CSV import...
2023-04-06T03:51:14Z [341] INFO [9ddf3] {general} processed 1.3 MB (3%) of input file
2023-04-06T03:51:14Z [341] INFO [9ddf3] {general} processed 2.3 MB (6%) of input file
2023-04-06T03:51:14Z [341] INFO [9ddf3] {general} processed 3.4 MB (9%) of input file
2023-04-06T03:51:14Z [341] INFO [9ddf3] {general} processed 4.4 MB (12%) of input file
2023-04-06T03:51:14