In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from pyproj import Geod
import folium   # using folium to display the data on a map

In [2]:
def read_data():
    '''
    Download the monthly Edinburgh cycle hire trip CSVs and stack them.

    Covers every month of 2019 followed by January to April 2020, in
    chronological order.

    Output: a single DataFrame holding all the monthly files, with a fresh
            0..n-1 index
    '''
    base_url = "https://data.urbansharing.com/edinburghcyclehire.com/trips/v1/"

    # (year, last month to download) - files are named <year>/<two-digit month>.csv
    coverage = (('2019', 12), ('2020', 4))
    urls = [
        f"{base_url}{year}/{month_no:02d}.csv"
        for year, last_month in coverage
        for month_no in range(1, last_month + 1)
    ]

    monthly_frames = [pd.read_csv(url) for url in urls]
    return pd.concat(monthly_frames, ignore_index=True)

In [3]:
def distance(startLat,startLon,endLat,endLon):
    '''
    Compute the distance between a start point and an end point on the earth.

    Inputs: startLat - starting latitude
            startLon - starting longitude
            endLat - ending latitude
            endLon - ending longitude
    Output: distance between the two coordinates in meters
    '''
    # Model the earth as the WGS84 ellipsoid
    wgs84 = Geod(ellps='WGS84')

    # Geod.inv takes longitude before latitude, i.e. the reverse of this
    # function's argument order. Its result is a 3-tuple whose last item is
    # the distance; the first two items are not needed here.
    inverse_solution = wgs84.inv(startLon, startLat, endLon, endLat, radians=False)
    return inverse_solution[2]


def process_data(data):
    '''
    Parse the trip timestamps and add distance and calendar columns.

    The DataFrame is modified in place and the same object is returned.

    Inputs: data - DataFrame with the columns 'started_at', 'ended_at',
                   'start_station_latitude', 'start_station_longitude',
                   'end_station_latitude' and 'end_station_longitude'
    Output: the same DataFrame, with 'started_at'/'ended_at' converted to
            datetimes and these columns added (or overwritten):
              start_hour, end_hour - hour of the start / end timestamp
              distance             - result of distance() between the stations
              date                 - calendar date of the start
              day_of_week          - 0 (Monday) to 6 (Sunday)
              day_name, month_name - English names
              month, year, day_of_month
              week                 - ISO 8601 week number of the start
              is_weekend           - True when the start is a Saturday or Sunday
    '''
    # Convert the timestamp columns to datetime format
    data['started_at'] = pd.to_datetime(data['started_at'])
    data['ended_at'] = pd.to_datetime(data['ended_at'])
    started = data['started_at']
    ended = data['ended_at']

    # Start and end hour: used later in code
    data['start_hour'] = started.dt.hour
    data['end_hour'] = ended.dt.hour

    # Approximate distance of each journey (station to station)
    data['distance'] = distance(data['start_station_latitude'].tolist(),
                                data['start_station_longitude'].tolist(),
                                data['end_station_latitude'].tolist(),
                                data['end_station_longitude'].tolist())

    # Calendar breakdown of the moment each journey starts
    data['date'] = started.dt.date
    data['day_of_week'] = started.dt.dayofweek
    data['day_name'] = started.dt.day_name()
    data['month'] = started.dt.month
    data['month_name'] = started.dt.month_name()
    data['year'] = started.dt.year
    data['day_of_month'] = started.dt.day

    # Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # column, so cast back to the dtype the old accessor produced: int64, or
    # float64 with NaN when some start timestamps are missing.
    iso_week = started.dt.isocalendar().week
    data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    data['is_weekend'] = data['day_name'].isin(['Saturday', 'Sunday'])

    return data

In [4]:
def load_processed_data():
    '''
    Fetch the raw trip data and run it through process_data.

    Output: the DataFrame returned by process_data(read_data())
    '''
    return process_data(read_data())