# Discrete Simulation Final Project
Authors: [Felipe Melo](https://github.com/FelipOliveira), [João Canavarro](https://github.com/jvcanavarro) and [Vitor Cantão](https://github.com/VitorCantao).

***
## Metro Bike [Dataset](https://bikeshare.metro.net/about/data/)
*The Metro Bike Share system makes bikes available 24/7, 365 days a year in Downtown LA, Central LA, and North Hollywood (...). Metro Bike Share offers convenient round-the-clock access to a fleet of bicycles for short trips. Metro Bike Share is one of LA Metro's multiple public transportation options for Angelenos and visitors to get around.*

In [14]:
import matplotlib.pyplot as plt
import geopy.distance as gd
import pandas as pd
import numpy as np

### Dataset Overview

In [52]:
# Load the 2020 trip records; the path is relative to the notebook's folder
df = pd.read_csv(filepath_or_buffer='../metrobike-2020.csv')
# Preview the first five rows
df.head(5)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,trip_id,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type,bike_type
0,134867493,25,1/1/2020 0:16,1/1/2020 0:41,3063,34.048038,-118.253738,4491,34.04744,-118.24794,18419,30,One Way,Monthly Pass,electric
1,134867799,35,1/1/2020 0:24,1/1/2020 0:59,4285,,,4354,34.017681,-118.409081,15661,1,One Way,One Day Pass,smart
2,134868104,37,1/1/2020 0:31,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15848,1,One Way,Walk-up,smart
3,134868103,36,1/1/2020 0:32,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,16053,1,One Way,Walk-up,smart
4,134868102,35,1/1/2020 0:33,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15694,1,One Way,Walk-up,smart


In [16]:
# Number of rides in 2020 (one row per ride)
len(df)

209974

In [17]:
# List the columns available in the raw dataset
df.columns

Index(['trip_id', 'duration', 'start_time', 'end_time', 'start_station',
       'start_lat', 'start_lon', 'end_station', 'end_lat', 'end_lon',
       'bike_id', 'plan_duration', 'trip_route_category', 'passholder_type',
       'bike_type'],
      dtype='object')

In [18]:
# Remove unnecessary columns: keep only the fields used by the analysis below
columns = [
    'duration', 'start_time', 'end_time',
    'start_station', 'start_lat', 'start_lon',
    'end_station', 'end_lat', 'end_lon',
    'bike_id', 'trip_route_category', 'bike_type',
]
df = df.loc[:, columns]
df.head()

Unnamed: 0,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,trip_route_category,bike_type
0,25,1/1/2020 0:16,1/1/2020 0:41,3063,34.048038,-118.253738,4491,34.04744,-118.24794,18419,One Way,electric
1,35,1/1/2020 0:24,1/1/2020 0:59,4285,,,4354,34.017681,-118.409081,15661,One Way,smart
2,37,1/1/2020 0:31,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15848,One Way,smart
3,36,1/1/2020 0:32,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,16053,One Way,smart
4,35,1/1/2020 0:33,1/1/2020 1:08,4344,34.014309,-118.491341,4322,34.005871,-118.429161,15694,One Way,smart


####  According to Stations [Dataset](https://bikeshare.metro.net/wp-content/uploads/2021/01/metro-bike-share-stations-2021-01-01.csv):
* `4285,Metro Bike Share Free Bikes,2/27/2019,Free Bikes,Active`
* `4286,Metro Bike Share Out of Service Area Smart Bike,2/27/2019,Free Bikes,Active`
* `3000,Virtual Station,7/7/2016,N/A,Active`

Those stations have no geolocation information, so we ignore them when calculating ride distances.

In [64]:
# Rows that have at least one missing value
df_null = df.loc[df.isna().any(axis=1)]
# Start stations of the rows whose start latitude is missing
nan_stations = df_null.loc[df_null['start_lat'].isna(), 'start_station'].unique()
nan_stations

array([], dtype=int64)

In [65]:
# Remove rides with any missing value; this drops the stations without geolocation
df = df.dropna(axis=0, how='any')

In [66]:
# Keep only one-way trips, i.e. remove trips starting and ending at the same station.
# .copy() makes the result an independent DataFrame, so the column added to `df`
# further down is not written into a slice of the previous frame
# (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['trip_route_category'] == 'One Way'].copy()

In [67]:
# Number of distinct bikes and of distinct start stations in the cleaned data
print('Number of bicycles: ', df['bike_id'].nunique(dropna=False))
print('Number of stations: ', len(df['start_station'].unique()))

Number of bicycles:  3894
Number of stations:  263


In [68]:
# Share of rides made with each bicycle model (normalize=True returns fractions, not counts)
df['bike_type'].value_counts(normalize=True)

standard    0.522733
electric    0.406921
smart       0.070346
Name: bike_type, dtype: float64

In [69]:
# Mean duration of a bike ride (the analysis below treats `duration` as minutes)
df['duration'].mean()

26.045639739563143

### Calculate distance using geolocation coordinates

In [70]:
def get_distance(coords):
    """Return the geodesic distance, in kilometres, between two points.

    coords: sequence holding (start_lat, start_lon, end_lat, end_lon) in that
        order. It may be a list, a tuple or a pandas Series; only the first
        four values are used.

    The values are unpacked by iteration instead of being read with
    ``coords[0]`` ... ``coords[3]``: on a Series indexed by column names,
    integer keys rely on a positional fallback that pandas has deprecated.
    """
    start_lat, start_lon, end_lat, end_lon = tuple(coords)[:4]
    return gd.geodesic((start_lat, start_lon), (end_lat, end_lon)).km

In [71]:
# Create new distance column: one geodesic distance per ride, computed from
# the four coordinate columns taken in (start_lat, start_lon, end_lat, end_lon) order
columns = ['start_lat', 'start_lon', 'end_lat', 'end_lon']
df['distance'] = [
    get_distance(coords)
    for coords in df[columns].itertuples(index=False, name=None)
]

In [None]:
# Preview the first rows of the updated DataFrame
df.head()

A better option would be to compute the distance once for each unique pair of start and end stations and then map it back to the rides. It may be implemented in the future.

In [None]:
# Summary statistics (count, mean, std, quartiles...) of ride duration and distance
df[['duration', 'distance']].describe()

The duration and distance values of the rides vary widely (see the standard deviation). Occasionally, people keep a bike for an entire day before returning it to a station, so the recorded duration doesn't reflect the time they actually spent riding.
For this reason, we decided to ignore those cases when calculating the mean speed of each bicycle model.

### Calculate the average speed of each bicycle model with different intervals of time and space


In [None]:
# Select only necessary columns. They are picked by name rather than by
# position, so the selection keeps working if columns are added or reordered.
time_space = df[['duration', 'bike_type', 'distance']]
time_space.head()

In [None]:
def get_speed(space, time, unit):
    """Convert a distance and a duration into a speed expressed in `unit`.

    space: distance travelled, in kilometres.
    time: duration, in minutes.
    unit: one of 'km/min', 'km/h' or 'm/s'.

    Only arithmetic operators are applied to `space` and `time`, so plain
    numbers work, and so does anything else that supports those operators.

    Raises ValueError for an unsupported unit instead of silently
    returning None.
    """
    if unit == 'km/min':
        return space / time
    if unit == 'km/h':
        return space / (time / 60)
    if unit == 'm/s':
        return (space * 1000) / (time * 60)
    raise ValueError(
        f"Unsupported speed unit: {unit!r} (expected 'km/min', 'km/h' or 'm/s')"
    )

In [None]:
# Unit in which the average speeds are reported
unit = 'km/min'
# Upper limits of each (duration, distance) band
intervals = [(15, 3), (30, 6), (45, 9), (120, 12)]

avg_speed = pd.DataFrame()
# Every band is bounded below by the limits of the previous one; the first starts at (0, 0)
lower_limits = [(0, 0)] + intervals[:-1]
for (min_duration, min_distance), upper in zip(lower_limits, intervals):
    max_duration, max_distance = upper
    # A ride belongs to the band only if BOTH its duration and its distance fall inside it
    in_band = (
        (df.duration > min_duration) & (df.duration <= max_duration)
        & (df.distance > min_distance) & (df.distance <= max_distance)
    )
    # Explicit copy: the speed column is added to an independent frame, not to a slice of `time_space`
    time_space_filtered = time_space.loc[in_band].copy()
    # get_speed only uses arithmetic operators, so it is applied to whole columns at once
    time_space_filtered['speed'] = get_speed(
        time_space_filtered['distance'], time_space_filtered['duration'], unit=unit
    )
    # One column per band, labelled by the band's upper limits; one row per bicycle model
    avg_speed[upper] = time_space_filtered.groupby('bike_type')['speed'].mean()

avg_speed

We are only considering trips with duration and distance up to 120 minutes and 12 kilometers, respectively.

In [None]:
title = 'Average speed of bicycle models for different intervals of time and travel distances'
xlabel = 'Duration(min) / Distance(km)'

# One group of bars per interval (the frame is transposed so intervals sit on the x axis)
ax = avg_speed.T.plot.bar(
    figsize=(15, 7),
    xlabel=xlabel,
    ylabel=f'Speed in {unit}',
    colormap='tab20b',
    rot=0,
    title=title,
)
# Write each bar's height, rounded to 3 decimals, 10 points above its top centre
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        round(bar_height, 3),
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )
ax.grid(linestyle=':', axis='y')

It is noticeable that electric bicycles have a certain "advantage" over ordinary ones as the distance and duration of the rides increase.

#### Normalize the frequency of rides for each unique pair of (start|end) station

In [None]:
# Count the rides of every (start, end) station pair.
# The pair alone is the grouping key. If `distance` were part of the key, a pair
# that shows up with more than one distance value would be split into several
# rows, and dropping the duplicated pairs afterwards would discard part of its rides.
# The smallest distance seen for the pair is kept as its distance.
group = ['start_station', 'end_station']
stations = (
    df.groupby(group)
      .agg(distance=('distance', 'min'), occurs=('distance', 'size'))
      .reset_index()
)

# Normalize weights by each group (i.e. each start_station): the weights of
# all destinations of a start station add up to 1
stations['norm_rides'] = stations['occurs'] / stations.groupby('start_station')['occurs'].transform('sum')

#### Create frequency and distance matrices

In [None]:
# Rides frequency matrix: one row per start station, one column per end station.
# Pairs without any recorded ride get frequency 0.
freq_matrix = (
    stations.set_index(['start_station', 'end_station'])['norm_rides']
            .unstack('end_station')
            .fillna(0)
)
freq_matrix.head()

In [None]:
# Stations distance matrix: one row per start station, one column per end station.
# Pairs without any recorded ride are filled with 0.
distance_matrix = (
    stations.set_index(['start_station', 'end_station'])['distance']
            .unstack('end_station')
            .fillna(0)
)
distance_matrix.head()

***
#### Simulation


In [None]:
import simpy
import random

# Simulation sizing
NUMBER_OF_RIDES = 24
NUMBER_OF_BIKES = 36

# The simulation only covers the first four start stations of the frequency matrix
STATIONS = freq_matrix.index[:4]
NUMBER_OF_STATIONS = len(STATIONS)
# Bikes are split evenly between the stations (integer division)
NUMBER_OF_BIKES_PER_STATION = NUMBER_OF_BIKES // NUMBER_OF_STATIONS

# Ride frequencies restricted to trips between the simulated stations
destinations_matrix = freq_matrix.loc[STATIONS, STATIONS]
# Bicycle models present in the dataset
BIKE_TYPE = df['bike_type'].unique()


class BikeDock:
    """Bike rack of a station, backed by a ``simpy.Container``.

    The container level is the number of bikes currently docked and the
    container capacity is the number of bikes the dock can hold.
    """

    def __init__(self, env, init, capacity):
        # env: simulation environment; init: bikes docked at start;
        # capacity: maximum number of docked bikes
        self.env = env
        self._container = simpy.Container(env, init=init, capacity=capacity)

    @property
    def capacity(self):
        """Maximum number of bikes the dock can hold."""
        return self._container.capacity

    @property
    def available_bikes(self):
        """Number of bikes currently docked."""
        return self._container.level

    @property
    def has_available(self):
        """True when at least one bike is docked."""
        return self.available_bikes > 0

    @property
    def is_full(self):
        """True when the number of docked bikes has reached the capacity."""
        return self.available_bikes >= self.capacity

    def rent_bike(self):
        """Request one bike from the dock; returns the container's get event."""
        return self._container.get(1)

    def return_bike(self):
        """Hand one bike back to the dock; returns the container's put event."""
        return self._container.put(1)


class Ride:
    """A single trip from `start_station` to `end_station`."""

    def __init__(self, env, start_station, end_station):
        # Both stations are expected to expose `id`, `get_bike()` and `put_bike()`
        self.start_station = start_station
        self.end_station = end_station
        self.env = env

    @property
    def distance(self):
        """Distance of the trip, read from the module-level `distance_matrix`.

        The matrix has start stations as rows and end stations as columns,
        so the lookup is (row=start, column=end). Chained ``[start][end]``
        indexing on a DataFrame selects the column first and would read the
        opposite direction.
        """
        return distance_matrix.loc[self.start_station.id, self.end_station.id]

    @property
    def duration(self):
        """Trip duration: distance divided by a fixed speed, rounded to an integer."""
        # Same unit as the distance per unit of simulated time
        # (0.12 km/min if distances are km and time steps are minutes)
        standard_bike_velocity_mean = 0.12
        return round(self.distance / standard_bike_velocity_mean)

    def start(self):
        """Process generator: get a bike, ride, then return the bike.

        Uses the environment the ride was created with (``self.env``) rather
        than a global. The ride only starts once the start station has
        answered the bike request; when that request yields ``None`` (the
        customer left without a bike) the ride is cancelled, so no bike is
        delivered to the end station.
        """
        bike = yield self.env.process(self.start_station.get_bike())
        if bike is None:
            return
        yield self.env.timeout(self.duration)
        yield self.env.process(self.end_station.put_bike())


class Station(object):
    """A bike station: a dock plus counters used for the final report."""

    def __init__(self, env, station_id):
        self.env = env
        self.id = station_id
        # The dock starts with the per-station share of bikes and can hold twice that amount
        self.dock = BikeDock(env, init=NUMBER_OF_BIKES_PER_STATION, capacity=NUMBER_OF_BIKES_PER_STATION * 2)
        # Number of bikes that arrived while the dock was already full
        self.overflow_count = 0
        # Total time units customers spent waiting for a bike at this station
        self.client_wait_time = 0

    def get_bike(self, max_wait=10):
        """Process generator: take a bike, waiting while none is available.

        max_wait: number of one-unit waits after which the customer gives up
            (defaults to the previously hard-coded 10).

        Returns the dock's get event once a bike was taken, or None when the
        customer gave up. Timeouts are created on ``self.env`` so the station
        works with whatever environment it was built with.
        """
        waited = 0
        while waited < max_wait:
            if self.dock.has_available:
                print(f'Bike requested at station {self.id} - {self.dock.available_bikes - 1} / {self.dock.capacity}')
                request = self.dock.rent_bike()
                # Wait for the request itself so the bike is really ours before returning
                yield request
                return request
            waited += 1
            self.client_wait_time += 1
            print(f'No more bikes at station {self.id}. Waiting {waited}...')
            yield self.env.timeout(1)
        print('#-#-#  I\'m out of here! Trash company!  #-#-#')
        return None

    def put_bike(self):
        """Process generator: return a bike to the dock.

        An arrival at a full dock is counted in `overflow_count`; the put
        request is yielded afterwards in every case.
        """
        if self.dock.is_full:
            self.overflow_count += 1

        print(f'Bike arrived at station {self.id} - {self.dock.available_bikes + 1} / {self.dock.capacity}')
        yield self.dock.return_bike()

def cyclist_arrivals(env, stations):
    """Process generator for one cyclist: pick an origin, draw a destination, ride.

    env: simulation environment the ride runs in.
    stations: list of stations; the origin is drawn uniformly from it and the
        destination is looked up in it by position.
    """
    origin = stations[random.randint(0, len(stations) - 1)]

    # Destination weights are the origin's row of the module-level `destinations_matrix`.
    # NOTE(review): `random_weighted` is not defined in the visible source; it is
    # expected to return a position within `stations` - confirm where it comes from.
    destination_weights = destinations_matrix.loc[origin.id]
    destination = stations[random_weighted(destination_weights)]
    print(f'New Ciclist arrives! {origin.id} -> {destination.id}')

    # Wait until the whole ride has finished
    yield env.process(Ride(env, origin, destination).start())

# Module-level list of the simulated stations; filled by setup() and read by the final report
stations = []
def setup(env):
    """Process generator: create the stations, then keep spawning cyclists.

    One Station is created per id in STATIONS. Afterwards a new cyclist
    process is started every 1 to 5 time units (random integer), forever.
    """
    stations.extend(Station(env, station_id) for station_id in STATIONS)
    while True:
        gap = random.randint(1, 5)
        yield env.timeout(gap)
        env.process(cyclist_arrivals(env, stations))

# Build the environment, register the arrival generator and simulate 150 time units
env = simpy.Environment()
env.process(setup(env))
env.run(until=150)

print('\n\n')
# Waiting time accumulated by customers over all stations
total_wait_time = sum(station.client_wait_time for station in stations)
print(f'Total wait time: {total_wait_time}')