In [None]:
!pip install beautifulsoup4 pandas



In [None]:
from google.colab import files
import shutil
import os

# Prompt for the stop-events HTML file; `uploaded` is keyed by uploaded file name.
uploaded = files.upload()

# Fail with a readable message instead of a bare StopIteration when nothing
# was uploaded (e.g. the dialog was cancelled).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the stop-events HTML file.")

file_name = next(iter(uploaded))
file_size = os.path.getsize(file_name)

# Move whatever was actually uploaded (the name may differ from the expected
# one) to the fixed path that the loading cell reads from. As the last
# expression, the destination path is displayed as the cell output.
shutil.move(file_name, '/content/trimet_stopevents_2022-12-07.html')

Saving trimet_stopevents_2022-12-07.html to trimet_stopevents_2022-12-07.html


'/content/trimet_stopevents_2022-12-07.html'

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

In [None]:
# Path the upload cell moved the stop-events HTML to.
file_name = '/content/trimet_stopevents_2022-12-07.html'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the export is UTF-8 — confirm against the data source.
with open(file_name, 'r', encoding='utf-8') as f:
    html_content = f.read()

# Parse with the pure-Python parser bundled with the standard library.
soup = BeautifulSoup(html_content, 'html.parser')


In [None]:
from datetime import datetime, timedelta

# Base date that `arrive_time` (treated as a number of seconds) is added to
# in order to build `tstamp`.
# NOTE(review): the input file is named for 2022-12-07 but timestamps are
# anchored on 2020-01-01 — confirm this is the intended service date.
initial_date = datetime(2020, 1, 1)

all_records = []

# Each <h2> heading is followed by a sibling <table> holding that trip's rows.
trip_headings = soup.find_all('h2')

for heading in trip_headings:
    # The trip id is taken as the last whitespace-separated token of the heading.
    trip_id = heading.text.split()[-1]
    table = heading.find_next_sibling('table')
    if table is None:
        continue

    headers = [th.text.strip() for th in table.find_all('th')]

    # Skip the first <tr>; keep only rows whose <td> count matches the headers.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) == len(headers):
            record = [cell.text.strip() for cell in cells]
            record_dict = dict(zip(headers, record))
            record_dict['trip_id'] = trip_id
            all_records.append(record_dict)

stops_df = pd.DataFrame(all_records)

stops_df['arrive_time'] = pd.to_numeric(stops_df['arrive_time'])

# Vectorized offset from the base date; avoids building one Python timedelta
# per row through Series.apply.
stops_df['tstamp'] = pd.Timestamp(initial_date) + pd.to_timedelta(stops_df['arrive_time'], unit='s')

# Identifiers are kept as strings (they are compared against string literals
# later); passenger counts are numeric.
stops_df = stops_df.astype({
    'vehicle_number': str,
    'trip_id': str,
    'location_id': str,
})
stops_df['ons'] = pd.to_numeric(stops_df['ons'])
stops_df['offs'] = pd.to_numeric(stops_df['offs'])


In [None]:
# Restrict the frame to the columns the analyses use, in a fixed order.
analysis_columns = ['trip_id', 'vehicle_number', 'tstamp', 'location_id', 'ons', 'offs']
stops_df = stops_df.loc[:, analysis_columns]

# --- Compute the whole-dataset summary figures ---
num_vehicles = stops_df['vehicle_number'].nunique()
stop_locations = stops_df['location_id'].nunique()
min_tstamp = stops_df['tstamp'].min()
max_tstamp = stops_df['tstamp'].max()
# Stop events where at least one passenger boarded.
boarding_stopevents = stops_df['ons'].ge(1).sum()
percentage_boarding_stopevents = boarding_stopevents / len(stops_df) * 100

# --- Report them ---
print(f"Number of stop events: {len(stops_df)}")
print(f"Number of Vechicles: {num_vehicles}")
print(f"Number of Stop Locations: {stop_locations}")
print(f"Minimum Timestamp: {min_tstamp}")
print(f"Maximum Timestamp: {max_tstamp}")
print(f"Number of stop events with at least one passenger boarding: {boarding_stopevents}")
print(f"Percentage of stop events with at least one passenger boarding: {percentage_boarding_stopevents}")

Number of stop events: 93912
Number of Vechicles: 158
Number of Stop Locations: 4354
Minimum Timestamp: 2020-01-01 04:02:29
Maximum Timestamp: 2020-01-02 02:37:41
Number of stop events with at least one passenger boarding: 19858
Percentage of stop events with at least one passenger boarding: 21.14532754067638


In [None]:
# Stop events recorded at location id '6913' (ids are stored as strings).
at_6913 = stops_df['location_id'].eq('6913')
location_6913 = stops_df.loc[at_6913]

# --- Compute the per-location figures ---
buses_stoped = location_6913['vehicle_number'].nunique()
# Stops at this location where at least one passenger boarded.
stops_with_boarding = location_6913['ons'].ge(1).sum()
percentage_buses_stoped = stops_with_boarding / len(location_6913) * 100

# --- Report them ---
print(f"Number of stops made at 6913 location  : {len(location_6913)}")
print(f"Number of different buses stopped at this location  : {buses_stoped}")
print(f"Number of stops at this location with at least one passenger boarding: {stops_with_boarding}")
print(f"Percentage of stops at this location did at least one passenger board: {percentage_buses_stoped}")


Number of stops made at 6913 location  : 15
Number of different buses stopped at this location  : 5
Number of stops at this location with at least one passenger boarding: 2
Percentage of stops at this location did at least one passenger board: 13.333333333333334


In [None]:
# Stop events recorded for vehicle number '4062' (numbers are stored as strings).
is_4062 = stops_df['vehicle_number'].eq('4062')
vehicle_4062 = stops_df.loc[is_4062]

# --- Compute the per-vehicle figures ---
passengers_boarded = vehicle_4062['ons'].sum()
passengers_deboarded = vehicle_4062['offs'].sum()
# Count of this vehicle's stop events with at least one boarding.
passenges_boarding_1 = vehicle_4062['ons'].ge(1).sum()
percentage_onboard = passenges_boarding_1 / len(vehicle_4062) * 100

# --- Report them ---
print(f"Number of stops made by 4062 vechile: {len(vehicle_4062)}")
print(f"Number of passengers boarded the bus: {passengers_boarded}")
print(f"Number of passengers deboarded the bus: {passengers_deboarded}")
print(f"Number of passengers boarded 4062 vehicle’s stop events did at least one passenger board: {passenges_boarding_1}")
print(f"Percentage of 4062 vehicle’s stop events did at least one passenger board: {percentage_onboard}")


Number of stops made by 4062 vechile: 68
Number of passengers boarded the bus: 26
Number of passengers deboarded the bus: 26
Number of passengers boarded 4062 vehicle’s stop events did at least one passenger board: 11
Percentage of 4062 vehicle’s stop events did at least one passenger board: 16.176470588235293


In [96]:
! pip install scipy



In [122]:
from scipy.stats import binomtest

total_no_of_stop_events = len(stops_df)

# Stop events, across all vehicles, where at least one passenger boarded.
total_no_of_bus_events = (stops_df['ons'] >= 1).sum()

# System-wide boarding rate, used as the null-hypothesis probability below.
# Note this is a fraction in [0, 1], not a value scaled to 100.
percentage_boarding = total_no_of_bus_events / total_no_of_stop_events
print(f"Percentage of stop events with boardings: {percentage_boarding}")

biased_buses = []

for vehicle_id, group in stops_df.groupby('vehicle_number'):
    total_stops = len(group)
    boarding_stops = (group['ons'] >= 1).sum()

    # Binomial test of this vehicle's boarding-stop count against the
    # system-wide boarding rate.
    result = binomtest(boarding_stops, total_stops, percentage_boarding)

    if result.pvalue < 0.05:
        biased_buses.append({
            'vehicle_number': vehicle_id,
            'total_stops': total_stops,
            'boarding_stops': boarding_stops,
            'boarding_rate': boarding_stops / total_stops,
            'p_value': result.pvalue
        })

# Passing the columns explicitly keeps the frame well-formed when no vehicle
# is flagged; otherwise an empty list yields a column-less frame and the
# column selection below raises KeyError.
biased_columns = ['vehicle_number', 'total_stops', 'boarding_stops', 'boarding_rate', 'p_value']
biased_df = pd.DataFrame(biased_buses, columns=biased_columns)
print("\n Vehicles with biased boarding data p < 0.05:")
print(biased_df[biased_columns])

Percentage of stop events with boardings: 0.21145327540676379

 Vehicles with biased boarding data p < 0.05:
  vehicle_number  total_stops  boarding_stops  boarding_rate   p_value
0           3530          576             100       0.173611  0.028077
1           3634          409              70       0.171149  0.045715
2           3733          709             128       0.180536  0.043074
3           3915          662             115       0.173716  0.017249
4           3963          405              68       0.167901  0.033011


In [124]:
from google.colab import files
import shutil
import os

# Prompt for the RELPOS CSV file; `uploaded` is keyed by uploaded file name.
uploaded = files.upload()

# Fail with a readable message instead of a bare StopIteration when nothing
# was uploaded (e.g. the dialog was cancelled).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the RELPOS CSV file.")

file_name = next(iter(uploaded))
file_size = os.path.getsize(file_name)

# Move whatever was actually uploaded (the name may differ from the expected
# one) to the fixed destination path. As the last expression, the destination
# path is displayed as the cell output.
shutil.move(file_name, '/content/trimet_relpos_2022-12-07.csv')

Saving trimet_relpos_2022-12-07.csv to trimet_relpos_2022-12-07.csv


'/content/trimet_relpos_2022-12-07.csv'

In [131]:
import pandas as pd
from scipy.stats import ttest_ind

# Read relative to the working directory.
# NOTE(review): the upload cell moves the file under /content, so this assumes
# the working directory is /content — confirm.
gps_df = pd.read_csv("trimet_relpos_2022-12-07.csv")

biased_vehicles = []

# Pooled RELPOS sample over every vehicle (this includes the vehicle under
# test in each comparison below).
all_relpos = gps_df['RELPOS'].values

for vehicle_id, group in gps_df.groupby('VEHICLE_NUMBER'):
    vehicle_relpos = group['RELPOS'].values

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_value = ttest_ind(vehicle_relpos, all_relpos, equal_var=False)

    if p_value < 0.005:
        biased_vehicles.append({
            'vehicle_id': vehicle_id,
            'num_points': len(vehicle_relpos),
            'mean_relpos': vehicle_relpos.mean(),
            'p_value': p_value
        })

# Passing the columns explicitly keeps the frame well-formed when no vehicle
# is flagged; otherwise an empty list yields a column-less frame and the
# column selection below raises KeyError.
relpos_columns = ['vehicle_id', 'num_points', 'mean_relpos', 'p_value']
biased_df = pd.DataFrame(biased_vehicles, columns=relpos_columns)

print("\nVehicles with biased RELPOS data (p < 0.005):")
print(biased_df[relpos_columns])



Vehicles with biased RELPOS data (p < 0.005):
   vehicle_id  num_points  mean_relpos   p_value
0        3638       10968     6.033647  0.000000
1        3804       12491     5.977343  0.000000
2        4024       12119    -0.180383  0.004008
3        4305        5770    -6.171029  0.000000


In [138]:
from scipy.stats import chi2_contingency

# Fleet-wide boarding and alighting totals.
total_ons = stops_df['ons'].sum()
total_offs = stops_df['offs'].sum()

biased_vehicles_ons_offs = []

for vehicle_id, group in stops_df.groupby('vehicle_number'):
    bus_ons = group['ons'].sum()
    bus_offs = group['offs'].sum()
    rest_ons = total_ons - bus_ons
    rest_offs = total_offs - bus_offs

    # chi2_contingency raises ValueError when any expected frequency is zero,
    # which happens whenever a row or column of the table sums to zero (e.g. a
    # vehicle with no boardings and no alightings). Such a table carries no
    # information about the ons/offs split, so skip it instead of crashing.
    marginals = (bus_ons + bus_offs, rest_ons + rest_offs, total_ons, total_offs)
    if min(marginals) == 0:
        continue

    # 2x2 table: this vehicle vs. every other vehicle, ons vs. offs.
    contingency_table = [
        [bus_ons, bus_offs],
        [rest_ons, rest_offs]
    ]

    chi2, p_value, _, _ = chi2_contingency(contingency_table)

    if p_value < 0.05:
        biased_vehicles_ons_offs.append({
            'vehicle_id': vehicle_id,
            'num_ons': bus_ons,
            'num_offs': bus_offs,
            'p_value': p_value
        })

# Passing the columns explicitly keeps the frame well-formed when no vehicle
# is flagged; otherwise an empty list yields a column-less frame and the
# column selection below raises KeyError.
ons_offs_columns = ['vehicle_id', 'num_ons', 'num_offs', 'p_value']
biased_ons_offs_df = pd.DataFrame(biased_vehicles_ons_offs, columns=ons_offs_columns)

print("\nVehicles with biased ons/offs data (p < 0.05):")
print(biased_ons_offs_df[ons_offs_columns])



Vehicles with biased ons/offs data (p < 0.05):
  vehicle_id  num_ons  num_offs   p_value
0       3056      517       457  0.030134
1       3576      379       322  0.018783
