In [1]:
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pandas import Series

In [11]:
# Enable IPython's autoreload extension in mode 2: imported modules (such as the
# local `src` package used below) are re-imported before each cell executes, so
# edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from src.journey_finder import JourneyFinder
from src.delay_prediction import DelayPredictor

In [4]:
from pyspark.ml import PipelineModel
import getpass

In [5]:
# Obtain (or create) the Spark session for this notebook. The application name
# embeds the login name of the user running the kernel.
spark = (
    SparkSession.builder
    .appName('final-project-{0}'.format(getpass.getuser()))
    .getOrCreate()
)

# Handles to the underlying SparkContext and its configuration.
sc = spark.sparkContext
conf = sc.getConf()

# Bare expression: renders the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Report the application name and Spark version of the active session.
# The name is read through the public `sparkContext` property instead of the
# private `_sc` attribute; the printed text is unchanged.
print(f'Start Spark name:{spark.sparkContext.appName}, version:{spark.version}')

Start Spark name:final-project-surkov, version:3.5.0


In [7]:
# Load the previously saved ML pipeline model.
# NOTE(review): both paths in this cell are hard-coded under /user/kli — confirm
# they are readable by whoever runs the notebook.
loadedPipelineModel = PipelineModel.load("/user/kli/models")

# Read the features-with-statistics table from Parquet into a Spark DataFrame.
# (Presumably an HDFS location, judging by the variable name — confirm.)
hdfs_path = "/user/kli/features_with_stats.parquet"
features_with_stats = spark.read.parquet(hdfs_path)

24/05/29 09:09:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/29 09:09:39 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
                                                                                

In [8]:
# Sample query for the delay predictor: 13 stop ids and 13 timestamps.
# (Presumably Unix epoch seconds paired element-wise with the ids — confirm
# against DelayPredictor.predict.)
station_ids = [
    '8501170:0:1', '8501160:0:2', '8592098', '8591818', '8592050',
    '8501120:0:1', '8504010:0:1', '8591976', '8592157', '8592152',
    '8579259', '8579257', '8595190',
]
timestamps = [
    1716836640.0, 1716837480.0, 1716838020.0, 1716838080.0, 1716838200.0,
    1716838320.0, 1716838620.0, 1716839040.0, 1716839160.0, 1716839400.0,
    1716839580.0, 1716839760.0, 1716840000.0,
]

# Wrap the feature table and the loaded pipeline model, then request predictions.
delayPredictor = DelayPredictor(
    features_with_stats=features_with_stats,
    loadedPipelineModel=loadedPipelineModel,
    spark=spark,
)
delays = delayPredictor.predict(station_ids=station_ids, timestamps=timestamps)

24/05/29 09:10:03 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 67:>                                                         (0 + 1) / 1]

In [9]:
# Display the value returned by delayPredictor.predict as the cell output.
delays

[]

In [13]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from matplotlib import pyplot as plt

# Tabular inputs for the journey planner, read from the local data/ directory.
timetable = pd.read_csv('data/timetable.csv')
footpaths = pd.read_csv('data/footpaths.csv')

# Stop metadata, extended with a "<stop_name> (<stop_id>)" display label column.
stops_info = pd.read_csv('data/stops.csv').assign(
    stop_name_id=lambda stops: stops['stop_name'] + " (" + stops['stop_id'] + ")"
)

# Build the journey planner from the timetable, footpath and stop tables, and hand
# it the delay predictor created in an earlier cell.
journey_finder = JourneyFinder(
    timetable=timetable,
    footpaths=footpaths,
    stops_info=stops_info,
    delay_predictor=delayPredictor
)

# Every stop id that appears as an endpoint of a timetable connection or a footpath.
possible_stops = set().union(
    timetable['dep_stop'],
    timetable['arr_stop'],
    footpaths['stop_id_a'],
    footpaths['stop_id_b'],
)
# Keep only the stop metadata rows whose id is in that set.
stops_info_subset = stops_info.loc[stops_info['stop_id'].isin(possible_stops)]

# Dropdown entries: the unique "<stop_name> (<stop_id>)" labels of the usable
# stops, sorted. Computed once and shared by both dropdowns.
stop_options = sorted(set(stops_info_subset['stop_name_id']))

# Origin and destination selectors.
source_widget = widgets.Dropdown(
    options=stop_options,
    description='Origin Stop:',
)

destination_widget = widgets.Dropdown(
    options=stop_options,
    description='Dest. Stop:',
)

# Free-text arrival time. The description is longer than the default label
# width, so let the label size itself instead of being truncated.
time_widget = widgets.Text(
    value='20:00:00',
    description='Arr. Time (HH:MM:SS)',
    style={'description_width': 'initial'},
)

# Confidence slider over the integers 1..100.
# NOTE(review): the click handler defined in this cell does not read this
# slider's value — confirm whether it should be forwarded to the search.
confidence_widget = widgets.IntSlider(
    value=90,
    min=1,
    max=100,
    step=1,
    description='Confidence:',
    orientation='horizontal',
)

# Button that triggers the journey search.
button = widgets.Button(
    description='Find Journeys'
)

# Area that receives the search output, and a label used as a status indicator.
output = widgets.Output()
running = widgets.Label(value='')

# Result of the most recent successful search; replaced on every successful run.
figs = []


def update_profile_return(b):
    """Button click handler: run a journey search and render it into `output`.

    Reads the origin and destination from `source_widget` / `destination_widget`
    and the arrival time from `time_widget`, calls
    `journey_finder.find_and_plot_journeys`, and stores its return value in the
    module-level `figs`.

    The `running` label shows 'running...' while the search executes and is
    always cleared afterwards, even when the search raises. In that case `figs`
    keeps its previous value.

    NOTE(review): `confidence_widget.value` is not forwarded to the search —
    confirm whether `find_and_plot_journeys` should receive it.

    Args:
        b: Argument supplied by the button's click callback (unused).
    """
    global figs

    output.clear_output()
    with output:
        running.value = 'running...'
        try:
            figs = journey_finder.find_and_plot_journeys(
                start_station_name_id=source_widget.value,
                end_station_name_id=destination_widget.value,
                arrival_datetime=time_widget.value,
            )
        finally:
            # Never leave the status label stuck on 'running...' after a failure.
            running.value = ''
        

# Run update_profile_return on every click of the button.
button.on_click(update_profile_return)
# Clear the output area, then show the controls followed by the status label and
# the output area.
output.clear_output()
display(source_widget, destination_widget, time_widget, confidence_widget, button, running, output)

Dropdown(description='Origin Stop:', options=('Bel-Air LEB (8501170)', 'Bel-Air LEB (8501170:0:1)', 'Belmont-s…

Dropdown(description='Dest. Stop:', options=('Bel-Air LEB (8501170)', 'Bel-Air LEB (8501170:0:1)', 'Belmont-su…

Text(value='20:00:00', description='Arr. Time (HH:MM:SS)')

IntSlider(value=90, description='Confidence:', min=1)

Button(description='Find Journeys', style=ButtonStyle())

Label(value='')

Output()