In [9]:
import matplotlib.pyplot as plt

from markovBike.manager.manager import Manager
from markovBike.data_source.source import database_queries, get_stations_data

import pandas as pd
import numpy as np

In [10]:
# Notebook-level settings.

# Verbosity switch for the notebook (True presumably enables progress /
# diagnostic printing) — TODO confirm which helpers honor it.
verbose = True

# NOTE(review): not referenced by the cells visible in this section;
# presumably a cap on the number of stations to load — confirm before relying on it.
n_stations = 2_000

# Passed to `database_queries(...)` when the trips table is loaded;
# presumably the maximum number of trips to request — confirm against that helper.
n_trips = 50_000_000

def calculate_probability_matrix(dataframe):
    """Build the station-to-station transition probability matrix.

    Entry ``(i, j)`` of the result is the fraction of all trips that started
    at station ``i`` which ended at station ``j``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per trip, with ``start_station_id`` and ``end_station_id``
        columns.

    Returns
    -------
    pandas.DataFrame
        Square float matrix labelled by station ID on both axes (rows are
        start stations, columns are end stations). Stations are ordered by
        first appearance when the trips are scanned row by row, start before
        end. Rows of stations with at least one departure sum to 1; stations
        that only ever appear as a destination get an all-zero row.

    Notes
    -----
    Trips with a missing start or end station ID are ignored.
    """
    # Keep only the two ID columns and drop incomplete trips, which cannot be
    # attributed to a (start, end) pair.
    endpoints = dataframe[['start_station_id', 'end_station_id']].dropna()

    # Unique station IDs in row-major first-appearance order
    station_ids = endpoints.stack().unique()
    station_index = pd.Index(station_ids)
    n_ids = len(station_index)

    # Map every trip endpoint to its row/column position in one vectorised
    # lookup instead of scanning the ID array once per trip.
    start_pos = station_index.get_indexer(endpoints['start_station_id'])
    end_pos = station_index.get_indexer(endpoints['end_station_id'])

    # Count trips per (start, end) pair: encode each pair as a single flat
    # cell number, histogram those, then fold back into a square matrix.
    flat_counts = np.bincount(start_pos * n_ids + end_pos,
                              minlength=n_ids * n_ids)
    trip_counts = flat_counts.reshape(n_ids, n_ids).astype(float)

    # Total number of trips leaving each station (kept as a column vector so
    # it broadcasts across the row it normalises)
    total_trips = trip_counts.sum(axis=1, keepdims=True)

    # Normalise each row; rows with no departures are left at zero rather
    # than computing 0/0 and cleaning up the NaNs afterwards.
    probability_matrix = np.divide(trip_counts,
                                   total_trips,
                                   out=np.zeros_like(trip_counts),
                                   where=total_trips > 0)

    # Convert to a pandas DataFrame and add row and column labels
    return pd.DataFrame(probability_matrix,
                        index=station_index,
                        columns=station_index)

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw trips table. `n_trips` is forwarded to `database_queries`, and
# verbosity follows the notebook-level `verbose` setting rather than a
# hard-coded literal, so the setting controls this call too.
trips_raw = get_stations_data(database_queries(n_trips)['trips'], verbose=verbose)

# Call the calculate_probability_matrix function to get the probability matrix
probability_matrix = calculate_probability_matrix(trips_raw)

# Plot a heatmap of the probability matrix (rows: start station, columns: end station)
fig, ax = plt.subplots(figsize=(10, 10))

sns.heatmap(probability_matrix, cmap='coolwarm', ax=ax)

ax.set_title('Station Transition Probability Matrix')
ax.set_xlabel('End Station ID')
ax.set_ylabel('Start Station ID')

plt.show()

Bike station table with shape (33319019, 15). Columns are: 

tripduration                             Int64
starttime                  datetime64[ns, UTC]
stoptime                   datetime64[ns, UTC]
start_station_id                         Int64
start_station_name                      object
start_station_latitude                 float64
start_station_longitude                float64
end_station_id                           Int64
end_station_name                        object
end_station_latitude                   float64
end_station_longitude                  float64
bikeid                                   Int64
usertype                                object
birth_year                               Int64
gender                                  object
dtype: object
