In [None]:
%load_ext autoreload
%autoreload 2

from markovBike.manager.manager import Manager
from markovBike.data_source.source import database_queries, get_stations_data, get_trips_data
from markovBike.data_source.preprocess import preprocess_stations_data, preprocess_trips_data

import pandas as pd

In [None]:
# Notebook-wide settings.

# Forwarded as `verbose=` to the data loading / preprocessing helpers.
verbose = True

# Arguments handed to database_queries(...) when building the stations and
# trips queries respectively.
n_stations, n_trips = 2_000, 5_000

In [None]:
# Build the queries for `n_stations` and hand the 'stations' entry to the loader.
stations_query = database_queries(n_stations)['stations']
stations_raw = get_stations_data(stations_query, verbose=verbose)

# Preview the first three rows.
stations_raw.head(3)

In [None]:
# Keep only the stations whose latitude is non-zero
# (presumably 0.0 marks a missing coordinate — TODO confirm).
has_latitude = stations_raw['latitude'] != 0.0
stations_raw = stations_raw[has_latitude]

# dtypes counted as numeric when picking feature columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric columns, except region_id which is listed with the categorical
# features below.
numeric_columns = stations_raw.select_dtypes(include=numerics).columns
numerical_features = numeric_columns.drop(['region_id'])

# Object columns without the id/name columns, plus region_id.
object_columns = stations_raw.select_dtypes(include='object').columns
categorical_features = (
    object_columns
    .drop(['station_id', 'name', 'short_name'])
    .append(pd.Index(['region_id']))
)

# All boolean columns.
boolean_features = stations_raw.select_dtypes(include='bool').columns

In [None]:
# Options for the stations preprocessing step: station_id is passed as the
# index, the name columns are dropped, and the feature groups selected above
# are forwarded unchanged.
stations_preproc_options = dict(
    index='station_id',
    drops=['name', 'short_name'],
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    boolean_features=boolean_features,
    verbose=verbose,
)
stations_preproc = preprocess_stations_data(stations_raw,
                                            **stations_preproc_options)

# Only element 0 of the result is used to build the frame; column labels are
# NOT taken from stations_preproc[1] here (that option was left disabled).
stations_values = stations_preproc[0]
stations_dataframe = pd.DataFrame(stations_values).dropna()

stations_dataframe.head(3)

In [None]:
# Node coordinates: the latitude and longitude columns of stations_raw,
# each converted to a plain list in row order.
latitudes, longitudes = (
    list(stations_raw[coordinate]) for coordinate in ('latitude', 'longitude')
)

Manager.plot_nodes(latitudes, longitudes)


In [None]:
# Load the raw trips table. The 'trips' query is passed to the dedicated trips
# loader (get_trips_data, imported at the top of the notebook) rather than the
# stations loader, and verbosity follows the notebook-wide `verbose` setting.
# NOTE(review): assumes get_trips_data takes (query, verbose=...) like
# get_stations_data does — confirm against markovBike.data_source.source.
trips_raw = get_trips_data(database_queries(n_trips)['trips'], verbose=verbose)

In [None]:
# dtypes counted as numeric when picking feature columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric columns without the id columns; Index.drop raises KeyError if any of
# the three ids is not among the numeric columns.
numerical_features = trips_raw.select_dtypes(include=numerics).columns.drop(
    ['start_station_id', 'end_station_id', 'bikeid'])

# Object columns without the station names, plus the two station ids.
# bikeid is intentionally in neither feature list: it is removed from the
# numeric features above and is not added here (previously it was appended
# and immediately dropped again, which had no effect).
categorical_features = (
    trips_raw.select_dtypes(include='object').columns
    .drop(['start_station_name', 'end_station_name'])
    .append(pd.Index(['start_station_id', 'end_station_id']))
)

# All boolean columns.
boolean_features = trips_raw.select_dtypes(include='bool').columns


In [None]:
# Preprocess the raw trips: the station name columns are dropped and the
# feature groups selected for the trips table are forwarded unchanged.
# Verbosity follows the notebook-wide `verbose` setting, like the stations step.
trips_preproc = preprocess_trips_data(
    trips_raw,
    drops=['start_station_name', 'end_station_name'],
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    boolean_features=boolean_features,
    verbose=verbose)

# Element 0 of the result supplies the values and element 1 the column labels;
# rows containing missing values are discarded.
trips_dataframe = pd.DataFrame(trips_preproc[0],
                               columns=trips_preproc[1]).dropna()

trips_dataframe.head(3)

In [None]:
# Columns identifying a (start station, end station) pair.
pair_keys = ['start_station_id', 'end_station_id']

# Number of trips for every (start, end) station pair, computed once.
trip_counts = (
    trips_raw.groupby(pair_keys).size().reset_index(name='trip_count')
)

# The same table under its second name; a copy, so the two names remain
# independent DataFrame objects as before.
station_pairs = trip_counts.copy()

# Attach the pair count to every trip. Any 'trip_count' column left over from
# a previous run of this cell is dropped first, so re-running the cell does
# not produce 'trip_count_x' / 'trip_count_y' duplicates.
trips_raw = (
    trips_raw
    .drop(columns='trip_count', errors='ignore')
    .merge(trip_counts, on=pair_keys)
)

# Show the first few rows of the merged DataFrame
trips_raw.head()

In [None]:
# Distinct start station ids, in order of first appearance in trips_raw.
unique_start_ids = trips_raw['start_station_id'].unique()
start_stations = list(unique_start_ids)

# The meaning of the third argument (5) is defined by Manager.plot_subgraphs
# — TODO confirm and move it to a named setting.
Manager.plot_subgraphs(trips_raw, start_stations, 5)
