In [None]:
import polars as pl
import altair as alt

In [None]:
# Location of the CSV holding the 2024 bike trips (relative to the notebook).
recorridos_2024_path = "data/recorridos_realizados_2024.csv"

In [None]:
# Load the 2024 trips CSV into a polars DataFrame, requesting pyarrow's CSV
# reader and automatic parsing of date-like columns.
ecobici2024_df = pl.read_csv(
    recorridos_2024_path,
    try_parse_dates=True,
    use_pyarrow=True,
)

In [None]:
# Summary statistics for each column of the loaded trips frame.
ecobici2024_df.describe()

In [None]:
# Uncomment to inspect the column dtypes of the loaded frame:
# ecobici2024_df.dtypes

In [None]:
# Count the trips whose start or end timestamp is missing.
# NOTE: despite its name, this variable holds a row count, not a boolean.
are_empty_dates = ecobici2024_df.filter(
    pl.col('fecha_origen_recorrido').is_null()
    | pl.col('fecha_destino_recorrido').is_null()
).height
are_empty_dates

In [None]:
# Add copies of the station coordinate columns under estacion_<end>_<lon|lat>
# names. The original long_*/lat_* columns are kept alongside the copies.
ecobici2024_df = ecobici2024_df.with_columns(
    estacion_origen_lon=pl.col('long_estacion_origen'),
    estacion_origen_lat=pl.col('lat_estacion_origen'),
    estacion_destino_lon=pl.col('long_estacion_destino'),
    estacion_destino_lat=pl.col('lat_estacion_destino'),
)


In [None]:
# Derive calendar features from the trip start timestamp, plus the trip
# duration (end minus start) expressed in minutes.
ecobici2024_df = ecobici2024_df.with_columns(
    month=pl.col('fecha_origen_recorrido').dt.strftime("%B"),
    day_name=pl.col('fecha_origen_recorrido').dt.strftime("%A"),
    date=pl.col('fecha_origen_recorrido').dt.date(),
    trip_duration_minutes=(
        pl.col('fecha_destino_recorrido') - pl.col('fecha_origen_recorrido')
    ).dt.total_minutes(),
    top_of_hour=pl.col('fecha_origen_recorrido').dt.hour(),
).with_columns(
    # Second pass because it reads the day_name column created just above:
    # 1 for Saturday/Sunday, 0 for every other row.
    is_weekend=pl.when(
        pl.col('day_name').is_in(['Saturday', 'Sunday'])
    ).then(1).otherwise(0),
)

Visualization

In [None]:
# Quick look at trip duration against start time, using only the first 100 rows.
(
    ecobici2024_df
    .limit(100)
    .plot.line(x='fecha_origen_recorrido', y='trip_duration_minutes')
)


In [None]:
# Number of trips for each weekday name.
trips_by_day_df = (
    ecobici2024_df
    .group_by("day_name")
    .len(name='trip_count')
)
trips_by_day_df

In [None]:
# Trips per weekday name.
# A plain nominal axis orders the categories alphabetically (Friday, Monday, ...),
# so the calendar order is passed explicitly. The names are the English ones
# that the "%A" format used for the day_name column produces.
(
    alt.Chart(trips_by_day_df).mark_bar().encode(
        x=alt.X(
            "day_name",
            sort=[
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
            ],
        ),
        y="trip_count",
    ).properties(
        title="Trips by day in 2024",
        width=500,
    )
).show()

In [None]:
# Number of trips per calendar date; the cell output is the number of
# distinct dates present in the data.
trips_by_date_df = (
    ecobici2024_df
    .group_by("date")
    .len(name='trip_count_by_date')
)
len(trips_by_date_df)

In [None]:
# Bar chart of the daily trip totals, one bar per calendar date.
(
    alt.Chart(trips_by_date_df)
    .mark_bar()
    .encode(
        alt.X("date"),
        alt.Y("trip_count_by_date"),
    )
    .properties(title="Trips by day in 2024", width=600)
).show()

In [None]:
# Number of trips that started in each hour of the day.
trips_by_hour_df = (
    ecobici2024_df
    .group_by("top_of_hour")
    .len(name='trip_count_by_hour')
)

In [None]:
# Trips by hour of day (start hour on x, trip count on y).
(
    alt.Chart(trips_by_hour_df)
    .mark_bar()
    .encode(
        alt.X("top_of_hour"),
        alt.Y("trip_count_by_hour"),
    )
    .properties(title="Trips by hour in 2024", width=500)
).show()

In [None]:
# Number of trips per (start hour, gender) pair.
# Rows are counted with GroupBy.len(), the same way as the other aggregations
# in this notebook. Counting non-null values of top_of_hour instead would
# report 0 trips for any group whose start hour is null.
trips_by_gender_hour_df = (
    ecobici2024_df
    .group_by(['top_of_hour', "género"])
    .len(name='trips_per_hour')
)

In [None]:
# Preview the first rows of the per-hour, per-gender trip counts.
trips_by_gender_hour_df.head()

In [None]:
# Trips by hour of day, broken down by gender.
# The frame holds one row per (hour, gender). Hour goes on x as an ordinal
# axis and gender on the colour channel, so each hourly bar is stacked by
# gender. With only gender on x and no grouping channel, the per-hour bars
# would be drawn on top of each other and the hour would not appear at all.
(
    alt.Chart(trips_by_gender_hour_df).mark_bar().encode(
        x="top_of_hour:O",
        y="trips_per_hour",
        color="género",
    ).properties(
        title="Trips by hour in 2024",
        width=500,
    )
).show()

Calculate attributes: season, is_holiday, is_daylight, temperature, rain, wind