Selecting the most popular Bicing stations (by number of checkouts and average occupancy rate)

In [None]:
def select_top_stations(df, desired_num_stations: int = 50) -> list:
    """
    Select the top 'desired_num_stations' stations by total checkouts,
    listing first those that are also in the top by average occupancy.

    A checkout is counted whenever 'num_bikes_available' drops between two
    consecutive reports of the same station (ordered by 'last_reported').
    Occupancy of a record is available bikes divided by total capacity
    (bikes + docks).

    Note: the stations common to both rankings are always a subset of the
    checkout ranking, and the remaining slots are filled from the checkout
    ranking, so the returned set is the top stations by checkouts. The
    occupancy ranking only decides which of them are listed first.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing Bicing data with columns:
                           'last_reported' (unix timestamp), 'station_id',
                           'num_bikes_available', and 'num_docks_available'.
                           It is not modified.
        desired_num_stations (int): The number of top stations to select.

    Returns:
        final_stations (list): station_ids of the selected stations. Stations
            present in both rankings come first, then the remaining top-checkout
            stations; each group is ordered by descending checkouts, so the
            result is deterministic.
    """
    needed_columns = ['station_id', 'last_reported',
                      'num_bikes_available', 'num_docks_available']

    # Work on a copy of just the columns we use, leaving the caller's frame untouched
    data = df[needed_columns].copy()

    # Convert 'last_reported' to datetime and order reports chronologically per station
    data['last_reported'] = pd.to_datetime(data['last_reported'], unit='s')
    data = data.sort_values(['station_id', 'last_reported'])

    # Change in available bikes between consecutive reports of the same station
    bike_diff = data.groupby('station_id')['num_bikes_available'].diff()

    # Only drops count as checkouts; rises and the first report of each
    # station (NaN diff) contribute 0. Vectorised instead of a row-wise lambda.
    data['checkouts'] = bike_diff.where(bike_diff < 0, 0).abs()

    # Occupancy = available bikes / total capacity. Records with a non-positive
    # capacity are treated as missing so they cannot skew the station mean.
    capacity = data['num_bikes_available'] + data['num_docks_available']
    data['occupancy'] = data['num_bikes_available'] / capacity.where(capacity > 0)

    # One row per station: total checkouts and mean occupancy
    station_stats = (
        data.groupby('station_id')
        .agg(checkouts=('checkouts', 'sum'), occupancy=('occupancy', 'mean'))
        .reset_index()
    )

    # Rank stations by checkouts (stable sort keeps ties in station_id order)
    top_checkouts = station_stats.sort_values(
        by='checkouts', ascending=False, kind='mergesort'
    ).head(desired_num_stations)
    print(f"Top {desired_num_stations} by checkouts:")
    print(top_checkouts)

    # Rank stations by occupancy
    top_occupancy = station_stats.sort_values(
        by='occupancy', ascending=False, kind='mergesort'
    ).head(desired_num_stations)
    print(f"Top {desired_num_stations} by occupancy:")
    print(top_occupancy)

    # Stations present in both rankings, kept in checkout-rank order rather
    # than arbitrary set order so the output is reproducible
    checkout_ranked = top_checkouts['station_id'].tolist()
    top_occupancy_ids = set(top_occupancy['station_id'])
    common_top_stations = [s for s in checkout_ranked if s in top_occupancy_ids]
    print("Common top stations:", common_top_stations)

    # Fill up the final station list until the desired count is reached,
    # prioritizing stations from the top checkouts ranking
    final_stations = list(common_top_stations)
    already_selected = set(final_stations)  # O(1) membership checks
    for station in checkout_ranked:
        if len(final_stations) >= desired_num_stations:
            break
        if station not in already_selected:
            final_stations.append(station)
            already_selected.add(station)

    print("Final selected stations:", final_stations)

    return final_stations