In [None]:
import pandas as pd

def calculate_ridership(df, timestamp, leaving_share=0.4):
    """
    Calculate the ridership leaving and staying per route for a given timestamp.

    Args:
        df: DataFrame with at least 'transit_timestamp' and 'ridership' columns.
        timestamp: Value compared for equality against 'transit_timestamp'.
        leaving_share: Fraction of 'ridership' counted as leaving. The
            remainder (1 - leaving_share) is counted as staying.

    Returns:
        A new DataFrame holding the rows of ``df`` at ``timestamp`` plus the
        computed 'leaving' and 'staying' columns. ``df`` itself is not modified.

    Raises:
        ValueError: If ``leaving_share`` is outside the range [0, 1].
    """
    if not 0 <= leaving_share <= 1:
        raise ValueError(f"leaving_share must be within [0, 1], got {leaving_share!r}")

    # Take an explicit copy: adding columns to a filtered slice is ambiguous
    # in pandas (chained assignment) and triggers SettingWithCopyWarning.
    subset = df[df['transit_timestamp'] == timestamp].copy()

    subset['leaving'] = subset['ridership'] * leaving_share
    subset['staying'] = subset['ridership'] * (1 - leaving_share)

    return subset

def distribute_ridership(df, timestamp):
    """
    Distribute the staying ridership based on the previous timestamp's ridership ratio.

    The total 'staying' ridership at ``timestamp`` is re-split across routes in
    proportion to each route's share of 'ridership' one day earlier.

    Args:
        df: DataFrame with 'transit_timestamp', 'routes', 'ridership' and
            'staying' columns. It must contain the rows for ``timestamp`` and,
            for any redistribution to happen, the rows for the day before.
        timestamp: Timestamp whose 'staying' values are redistributed.

    Returns:
        A new DataFrame with the rows of ``df`` at ``timestamp``. Routes that
        also appear in the previous day's rows get a recomputed 'staying'
        value; other rows keep theirs. When the previous day has no rows or
        its ridership sums to zero, 'staying' is returned unchanged.
    """
    # Copy so the .loc assignment below never writes through to (or warns
    # about) the caller's frame.
    current_data = df[df['transit_timestamp'] == timestamp].copy()
    prev_timestamp = pd.Timestamp(timestamp) - pd.DateOffset(days=1)  # assuming daily timestamps
    prev_data = df[df['transit_timestamp'] == prev_timestamp]

    # An empty selection sums to 0 as well, so this covers "no history" too.
    # Dividing by a zero total would yield NaN ratios and wipe out 'staying'.
    prev_total = prev_data['ridership'].sum()
    if prev_total == 0:
        return current_data

    total_current_staying = current_data['staying'].sum()

    # Calculate ratios from the previous timestamp and distribute current staying ridership
    prev_ratios = prev_data['ridership'] / prev_total
    for route, ratio in zip(prev_data['routes'], prev_ratios):
        current_data.loc[current_data['routes'] == route, 'staying'] = ratio * total_current_staying

    return current_data

def optimize_ridership(station_data):
    """
    Optimizes the ridership for a given station.

    For every distinct 'transit_timestamp' in ``station_data`` the rows at
    that timestamp are split into leaving/staying ridership and the staying
    part is then redistributed by ``distribute_ridership``.

    Args:
        station_data: DataFrame for one station with 'transit_timestamp',
            'routes' and 'ridership' columns.

    Returns:
        The per-timestamp results concatenated into one DataFrame. If
        ``station_data`` has no timestamps, an empty DataFrame with the same
        columns as ``station_data`` is returned.
    """
    timestamps = station_data['transit_timestamp'].unique()
    if len(timestamps) == 0:
        # pd.concat raises ValueError on an empty list, so short-circuit.
        return station_data.iloc[0:0].copy()

    optimized_data = []
    for timestamp in timestamps:
        ridership_data = calculate_ridership(station_data, timestamp)

        # distribute_ridership looks up the previous day's rows in the frame
        # it is given. Handing it only the current timestamp's rows would
        # leave that lookup empty, so the other timestamps are passed along.
        history = station_data[station_data['transit_timestamp'] != timestamp]
        if not history.empty:
            ridership_data = pd.concat([ridership_data, history])

        distributed_data = distribute_ridership(ridership_data, timestamp)
        optimized_data.append(distributed_data)

    return pd.concat(optimized_data)

# Sample dataframe (fill in your actual data here)
data = {
    'transit_timestamp': [],  # fill in your timestamps
    'station_complex_id': [],  # fill in your station IDs
    'routes': [],  # fill in your routes
    'ridership': [],  # fill in your ridership numbers
    #... (fill in other columns similarly)
}

df = pd.DataFrame(data)

# Optimize ridership for each station
stations = df['station_complex_id'].unique()
optimized_frames = [
    optimize_ridership(df[df['station_complex_id'] == station])
    for station in stations
]

# pd.concat raises ValueError on an empty list (which is what the unfilled
# sample data above produces), so fall back to an empty frame that keeps
# the input columns.
optimized_df = pd.concat(optimized_frames) if optimized_frames else df.iloc[0:0].copy()


In [None]:
import pandas as pd

def calculate_ridership(df, timestamp):
    """ Return the rows at `timestamp` with 'leaving' (40%) and 'staying' (60%) ridership columns added. """
    at_timestamp = df['transit_timestamp'] == timestamp
    rows = df.loc[at_timestamp]
    riders = rows['ridership']
    # assign() builds a new frame, so the caller's data is left untouched.
    return rows.assign(leaving=riders * 0.4, staying=riders * 0.6)

def distribute_ridership_based_on_history(df, timestamp):
    """ Distribute the staying ridership based on historical data from the previous timestamp.

    The total 'staying' ridership at `timestamp` is re-split in proportion to
    each previous-day row's share of that day's total 'ridership'. A current
    row is updated when its 'routes' string contains the previous row's
    'routes' string as a literal substring.

    Args:
        df: DataFrame with 'transit_timestamp', 'routes', 'ridership' and
            'staying' columns, containing the rows for `timestamp` and, for
            any redistribution to happen, the rows for the day before.
        timestamp: Timestamp whose 'staying' values are redistributed.

    Returns:
        A copy of the rows at `timestamp`. 'staying' is left unchanged when
        the previous day has no rows or its ridership sums to zero.
    """
    current_data = df[df['transit_timestamp'] == timestamp].copy()
    prev_timestamp = pd.Timestamp(timestamp) - pd.DateOffset(days=1)  # assuming daily timestamps
    prev_data = df[df['transit_timestamp'] == prev_timestamp]

    # An empty selection also sums to 0. Dividing by a zero total would give
    # NaN ratios and overwrite 'staying' with NaN.
    prev_total = prev_data['ridership'].sum()
    if prev_total == 0:
        return current_data

    total_current_staying = current_data['staying'].sum()
    prev_ratios = prev_data['ridership'] / prev_total

    for route, ratio in zip(prev_data['routes'], prev_ratios):
        if not isinstance(route, str):
            # Missing route labels cannot be used as a search pattern.
            continue
        # regex=False: match the route text literally rather than as a regex.
        # na=False: rows with a missing route must not produce NaN in the mask.
        matches = current_data['routes'].str.contains(route, regex=False, na=False)
        current_data.loc[matches, 'staying'] = ratio * total_current_staying

    return current_data

def optimize_ridership_for_station(station_data):
    """ Optimizes the ridership for a given station based on historical data.

    For every distinct 'transit_timestamp' the rows at that timestamp are split
    into leaving/staying ridership, and the staying part is then redistributed
    by `distribute_ridership_based_on_history`.

    Args:
        station_data: DataFrame for one station with 'transit_timestamp',
            'routes' and 'ridership' columns.

    Returns:
        The per-timestamp results concatenated into one DataFrame, or an empty
        DataFrame when `station_data` contains no timestamps.
    """
    timestamps = station_data['transit_timestamp'].unique()

    if not len(timestamps):  # checking if there are any unique timestamps
        print("No timestamps found for this station data:", station_data)
        return pd.DataFrame()  # return empty DataFrame

    optimized_data_frames = []
    for timestamp in timestamps:
        ridership_data = calculate_ridership(station_data, timestamp)

        # The distribution step looks up the previous day's rows in the frame
        # it receives. Passing only the current timestamp's rows would leave
        # that lookup empty, so the other timestamps are passed along.
        history = station_data[station_data['transit_timestamp'] != timestamp]
        if not history.empty:
            ridership_data = pd.concat([ridership_data, history])

        distributed_data = distribute_ridership_based_on_history(ridership_data, timestamp)
        optimized_data_frames.append(distributed_data)

    # At least one timestamp was processed, so the list is never empty here.
    return pd.concat(optimized_data_frames)

# NOTE(review): `test` is not defined in this cell; presumably a DataFrame
# created earlier in the notebook. Confirm before running in isolation.
df = test

# Dictionary for different lines and their respective station order:
station_order = {
    '1':['R192', 'R190', 'R188', 'R186', 'R185', 'R183', 'R182', 'R180', 'R178', 'R176', 'R174', 'R173', 'R172', 'R170', 'R168', 'R164', 'R162', 'R160', 'N049', 'R154', 'N060', 'R135', 'R133', 'R131', 'R129', 'N512', 'R125', 'R123', 'R119', 'R117', 'R116', 'R106', 'R103', 'R101'],  # using station_complex_id values
    '2': ['R336', 'R335', 'R334', 'R333', 'R332', 'R331', 'R330', 'R329', 'R328', 'R326', 'R325', 'R323', 'R322', 'R320', 'R318', 'R316', 'R314', 'R312', 'R310', 'R261', 'R306', 'R304', 'R302', 'R301', 'R168', 'R161B', 'N060', 'R135', 'N512', 'N094', 'N095', 'R111', 'R600', 'R602', 'R606', 'R608', 'R610', 'R617', 'R619', 'R621', 'R622', 'R635', 'R636', 'R637', 'R639', 'R641', 'R643', 'R645']
    #... add other lines as needed
}

optimized_dataframes = []

for line, station_ids in station_order.items():
    # Splitting the routes and optimizing each line separately
    for station_id in station_ids:
        station_data = df[df['station_complex_id'] == station_id]
        # Filtering data for the current line using str.contains() since 'routes' can have multiple lines.
        # regex=False matches the line id literally; na=False keeps rows with a
        # missing 'routes' value from putting NaN into the boolean mask.
        line_data = station_data[station_data['routes'].str.contains(line, regex=False, na=False)]
        optimized_data = optimize_ridership_for_station(line_data)
        # Stations with no rows for this line come back empty; leave them out
        # of the final concatenation.
        if not optimized_data.empty:
            optimized_dataframes.append(optimized_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
optimized_df = pd.concat(optimized_dataframes) if optimized_dataframes else pd.DataFrame()