Initializes the StationData object by processing a given CSV DataFrame and filtering it by a date range.

- Filters out rows where the "Date" column is missing.
- Keeps only the rows where the "Date" falls within the specified range, both endpoints included (date[0] to date[1]).
- Aggregates train data (scheduled, cancelled, and late trains) for each departure station.
- Creates a DataFrame summarizing the total number of scheduled, cancelled, and late trains per station.
- Stores this summarized data in self.df.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class StationData:
    """Per-station totals of scheduled, cancelled and late-departing trains.

    After construction, ``self.df`` holds one row per departure station with
    the columns "Departure station", "Scheduled", "Cancelled" and "Late".
    Stations appear in the order in which they are first seen in the input.
    """

    # Source CSV column -> column name used in the summary frame.
    _COUNT_COLUMNS = {
        "Number of scheduled trains": "Scheduled",
        "Number of cancelled trains": "Cancelled",
        "Number of trains delayed at departure": "Late",
    }

    def __init__(self, csv, date):
        """Aggregate the rows of *csv* whose "Date" lies in the given range.

        Args:
            csv: DataFrame with the "Date" and "Departure station" columns
                and the three count columns listed in ``_COUNT_COLUMNS``.
            date: Pair ``(start, end)``; both bounds are inclusive and are
                compared directly against the "Date" column.
        """
        rows = csv.dropna(subset=["Date"])
        rows = rows[(rows["Date"] >= date[0]) & (rows["Date"] <= date[1])]
        # A record only counts when the station and all three counters are
        # present; incomplete records are skipped entirely.
        rows = rows.dropna(subset=["Departure station", *self._COUNT_COLUMNS])
        # sort=False keeps first-appearance order instead of sorting by name.
        totals = (
            rows.groupby("Departure station", sort=False)[list(self._COUNT_COLUMNS)]
            .sum()
            .rename(columns=self._COUNT_COLUMNS)
        )
        self.df = totals.reset_index()


Generates a horizontal bar chart showing the number of scheduled, cancelled, and late trains for a given list of stations.

- Filters the internal DataFrame to include only rows corresponding to the stations in station_list.
- Removes all other stations by setting them to NaN and dropping them.
- Plots three horizontal bars per station (Scheduled, Cancelled, Late) with different colors.
- Customizes the chart with labels, a title, and a legend for clarity.
- Displays the final plot.

In [None]:
def station_scheduled_late(self, station_list):
    """Plot scheduled, cancelled and late train counts for selected stations.

    Draws a grouped horizontal bar chart (three bars per station) from
    ``self.df`` and shows it with ``plt.show()``.

    Args:
        station_list: Station names to include; rows of ``self.df`` whose
            "Departure station" is not in this collection are left out.
    """
    # Select with a boolean mask instead of overwriting names with NaN:
    # the mask yields a new frame, so self.df keeps every station and the
    # method can be called again with a different selection.
    df = self.df[self.df["Departure station"].isin(station_list)]
    pos = np.arange(len(df))
    width = 0.5
    bar_height = width / 3
    plt.barh(pos + bar_height, df["Late"], bar_height, color='lightsteelblue', label='Late')
    plt.barh(pos, df["Scheduled"], bar_height, color='IndianRed', label='Scheduled')
    plt.barh(pos - bar_height, df["Cancelled"], bar_height, color='Blue', label='Cancelled')
    plt.yticks(pos, df["Departure station"])
    plt.xlabel("Number of Trains", fontsize=14)
    plt.title("Scheduled, Cancelled, and Late Trains per Station", fontsize=15)
    plt.legend()
    plt.tight_layout()
    plt.show()
StationData.station_scheduled_late = station_scheduled_late

Initializes the LateData object by processing a CSV DataFrame containing train delay information within a specified date range.

- Filters out rows with missing "Date" values and retains only those whose "Date" lies within the provided date range (both endpoints included).
- Iterates through each row and aggregates delay-related data per departure station:
    - Total number of trains delayed at arrival.
    - Number of trains delayed more than 15, 30, and 60 minutes.
    - Cumulative counts and delay cause percentages (if available) for each station.
- Computes average delay percentages per station for various causes:
    - Passenger handling
    - Station management
    - Rolling stock
    - Traffic management
    - Infrastructure
    - External causes
- Stores the cleaned and summarized data in self.df, dropping intermediate cumulative fields used for calculations.

In [None]:
class LateData():
    """Per-station arrival-delay totals and average delay-cause percentages.

    After construction, ``self.df`` holds one row per departure station with
    the columns "station", "trainlate", "late15", "late30", "late60" and the
    six ``pct_*`` columns named in ``_CAUSE_COLUMNS``. Stations appear in
    the order in which they are first seen in the input.
    """

    # Summary column -> source CSV column holding a delayed-train count.
    _COUNT_COLUMNS = {
        "trainlate": "Number of trains delayed at arrival",
        "late15": "Number of trains delayed > 15min",
        "late30": "Number of trains delayed > 30min",
        "late60": "Number of trains delayed > 60min",
    }

    # Summary column -> source CSV column holding a delay-cause percentage.
    _CAUSE_COLUMNS = {
        "pct_passenger": "Pct delay due to passenger handling (crowding, disabled persons, connections)",
        "pct_station": "Pct delay due to station management and equipment reuse",
        "pct_stock": "Pct delay due to rolling stock",
        "pct_management": "Pct delay due to traffic management",
        "pct_infra": "Pct delay due to infrastructure",
        "pct_others": "Pct delay due to external causes",
    }

    def __init__(self, csv, date):
        """Aggregate the rows of *csv* whose "Date" lies in the given range.

        Rows missing the departure station or any of the four delay counts
        are ignored. Cause percentages are averaged over the rows of a
        station that have a non-missing "Total Pct"; a missing individual
        cause value (or a missing cause column) contributes 0. Stations
        without any such row get 0 for every cause.

        Args:
            csv: DataFrame with "Date", "Departure station" and the count
                columns listed in ``_COUNT_COLUMNS``; "Total Pct" and the
                cause columns are optional.
            date: Pair ``(start, end)``; both bounds are inclusive and are
                compared directly against the "Date" column.
        """
        rows = csv.dropna(subset=["Date"])
        rows = rows[(rows["Date"] >= date[0]) & (rows["Date"] <= date[1])]
        rows = rows.dropna(subset=["Departure station", *self._COUNT_COLUMNS.values()])
        # A fresh unique index keeps the column-wise operations below aligned
        # even when the caller's frame has duplicate index labels.
        rows = rows.reset_index(drop=True)

        if "Total Pct" in rows.columns:
            has_pct = rows["Total Pct"].notna()
        else:
            has_pct = pd.Series(False, index=rows.index)

        work = pd.DataFrame({"station": rows["Departure station"]})
        for name, source in self._COUNT_COLUMNS.items():
            work[name] = rows[source]
        # Number of rows that actually carry cause percentages; this, not the
        # total row count, is the denominator of the averages.
        work["pct_rows"] = has_pct.astype(int)
        for name, source in self._CAUSE_COLUMNS.items():
            if source in rows.columns:
                # An empty cell must count as 0: adding NaN would turn the
                # whole station's sum into NaN.
                work[name] = rows[source].fillna(0).where(has_pct, 0)
            else:
                work[name] = 0.0

        # sort=False keeps first-appearance order instead of sorting by name.
        totals = work.groupby("station", sort=False).sum()
        pct_rows = totals.pop("pct_rows")
        denominator = pct_rows.where(pct_rows > 0)
        for name in self._CAUSE_COLUMNS:
            totals[name] = (totals[name] / denominator).fillna(0.0)
        self.df = totals.reset_index()


Displays a stacked horizontal bar chart showing the distribution of train delay durations for a given list of stations.

- Filters the internal DataFrame to include only rows for the specified stations.
- Calculates the number of trains delayed:
    - Less than 15 minutes
    - Between 15 and 30 minutes
    - Between 30 and 60 minutes
    - More than 60 minutes
- Converts these counts into percentages of each station's total number of trains delayed at arrival.
- Creates a stacked horizontal bar chart for each station to visualize delay duration categories.
- Adds a legend and axis labels for clarity.

In [None]:
def late_train_duration(self, station_list):
    """Plot, per station, how delayed arrivals split across duration bands.

    For every selected station with at least one train delayed at arrival,
    draws one stacked horizontal bar whose segments are the share (in %) of
    delayed trains that were late by less than 15 minutes, 15 to 30 minutes,
    30 to 60 minutes and more than 60 minutes, then shows the figure.

    Args:
        station_list: Station names to include; rows of ``self.df`` whose
            "station" is not in this collection are left out.
    """
    selected = self.df["station"].isin(station_list) & (self.df["trainlate"] > 0)
    # .copy() gives a private frame; self.df is never modified.
    df = self.df[selected].copy()
    total = df["trainlate"]

    # The source counters are cumulative ("> 15min" includes "> 30min", ...),
    # so neighbouring counters are subtracted to obtain disjoint bands.
    under15_pct = (total - df["late15"]) / total * 100
    band15_pct = (df["late15"] - df["late30"]) / total * 100
    band30_pct = (df["late30"] - df["late60"]) / total * 100
    band60_pct = df["late60"] / total * 100

    pos = np.arange(len(df))
    plt.barh(pos, under15_pct, color='green', label='Delay <15 min')
    plt.barh(pos, band15_pct, color='lightsteelblue', label='Delay 15–30 min',
             left=under15_pct)
    plt.barh(pos, band30_pct, color='IndianRed', label='Delay 30–60 min',
             left=under15_pct + band15_pct)
    plt.barh(pos, band60_pct, color='Blue', label='Delay >60 min',
             left=under15_pct + band15_pct + band30_pct)
    plt.yticks(pos, df["station"])
    plt.xlabel("Percentage of Delayed Trains (%)", fontsize=14)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Historical attribute name; kept so existing callers keep working.
LateData.late_train_data = late_train_duration
# Also expose the method under the function's own name.
LateData.late_train_duration = late_train_duration



Displays a pie chart for each station in the provided list, showing the distribution of delay causes as percentages.

- Filters the internal DataFrame to include only the selected stations.
- For each station:
    - Retrieves average delay percentages by cause (passenger, station management, rolling stock, traffic management, infrastructure, external).
    - Skips the station if all values are missing or zero.
    - Displays a pie chart with the proportional contribution of each delay cause.
- Adds labels, a title, and formatting for better readability.
- Prints a message if no matching stations are found or if a station lacks delay cause data.

In [None]:
def late_train_pct(self, station_list):
    """Show one pie chart of delay causes for each selected station.

    Prints a message and returns when none of the requested stations is
    present in ``self.df``. A station whose cause shares are all missing or
    sum to zero is reported with a message and skipped.

    Args:
        station_list: Station names to include; rows of ``self.df`` whose
            "station" is not in this collection are left out.
    """
    df = self.df[self.df["station"].isin(station_list)]
    if df.empty:
        print("No matching station found in the data.")
        return

    cause_columns = ["pct_passenger", "pct_station", "pct_stock", "pct_management", "pct_infra", "pct_others"]
    labels = ["Passenger Delay", "Station Management", "Rolling Stock", "Traffic Management", "Infrastructure", "External"]
    colors = plt.get_cmap('Blues')(np.linspace(0, 1, len(labels)))

    for _, row in df.iterrows():
        station = row["station"]
        x = np.array([row[column] for column in cause_columns], dtype=float)
        # Treat a missing share as 0 so that a partially filled row can still
        # be drawn; a NaN wedge size would otherwise reach ax.pie.
        x = np.where(np.isnan(x), 0.0, x)
        if x.sum() == 0:
            print(f"No delay cause data available for {station}.")
            continue

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.pie(
            x, labels=labels, colors=colors, autopct='%1.1f%%',
            wedgeprops={"linewidth": 1, "edgecolor": "white"},
            startangle=140
        )
        ax.set_title(f"Distribution of Delay Causes for {station}")
        plt.tight_layout()
        plt.show()

LateData.late_train_pct = late_train_pct

In [None]:
class Predict():
    """Rows of a train DataFrame restricted to one departure/arrival pair.

    After construction, ``self.csv`` holds the matching rows (original index
    labels preserved).
    """

    def __init__(self, csv, arrival_station, departure_station):
        """Keep the rows of *csv* for the given route.

        Args:
            csv: DataFrame with "Arrival station" and "Departure station"
                columns. It is not modified.
            arrival_station: Value the "Arrival station" column must equal.
            departure_station: Value the "Departure station" column must equal.
        """
        on_route = (
            (csv["Arrival station"] == arrival_station)
            & (csv["Departure station"] == departure_station)
        )
        # Select with a mask and copy: assigning NaN through .loc on the
        # caller's frame would destroy its station names for every later use.
        self.csv = csv[on_route].copy()


In [None]:
def moy(self, type):
    """Return the mean of the non-missing values in column *type*.

    Args:
        type: Label of the column of ``self.csv`` to average. (The name
            shadows the builtin; it is kept so keyword callers still work.)

    Returns:
        The arithmetic mean as a float, or 0 when the column has no
        non-missing value.
    """
    # Drop missing values on a local Series only: reassigning self.csv here
    # would permanently discard rows that other columns still need.
    values = self.csv[type].dropna()
    if values.empty:
        return 0
    return float(values.mean())
# Attach the module-level function to the class so it is callable as
# a method on Predict instances.
Predict.moy = moy

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(self, type1, type2):
    """Return the test RMSE of a cubic fit of column *type2* on *type1*.

    Rows missing either column are ignored. The first 80% of the remaining
    rows (in frame order) are used to fit a degree-3 polynomial; the root
    mean squared error is computed on the last 20%.

    Args:
        type1: Label of the predictor column of ``self.csv``.
        type2: Label of the target column of ``self.csv``.

    Returns:
        The root mean squared error on the held-out rows.

    Raises:
        ValueError: If fewer than two rows have both columns present, since
            either the training or the test part would then be empty.
    """
    csv = self.csv.dropna(subset=[type1, type2])
    x = csv[type1]
    y = csv[type2]

    if len(x) < 2:
        raise ValueError(
            f"rmse needs at least two rows with both {type1!r} and {type2!r}; got {len(x)}"
        )

    split_index = int(len(x) * 0.8)
    # .iloc makes the split explicitly positional, whatever the index labels.
    train_x, test_x = x.iloc[:split_index], x.iloc[split_index:]
    train_y, test_y = y.iloc[:split_index], y.iloc[split_index:]

    model = np.poly1d(np.polyfit(train_x, train_y, 3))

    mse = mean_squared_error(test_y, model(test_x))
    return np.sqrt(mse)

# Attach the module-level function to the class so it is callable as
# a method on Predict instances.
Predict.rmse = rmse


In [None]:
from sklearn.metrics import r2_score

def r2(self, type1, type2):
    """Return the test R² score of a cubic fit of column *type2* on *type1*.

    Rows missing either column are ignored. The first 80% of the remaining
    rows (in frame order) are used to fit a degree-3 polynomial; the score
    is computed with ``r2_score`` on the last 20%.

    Args:
        type1: Label of the predictor column of ``self.csv``.
        type2: Label of the target column of ``self.csv``.

    Returns:
        The value returned by ``r2_score`` for the held-out rows.

    Raises:
        ValueError: If fewer than two rows have both columns present, since
            either the training or the test part would then be empty.
    """
    csv = self.csv.dropna(subset=[type1, type2])
    x = csv[type1]
    y = csv[type2]

    if len(x) < 2:
        raise ValueError(
            f"r2 needs at least two rows with both {type1!r} and {type2!r}; got {len(x)}"
        )

    split_index = int(len(x) * 0.8)
    # .iloc makes the split explicitly positional, whatever the index labels.
    train_x, test_x = x.iloc[:split_index], x.iloc[split_index:]
    train_y, test_y = y.iloc[:split_index], y.iloc[split_index:]

    model = np.poly1d(np.polyfit(train_x, train_y, 3))

    return r2_score(test_y, model(test_x))

# Attach the module-level function to the class so it is callable as
# a method on Predict instances.
Predict.r2 = r2