# Berlin Public Transport Analysis 🚇

## Overview
This notebook analyzes simulated (randomly generated) public transport data for Berlin to identify patterns in passenger volumes and delays. The goal is to gain basic insights using Python for data cleaning, exploration, and visualization.

In [None]:

# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Simulate the Data
# Seed NumPy's legacy global RNG so a fresh "Restart & Run All" reproduces
# exactly the same table.
np.random.seed(42)
num_rows = 1000

# NOTE: the four np.random draws below consume the seeded global stream in
# dict-literal order (Route, Station, Passenger_Count, Delay_Minutes).
# Reordering them changes the simulated values.
data = {
    # Route label, drawn uniformly from five options.
    "Route": np.random.choice(["U1", "U2", "U3", "U4", "U5"], num_rows),
    # Station name, drawn uniformly and independently of the route.
    "Station": np.random.choice(
        ["Alexanderplatz", "Potsdamer Platz", "Hauptbahnhof", "Friedrichstraße", "Zoo"], num_rows
    ),
    # Integer count in [50, 500): randint excludes the upper bound.
    "Passenger_Count": np.random.randint(50, 500, num_rows),
    # Delay in minutes; the weights in `p` make a 0-minute delay the most likely (60%).
    "Delay_Minutes": np.random.choice([0, 1, 2, 5, 10], num_rows, p=[0.6, 0.2, 0.1, 0.05, 0.05]),
    # One timestamp per row at hourly spacing, starting 2024-01-01 00:00.
    # Lowercase "h" is the hourly alias; uppercase "H" is deprecated since
    # pandas 2.2 and emits a FutureWarning.
    "DateTime": pd.date_range(start="2024-01-01", periods=num_rows, freq="h"),
}

df = pd.DataFrame(data)
print("Preview of the data:")
print(df.head())

# Basic statistics
print("\nSummary statistics:")
print(df.describe())
    

## Visualizations

In [None]:

# Average Passengers by Route
# Mean passenger count per route, drawn as a bar chart on an explicit Axes
# rather than through the implicit pyplot "current figure".
avg_passengers_by_route = df.groupby("Route")["Passenger_Count"].mean()

route_fig, route_ax = plt.subplots(figsize=(8, 5))
avg_passengers_by_route.plot.bar(ax=route_ax, title="Average Passenger Count by Route")
route_ax.set_ylabel("Average Passengers")
route_ax.grid(axis="y")  # horizontal guide lines only
plt.show()

# Average Delay by Station
# Mean delay (minutes) per station, drawn as an orange bar chart on an
# explicit Axes.
avg_delay_by_station = df.groupby("Station")["Delay_Minutes"].mean()

station_fig, station_ax = plt.subplots(figsize=(8, 5))
avg_delay_by_station.plot.bar(ax=station_ax, color="orange", title="Average Delay by Station")
station_ax.set_ylabel("Average Delay (Minutes)")
# Tilt the station names so they stay readable.
plt.setp(station_ax.get_xticklabels(), rotation=45)
station_ax.grid(axis="y")  # horizontal guide lines only
plt.show()

# Passenger and Delay Trends by Hour
# Derive the hour of day (0-23) from each timestamp and average both metrics
# per hour. The 'Hour' column is added to `df` itself.
df['Hour'] = df['DateTime'].dt.hour
avg_delay_by_hour = df.groupby('Hour')['Delay_Minutes'].mean()
avg_passenger_by_hour = df.groupby('Hour')['Passenger_Count'].mean()

# The two series live on very different scales (delays are 0-10 minutes,
# passenger counts are 50-499), so each gets its own y-axis. On a shared axis
# the delay line is squashed flat against the bottom and cannot be read.
trend_fig, delay_ax = plt.subplots(figsize=(10, 6))
passenger_ax = delay_ax.twinx()  # second y-axis sharing the same x-axis

(delay_line,) = delay_ax.plot(
    avg_delay_by_hour.index,
    avg_delay_by_hour.values,
    label="Average Delay (Minutes)",
    marker="o",
)
(passenger_line,) = passenger_ax.plot(
    avg_passenger_by_hour.index,
    avg_passenger_by_hour.values,
    label="Average Passengers",
    marker="x",
    color="orange",
)

delay_ax.set_title("Passenger and Delay Trends by Hour")
delay_ax.set_xlabel("Hour of Day")
delay_ax.set_ylabel("Average Delay (Minutes)")
passenger_ax.set_ylabel("Average Passengers")
delay_ax.grid()

# One combined legend for both axes. It is attached to the twin axis, which is
# drawn on top, so the plotted lines do not cover it.
passenger_ax.legend(handles=[delay_line, passenger_line])
plt.show()
    