## Notebook for processing TfL CID bike sharing data

This is a simple helper notebook which reads in raw data as downloaded from https://cycling.data.tfl.gov.uk/

Please refer to the report for a description of the data.


In [None]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the files this notebook reads and writes.

# Output: pickle file that receives the list of per-day adjacency matrices.
store_adjacency_matrices_file = "output/am_bikes.pkl"

# Input: the two raw journey extracts (the file names indicate 01-14 May 2025
# and 14-31 May 2025).
may_2025_data_part1 = "data/419JourneyDataExtract01May2025-14May2025.csv"
may_2025_data_part2 = "data/420JourneyDataExtract14May2025-31May2025.csv"


In [None]:
# Data load and preparation routine.  

# Step 1: load and join separate data sets

# Load the extracts so that the numbered variables match the numbered files
# (part 1 -> df1, part 2 -> df2) and are concatenated in that order.
full_bike_df1 = pd.read_csv(may_2025_data_part1)
full_bike_df2 = pd.read_csv(may_2025_data_part2)

# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# both halves keep their own 0-based labels and the index contains duplicates.
full_bike_df = pd.concat([full_bike_df1, full_bike_df2], ignore_index=True)

# Number of raw journey records before any filtering.
n_full = len(full_bike_df)



# Step 2: remove self loops (journeys that start and end at the same station)
# and keep only the columns needed downstream.
is_between_stations = (
    full_bike_df["Start station number"] != full_bike_df["End station number"]
)
# .copy() makes bike_df an independent frame, so the column rename and the
# new "tf" column below neither touch full_bike_df nor trigger pandas'
# SettingWithCopyWarning.
bike_df = full_bike_df.loc[
    is_between_stations,
    ["Start date", "Start station number", "End station number"],
].copy()

# Number of journey records left after dropping self loops.
n_filtered = len(bike_df)


# Step 3: Rename the selected columns
bike_df.columns = ["timestamp", "source", "target"]


# Step 4:  Set zero-based day of month as time frame index, t in [0,..,30]
bike_df["tf"] = pd.to_datetime(bike_df["timestamp"]).dt.day - 1


# Step 5: Generate one symmetric 0/1 adjacency matrix per day.
# node_list fixes the row/column order shared by every matrix.
node_list = list(set(bike_df["source"].unique().tolist() + bike_df["target"].unique().tolist()))

# Map station -> matrix position once, instead of scanning node_list with
# list.index() for every journey.
node_position = {node: position for position, node in enumerate(node_list)}

adjacency_matrices = []
# groupby sorts its keys, so the matrices are appended in ascending day order
# regardless of the order in which rows appear in bike_df.
for _, day_df in bike_df.groupby("tf", sort=True):
    adjacency_matrix = np.zeros((len(node_list), len(node_list)))
    sources = day_df["source"].map(node_position).to_numpy(dtype=int)
    targets = day_df["target"].map(node_position).to_numpy(dtype=int)
    adjacency_matrix[sources, targets] = 1
    adjacency_matrix[targets, sources] = 1  # ensure symmetry
    adjacency_matrices.append(adjacency_matrix)

# double check for no self loops, on every day rather than only the first
if any((np.diag(am) > 0).any() for am in adjacency_matrices):
    raise ValueError("Self loops in adjacency matrix")


# Step 6:  Summarize some stats for end user

T = len(adjacency_matrices)         # number of daily snapshots
n = adjacency_matrices[0].shape[0]  # number of nodes (stations)

# Number of undirected edges per day. Each edge is stored in both (i, j) and
# (j, i), so the matrix sum counts it twice.
edges_remaining = np.array([
        am.sum()/2
        for am in adjacency_matrices
    ])

edges_in_dynamic_network = edges_remaining.sum()

# An undirected graph without self loops has at most n*(n-1)/2 edges, so that
# is the denominator for the density of undirected edge counts.
max_possible_edges = n * (n - 1) / 2
edge_density = edges_remaining / max_possible_edges
avg_edge_density = edge_density.mean()
min_edge_density = edge_density.min()
max_edge_density = edge_density.max()

# Every undirected edge contributes to the degree of two nodes, hence 2*E;
# averaged over all nodes and all days.
avg_node_degree = 2 * edges_in_dynamic_network / (T*n)

# summarize for end user

print("Loaded and processed data")
print("Number of days: ".rjust(40), T)
print("Number of nodes: ".rjust(40), n)
print("Initial number of edges: ".rjust(40), n_full)
print("Number of edges after filtering: ".rjust(40), n_filtered)
print("Number of edges in dynamic network: ".rjust(40), int(edges_in_dynamic_network))
print("Min/Mean/Max edge density: ".rjust(40), round(min_edge_density, 3), "/", round(avg_edge_density, 3), "/", round(max_edge_density, 3))
print("Average node degree: ".rjust(40), round(avg_node_degree, 2))

# Step 7:  Save as pickle file for quick reference
import os
import pickle

# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first ("." covers a path without a directory part).
os.makedirs(os.path.dirname(store_adjacency_matrices_file) or ".", exist_ok=True)

# The pickle holds the list of per-day adjacency matrices.
with open(store_adjacency_matrices_file, "wb") as f:
    pickle.dump(adjacency_matrices, f)
