We first import pandas to read, parse, store and manipulate our dataframe, followed by NumPy for matrix and math functions.

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Pandas_logo.svg/1920px-Pandas_logo.svg.png" width="512" height="207">

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/NumPy_logo_2020.svg/1920px-NumPy_logo_2020.svg.png" width="512" height="230">



In [63]:
import pandas as pd
import numpy as np

We start by reading the "dataset.csv" file into a dataframe. <br>
The delimiter used in the dataset is ';'.

In [64]:
# Load the raw dataset; fields in the file are separated by semicolons.
csv = pd.read_csv("dataset.csv", delimiter=";")

Remove all comment columns since they're not data

In [65]:
# Drop the three comment columns in a single call.
csv = csv.drop(
    columns=[
        "Cancellation comments",
        "Departure delay comments",
        "Arrival delay comments",
    ]
)

Remove all duplicates

In [66]:
# Keep the first occurrence of every fully identical row; the original index
# labels are preserved (no reset).
csv = csv.drop_duplicates(subset=None, keep="first")

# Clean Date column
The date format must be %Y-%m (a year and a month) <br>
We replace all wrong delimiters with a '-' <br>
We convert all the strings to datetimes in the expected format <br>
We mark all dates before 2000 or after today as missing (NaT) <br>

In [67]:
# Rewrite any single non-digit separator between the year and the month
# ("2018/01", "2018_01", "2018.01", ...) as "-". The separator is matched with
# \D rather than \w: \w only covers letters, digits and "_", so it missed
# punctuation delimiters and could swallow a digit as if it were a separator.
csv["Date"] = csv["Date"].astype(str).str.replace(r"(\d{4})\D(\d{2})", r"\1-\2", regex=True)
# Strings that still do not match %Y-%m are coerced to NaT instead of raising.
csv["Date"] = pd.to_datetime(csv["Date"], errors="coerce", format="%Y-%m")
# Dates before 2000 or later than today are treated as invalid and set to NaT.
today = pd.to_datetime("today").normalize()
csv.loc[(csv["Date"] < "2000-01-01") | (csv["Date"] > today), "Date"] = pd.NaT

Clean Service column

In [68]:
# Cast to the nullable string dtype explicitly. convert_dtypes() only takes
# boolean flags, so the former `str` argument was silently read as a truthy
# `infer_objects` and never selected a target dtype.
csv["Service"] = csv["Service"].astype("string")

Clean Departure station

In [69]:
# Cast to the nullable string dtype explicitly. convert_dtypes() only takes
# boolean flags, so the former `str` argument never selected a target dtype.
csv["Departure station"] = csv["Departure station"].astype("string")
# Flag names holding a digit with at least one character on each side of it.
# NOTE(review): a digit at the very start or end of a name is not flagged by
# this pattern - confirm that this is intended.
mask = csv["Departure station"].str.contains(r".+\d.+", na=False)
csv.loc[mask, "Departure station"] = np.nan

Clean Arrival station

In [70]:
# Cast to the nullable string dtype explicitly. convert_dtypes() only takes
# boolean flags, so the former `str` argument never selected a target dtype.
csv["Arrival station"] = csv["Arrival station"].astype("string")
# Flag names holding a digit with at least one character on each side of it.
# NOTE(review): a digit at the very start or end of a name is not flagged by
# this pattern - confirm that this is intended.
mask = csv["Arrival station"].str.contains(r".+\d.+", na=False)
csv.loc[mask, "Arrival station"] = np.nan

Clean Average journey time

In [71]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Average journey time"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Average journey time"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype, so the old call
# relied on inference. Parse explicitly instead: anything that is still not
# numeric becomes missing, and the result is always the nullable Float64 dtype.
csv["Average journey time"] = pd.to_numeric(csv["Average journey time"], errors="coerce").astype("Float64")
# A journey time cannot be negative.
csv.loc[csv["Average journey time"] < 0, "Average journey time"] = np.nan

Clean Number of scheduled trains

In [72]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Number of scheduled trains"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Number of scheduled trains"] = np.nan
# Parse to a numeric dtype first so the modulo below also works when the column
# was read as text; values that cannot be parsed become missing.
csv["Number of scheduled trains"] = pd.to_numeric(csv["Number of scheduled trains"], errors="coerce")
# A train count must be a whole number.
csv.loc[csv["Number of scheduled trains"] % 1 != 0, "Number of scheduled trains"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype: cast explicitly to
# the nullable integer dtype, which keeps the missing values.
csv["Number of scheduled trains"] = csv["Number of scheduled trains"].astype("Int64")

Clean Number of cancelled trains

In [73]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Number of cancelled trains"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Number of cancelled trains"] = np.nan
# Parse to a numeric dtype first so the modulo below also works when the column
# was read as text; values that cannot be parsed become missing.
csv["Number of cancelled trains"] = pd.to_numeric(csv["Number of cancelled trains"], errors="coerce")
# A train count must be a whole number.
csv.loc[csv["Number of cancelled trains"] % 1 != 0, "Number of cancelled trains"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype: cast explicitly to
# the nullable integer dtype, which keeps the missing values.
csv["Number of cancelled trains"] = csv["Number of cancelled trains"].astype("Int64")


Clean Number of trains delayed at departure

In [74]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Number of trains delayed at departure"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Number of trains delayed at departure"] = np.nan
# Parse to a numeric dtype first so the modulo and the comparison below also
# work when the column was read as text; unparseable values become missing.
csv["Number of trains delayed at departure"] = pd.to_numeric(csv["Number of trains delayed at departure"], errors="coerce")
# A train count must be a whole number.
csv.loc[csv["Number of trains delayed at departure"] % 1 != 0, "Number of trains delayed at departure"] = np.nan
# There cannot be more delayed trains than scheduled trains.
csv.loc[csv["Number of trains delayed at departure"] > csv["Number of scheduled trains"], "Number of trains delayed at departure"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype: cast explicitly to
# the nullable integer dtype, which keeps the missing values.
csv["Number of trains delayed at departure"] = csv["Number of trains delayed at departure"].astype("Int64")

Clean Average delay of late trains at departure

In [75]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Average delay of late trains at departure"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Average delay of late trains at departure"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype, so the old call
# relied on inference. Parse explicitly instead: anything that is still not
# numeric becomes missing, and the result is always the nullable Float64 dtype.
csv["Average delay of late trains at departure"] = pd.to_numeric(csv["Average delay of late trains at departure"], errors="coerce").astype("Float64")
# Negative values are treated as invalid for this column.
csv.loc[csv["Average delay of late trains at departure"] < 0, "Average delay of late trains at departure"] = np.nan

Clean Average delay of all trains at departure

In [None]:
# Entries whose text form contains a letter are not valid numbers: blank them.
numbers_with_letters = csv["Average delay of all trains at departure"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[numbers_with_letters, "Average delay of all trains at departure"] = np.nan
# convert_dtypes() takes boolean flags, not a target dtype, so the old call
# relied on inference. Parse explicitly instead: anything that is still not
# numeric becomes missing, and the result is always the nullable Float64 dtype.
# Negative values are kept for this column (no sign filter is applied here).
csv["Average delay of all trains at departure"] = pd.to_numeric(csv["Average delay of all trains at departure"], errors="coerce").astype("Float64")
# Print the per-column non-null counts and dtypes to check the cleaning so far.
csv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10662 entries, 0 to 10839
Data columns (total 23 columns):
 #   Column                                                                         Non-Null Count  Dtype         
---  ------                                                                         --------------  -----         
 0   Date                                                                           9874 non-null   datetime64[ns]
 1   Service                                                                        10129 non-null  string        
 2   Departure station                                                              10091 non-null  string        
 3   Arrival station                                                                10101 non-null  string        
 4   Average journey time                                                           9842 non-null   Float64       
 5   Number of scheduled trains                                                     9330 no

Clean Number of trains delayed at arrival

Clean Average delay of late trains at arrival

Clean Average delay of all trains at arrival

Clean Number of trains delayed > 15 min

Clean Average delay of trains > 15min (if competing with flights)

Clean Number of trains delayed > 30min

Clean Number of trains delayed > 60min

Clean Pct delay due to external causes

Clean Pct delay due to infrastructure

Clean Pct delay due to traffic management

Clean Pct delay due to rolling stock

Clean Pct delay due to station management and equipment reuse

Clean Pct delay due to passenger handling (crowding, disabled persons, connections)