We first import pandas to read, parse, store and do anything to our dataframe followed by numpy for matrices and math functions

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Pandas_logo.svg/1920px-Pandas_logo.svg.png" width="512" height="207">

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/NumPy_logo_2020.svg/1920px-NumPy_logo_2020.svg.png" width="512" height="230">



In [59]:
import pandas as pd
import numpy as np

We start by reading the "dataset.csv" file and make it a dataframe <br>
The delimiter in the dataset is ';'

In [60]:
csv = pd.read_csv("dataset.csv", sep=';')

Remove all comment columns since they're not data

In [61]:
csv = csv.drop("Cancellation comments", axis=1)
csv = csv.drop("Departure delay comments", axis=1)
csv = csv.drop("Arrival delay comments", axis=1)

Remove all duplicates

In [62]:
csv = csv.drop_duplicates()

# Clean Date column
The date format must be %Y-%m (A year and a month) <br>
We replace all wrong delimiters by a '-' <br>
We convert all the strings to datetimes under the wanted format <br>
We exclude all data from before 2000 and after today <br>

In [63]:
csv["Date"] = csv["Date"].astype(str).str.replace(r"(\d{4})\w(\d{2})", r"\1-\2", regex=True)
csv["Date"] = pd.to_datetime(csv["Date"], errors="coerce", format="%Y-%m")
today = pd.to_datetime("today").normalize()
csv.loc[(csv.Date < "2000-01-01") | (csv.Date > today), "Date"] = pd.NaT

Clean Service column

In [64]:
csv["Service"] = csv["Service"].convert_dtypes(str)

Clean Departure station

In [65]:
csv["Departure station"] = csv["Departure station"].convert_dtypes(str)
mask = csv["Departure station"].str.contains(r".+\d.+", na=False)
csv.loc[mask, "Departure station"] = np.nan

Clean Arrival station

In [66]:
csv["Arrival station"] = csv["Arrival station"].convert_dtypes(str)
mask = csv["Arrival station"].str.contains(r".+\d.+", na=False)
csv.loc[mask, "Arrival station"] = np.nan

Clean Average journey time

In [67]:
mask = csv["Average journey time"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[mask, "Average journey time"] = np.nan
csv["Average journey time"] = csv["Average journey time"].convert_dtypes(float)
csv["Average journey time"] = csv["Average journey time"] < 0

Clean Number of scheduled trains

In [None]:
mask = csv["Number of scheduled trains"].astype(str).str.contains(r"[a-zA-Z]", na=False)
csv.loc[mask, "Number of scheduled trains"] = np.nan
csv["Number of scheduled trains"] = (csv["Number of scheduled trains"] % 1 == 0)

9330


Clean Number of cancelled trains

In [69]:
csv = csv.dropna(subset="Number of cancelled trains")
csv["Number of cancelled trains"] = csv["Number of cancelled trains"].convert_dtypes()

Clean Number of trains delayed at departure

In [70]:
csv = csv.dropna(subset="Number of trains delayed at departure")
csv["Number of trains delayed at departure"] = csv["Number of trains delayed at departure"].convert_dtypes()

Clean Average delay of late trains at departure

In [71]:
csv = csv.dropna(subset="Average delay of late trains at departure")
csv["Average delay of late trains at departure"] = csv["Average delay of late trains at departure"].convert_dtypes()

Clean Average delay of all trains at departure

In [72]:
csv = csv.dropna(subset="Average delay of all trains at departure")
csv["Average delay of all trains at departure"] = csv["Average delay of all trains at departure"].convert_dtypes()

Clean Number of trains delayed at arrival

In [73]:
csv = csv.dropna(subset="Number of trains delayed at arrival")
csv["Number of trains delayed at arrival"] = csv["Number of trains delayed at arrival"].convert_dtypes()

Clean Average delay of late trains at arrival

In [74]:
csv = csv.dropna(subset="Average delay of late trains at arrival")
csv["Average delay of late trains at arrival"] = csv["Average delay of late trains at arrival"].convert_dtypes()

Clean Average delay of all trains at arrival

In [75]:
csv = csv.dropna(subset="Average delay of all trains at arrival")
csv["Average delay of all trains at arrival"] = csv["Average delay of all trains at arrival"].convert_dtypes()

Clean Number of trains delayed > 15 min

In [76]:
csv = csv.dropna(subset="Number of trains delayed > 15min")
csv["Number of trains delayed > 15min"] = csv["Number of trains delayed > 15min"].convert_dtypes()

Clean Average delay of trains > 15min (if competing with flights)

In [77]:
csv = csv.dropna(subset="Average delay of trains > 15min (if competing with flights)")
csv["Average delay of trains > 15min (if competing with flights)"] = csv["Average delay of trains > 15min (if competing with flights)"].convert_dtypes()

Clean Number of trains delayed > 30min

In [78]:
csv = csv.dropna(subset="Number of trains delayed > 30min")
csv["Number of trains delayed > 30min"] = csv["Number of trains delayed > 30min"].convert_dtypes()

Clean Number of trains delayed > 60min

In [79]:
csv = csv.dropna(subset="Number of trains delayed > 60min")
csv["Number of trains delayed > 60min"] = csv["Number of trains delayed > 60min"].convert_dtypes()

Clean Pct delay due to external causes

In [80]:
csv = csv.dropna(subset="Pct delay due to external causes")
csv["Pct delay due to external causes"] = csv["Pct delay due to external causes"].convert_dtypes()

Clean Pct delay due to infrastructure

In [81]:
csv.dropna(subset="Pct delay due to infrastructure")
csv["Pct delay due to infrastructure"] = csv["Pct delay due to infrastructure"].convert_dtypes()

CleanPct delay due to traffic management

In [82]:
csv.dropna(subset="Pct delay due to traffic management")
csv["Pct delay due to traffic management"] = csv["Pct delay due to traffic management"].convert_dtypes()

Clean Pct delay due to rolling stock

In [83]:
csv.dropna(subset="Pct delay due to rolling stock")
csv["Pct delay due to rolling stock"] = csv["Pct delay due to rolling stock"].convert_dtypes()

Clean Pct delay due to station management and equipment reuse

In [84]:
csv = csv.dropna(subset="Pct delay due to station management and equipment reuse")
csv["Pct delay due to station management and equipment reuse"] = csv["Pct delay due to station management and equipment reuse"].convert_dtypes()

Clean Pct delay due to passenger handling (crowding, disabled persons, connections)

In [85]:
csv = csv.dropna(subset="Pct delay due to passenger handling (crowding, disabled persons, connections)")
csv["Pct delay due to passenger handling (crowding, disabled persons, connections)"] = csv["Pct delay due to passenger handling (crowding, disabled persons, connections)"].convert_dtypes()
csv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3480 entries, 0 to 10837
Data columns (total 23 columns):
 #   Column                                                                         Non-Null Count  Dtype         
---  ------                                                                         --------------  -----         
 0   Date                                                                           3237 non-null   datetime64[ns]
 1   Service                                                                        3280 non-null   string        
 2   Departure station                                                              3301 non-null   string        
 3   Arrival station                                                                3298 non-null   string        
 4   Average journey time                                                           3223 non-null   boolean       
 5   Number of scheduled trains                                                     3234 non