# CanYouCatchIt?
A web application that estimates the probability that your bus, tram or metro will be late. 💻🤖🎲🚌 🚎🚇🔮

_Built with the STIB API (available [here](https://opendata.stib-mivb.be/store/))_

# Notes: Making some models 💻🤖 🚌🚎🚇
We are here to explore the data

## Load the data

Write a function that loads the CSV files

In [None]:
# import
import glob
import pandas as pd
import seaborn as sns
import os

# Set the path to the directory holding CSV files
DELAY_PATH = '/home/haeresis/Documents/Github/CanYouCatchIt/machine_learning/data'

def load_delay_data(delay_path=DELAY_PATH):
    """
    Load every ``delay*.csv`` file of a directory into one pandas DataFrame.

    Parameters
    ----------
    delay_path : str
        Directory searched for files whose name matches ``delay*.csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files, stacked in file-name order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``delay_path`` contains no file matching ``delay*.csv``.
    """
    # Escape the directory so that glob characters in its name are taken
    # literally; only the file-name part is a pattern.
    pattern = os.path.join(glob.escape(delay_path), 'delay*.csv')
    # glob returns files in arbitrary order: sort them so the row order (and
    # everything derived from it, e.g. seeded splits) is reproducible.
    csv_files = sorted(glob.glob(pattern))
    if not csv_files:
        raise ValueError(
            "No file matching 'delay*.csv' found in {!r}".format(delay_path)
        )
    return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

## Take a Quick Look at the Data Structure
### Load the data

In [None]:
# Read all the delay CSV files into a single DataFrame
delay = load_delay_data()
# Preview the first five rows
delay.head(n=5)

### Display Information

In [None]:
# Preview the last five rows
delay.tail(n=5)

Display the number of non-null values and the type of each column

In [None]:
# Print a summary of the DataFrame: its columns, their non-null counts and their types
delay.info()

Count the number of each distinct delay value

In [None]:
# Count how many times each distinct value of the "delay" column occurs
delay["delay"].value_counts()

In [None]:
# Count how many times each distinct value of the "hour" column occurs
delay["hour"].value_counts()

In [None]:
# Display descriptive statistics of the DataFrame columns
delay.describe()

### Plot the data

In [None]:
%matplotlib inline
# only in a Jupyter notebook column="delay"

# plot the data
import matplotlib.pyplot as plt
delay.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Boolean masks selecting line 39, stop "0089" and day 3
is_line_39 = delay['line'].eq(39)
is_stop_0089 = delay['stop'].eq('0089')
is_day_3 = delay['day'].eq(3)
# Keep the rows satisfying the three conditions at once
delay_line_39 = delay.loc[is_line_39 & is_stop_0089 & is_day_3]
# Box plot of the delays of that selection
sns.boxplot(x=delay_line_39['delay'])

## Create A Test Set

Drop unlabelled rows

In [None]:
# Remove every row holding a missing value, then renumber the rows from 0
delay = delay.dropna().reset_index(drop=True)

Drop every row whose line number is not 39, because we are only studying one line for now

In [None]:
# Get the index labels of the rows whose line is not 39
index_to_remove = delay[delay['line'] != 39].index
# Delete these rows from the DataFrame
delay.drop(index_to_remove, inplace=True)

# Columns left with a single distinct value carry no information: drop them.
# The drop must be done in place (or reassigned), otherwise the result is
# discarded and the columns stay in `delay`.
# NOTE: a column used further down that happens to be constant in the loaded
# data would be removed here too.
nunique = delay.nunique()
cols_to_drop = nunique[nunique == 1].index
delay.drop(cols_to_drop, axis=1, inplace=True)

# Drop the 'trip' column as well; errors='ignore' because it may already have
# been removed by the previous step if it was constant.
delay.drop(['trip'], axis=1, inplace=True, errors='ignore')

# Reset the labels
delay.reset_index(drop=True, inplace=True)
delay.head()

### Data stratification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify the data on the hour of the day.
# This makes sure each hour is represented in the train and test sets in the
# same proportion as in the overall dataset.
# This stratification is not necessary if you have enough data.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(delay, delay["hour"]):
    # split() yields positional row numbers, so select with .iloc: .loc is
    # label-based and only gives the same rows while the index labels are
    # exactly 0..n-1.
    strat_train_set = delay.iloc[train_index]
    strat_test_set = delay.iloc[test_index]

In [None]:
# Share of the whole dataset taken by each hour
delay["hour"].value_counts().div(len(delay))

In [None]:
# Share of the stratified test set taken by each hour
strat_test_set["hour"].value_counts().div(len(strat_test_set))

#### Compare the error between a random test set and the stratified one

In [None]:
def stop_cat_proportions(data, column="hour"):
    """
    Return the share of the rows of ``data`` taken by each value of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``column``.
    column : str, default "hour"
        Name of the column whose distinct values are counted.

    Returns
    -------
    pandas.Series
        Indexed by the distinct values of ``column``; each entry is the number
        of rows holding that value divided by the total number of rows.
    """
    return data[column].value_counts() / len(data)

# Purely random 80/20 split, used as the baseline of the comparison
train_set, test_set = train_test_split(delay, test_size=0.2, random_state=42)

# Hour proportions in the full data, the stratified test set and the random one
proportions = {
    "Overall": stop_cat_proportions(delay),
    "Stratified": stop_cat_proportions(strat_test_set),
    "Random": stop_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions).sort_index()

# Relative deviation (in percent) of each test set from the overall proportions
overall = compare_props["Overall"]
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / overall - 100
compare_props["Rand. %error"] = 100 * compare_props["Random"] / overall - 100
compare_props

## Discover and Visualize the Data to Gain Insights

In [None]:
# Work on a deep copy so that exploring the data cannot alter the training set
delay = strat_train_set.copy(deep=True)

### Visualizing Time Data

In [None]:
# Scatter plot of every measured delay against the hour of the day
delay.plot.scatter(x="hour", y="delay", alpha=0.1, title="Delay per hour")

In [None]:
# Scatter plot of the delays measured at stop 0089 against the hour of the day
delay_stop_0089 = delay.loc[delay["stop"].eq("0089")]
delay_stop_0089.plot.scatter(x="hour", y="delay", alpha=0.5, title="Delay per hour on stop n°0089")

In [None]:
# Scatter plot of the delays measured at stop 0089 on day 3 against the hour of the day
is_stop_0089 = delay["stop"].eq("0089")
is_day_3 = delay["day"].eq(3)
delay_0089_2 = delay.loc[is_stop_0089 & is_day_3]
delay_0089_2.plot.scatter(x="hour", y="delay", alpha=0.5, title="Delay per hour on stop n°0089 a thursday")

In [None]:
# Display the mean delay for each hour of the day.
# Only the "delay" column is averaged: averaging every column would raise a
# TypeError on non-numeric columns with pandas >= 2.0.
delay.groupby("hour")[["delay"]].mean().plot(y="delay", kind="bar", title="Mean delay per hour")

In [None]:
# Display the mean delay for each hour of the day at stop 0089.
# Only the "delay" column is averaged: averaging every column would raise a
# TypeError on non-numeric columns with pandas >= 2.0.
delay_stop_0089.groupby("hour")[["delay"]].mean().plot(y="delay", kind="bar", title="Mean delay per hour on stop n°0089")

In [None]:
# Display the mean delay for each hour of the day at stop 0089 on day 3.
# Only the "delay" column is averaged: averaging every column would raise a
# TypeError on non-numeric columns with pandas >= 2.0.
delay_0089_2.groupby("hour")[["delay"]].mean().plot(y="delay", kind="bar", title="Mean delay per hour on stop n°0089 the thursday")

In [None]:
# Display the mean temperature for each hour of the day.
# Only the "temp" column is averaged: averaging every column would raise a
# TypeError on non-numeric columns with pandas >= 2.0.
delay.groupby("hour")[["temp"]].mean().plot(y="temp", kind="bar", title="Mean temperature per hour")