# Import Data

Import the [Helsinki City Bikes dataset from Kaggle](https://www.kaggle.com/datasets/geometrein/helsinki-city-bikes) and the supporting libraries.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load the bike trip records from the local CSV export (path is relative to the working directory)
DATASET_CSV = 'KaggleHelsinkiBikesDatabase.csv'
data = pd.read_csv(DATASET_CSV)

# Clean Data

- Analyze the dataset for cleaning needs.
- Drop any rows containing null values.
- Update data types as needed.
- Split date values into date components stored in separate columns in the dataset.

In [2]:
# Inspect column dtypes and memory footprint, then count the missing values in each column
data.info()
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12157458 entries, 0 to 12157457
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   departure               object 
 1   return                  object 
 2   departure_id            object 
 3   departure_name          object 
 4   return_id               object 
 5   return_name             object 
 6   distance (m)            float64
 7   duration (sec.)         float64
 8   avg_speed (km/h)        float64
 9   departure_latitude      float64
 10  departure_longitude     float64
 11  return_latitude         float64
 12  return_longitude        float64
 13  Air temperature (degC)  float64
dtypes: float64(8), object(6)
memory usage: 1.3+ GB


departure                     0
return                        0
departure_id                  0
departure_name                0
return_id                     0
return_name                   0
distance (m)                  0
duration (sec.)               0
avg_speed (km/h)           3550
departure_latitude            0
departure_longitude           0
return_latitude               1
return_longitude              1
Air temperature (degC)    15902
dtype: int64

In [3]:
# Remove every row that is missing a value in one of the listed columns
columns_with_nulls = [
    'avg_speed (km/h)',
    'return_latitude',
    'return_longitude',
    'Air temperature (degC)',
]
data.dropna(subset=columns_with_nulls, inplace=True)

# Re-inspect the frame and recount missing values per column after the drop
data.info()
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 12138008 entries, 0 to 12157457
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   departure               object 
 1   return                  object 
 2   departure_id            object 
 3   departure_name          object 
 4   return_id               object 
 5   return_name             object 
 6   distance (m)            float64
 7   duration (sec.)         float64
 8   avg_speed (km/h)        float64
 9   departure_latitude      float64
 10  departure_longitude     float64
 11  return_latitude         float64
 12  return_longitude        float64
 13  Air temperature (degC)  float64
dtypes: float64(8), object(6)
memory usage: 1.4+ GB


departure                 0
return                    0
departure_id              0
departure_name            0
return_id                 0
return_name               0
distance (m)              0
duration (sec.)           0
avg_speed (km/h)          0
departure_latitude        0
departure_longitude       0
return_latitude           0
return_longitude          0
Air temperature (degC)    0
dtype: int64

In [4]:
# Parse the trip timestamps into datetime values
for timestamp_col in ('departure', 'return'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])

# Store the station identifiers as strings
for station_id_col in ('departure_id', 'return_id'):
    data[station_id_col] = data[station_id_col].astype(str)

data.dtypes

departure                 datetime64[ns]
return                    datetime64[ns]
departure_id                      object
departure_name                    object
return_id                         object
return_name                       object
distance (m)                     float64
duration (sec.)                  float64
avg_speed (km/h)                 float64
departure_latitude               float64
departure_longitude              float64
return_latitude                  float64
return_longitude                 float64
Air temperature (degC)           float64
dtype: object

In [5]:
# Break each timestamp into its individual components, one new column per component.
# Columns are created in the order below: all departure ("dept_") columns first,
# then all return ("ret_") columns.
date_components = ('day', 'month', 'year', 'hour', 'minute', 'second', 'microsecond', 'nanosecond')

for prefix, timestamp_col in (('dept', 'departure'), ('ret', 'return')):
    for component in date_components:
        # e.g. data['dept_day'] = data['departure'].dt.day
        data[f'{prefix}_{component}'] = getattr(data[timestamp_col].dt, component)

# Find Anomalies

After manually analyzing the dataset, performing external research about the dataset, and brainstorming potential target audiences and problem statements/hypotheses, I came up with two anomaly use cases to focus on. The following code checks for the two anomaly use cases in the dataset:

1. Bike ride distance not recorded:
    - In these cases the bike was moved to a different station but the distance was not recorded. There might be a malfunction with recording the distance on the bike that the vendor needs to look into.
    - Set new column as true if (1) the distance = 0 meters and (2) the departure_id and return_id are not the same.
2. Bike quickly returned to the same station:
    - In these cases the bike rider may have changed their mind about renting the bike. If an event like this occurs often at the same station, there may be defective bikes at that station, and the vendor should visit the station to check on the bikes.
    - Set new column as true if (1) the duration is less than 60 seconds and (2) the departure_id and return_id are the same.

Lastly, I combined the anomaly label columns into a single, combined label column. If either anomaly is found, the label should be set to 1.

In [6]:
# Anomaly 1 - distance not recorded: the ride has a distance of 0 meters even though
# the departure and return stations differ.
zero_distance = data['distance (m)'] == 0
different_station = data['departure_id'] != data['return_id']
data['bike_malfunction_recording_distance'] = np.where(zero_distance & different_station, 1, 0)

# Anomaly 2 - bike quickly returned: the ride lasted less than 60 seconds and ended
# at the station where it started.
under_one_minute = data['duration (sec.)'] < 60
same_station = data['departure_id'] == data['return_id']
data['bike_malfunction_no_rides'] = np.where(under_one_minute & same_station, 1, 0)

# Combined label: 1 when either of the two anomaly labels is set, otherwise 0.
either_anomaly = (data['bike_malfunction_recording_distance'] == 1) | (data['bike_malfunction_no_rides'] == 1)
data['combined_anomalies'] = np.where(either_anomaly, 1, 0)

In [7]:
# Row totals: the whole dataset, then the rows flagged by each anomaly label
bike_rides_num_rows = len(data.index)
bike_malfunction_recording_distance_num_rows = int((data['bike_malfunction_recording_distance'] == 1).sum())
bike_malfunction_no_rides_num_rows = int((data['bike_malfunction_no_rides'] == 1).sum())
combined_anomalies_num_rows = int((data['combined_anomalies'] == 1).sum())

# Fraction of all rides that each anomaly label accounts for
recording_distance_share = np.divide(bike_malfunction_recording_distance_num_rows, bike_rides_num_rows)
no_rides_share = np.divide(bike_malfunction_no_rides_num_rows, bike_rides_num_rows)
combined_share = np.divide(combined_anomalies_num_rows, bike_rides_num_rows)

print(f"Total bike rides: {bike_rides_num_rows}")
print(f"Bike rides with no distance recorded: {bike_malfunction_recording_distance_num_rows}; {recording_distance_share} || where (1) the distance = 0 meters and (2) the departure_id and return_id are not the same")
print(f"Bikes quickly returned: {bike_malfunction_no_rides_num_rows}; {no_rides_share} || where (1) the duration is less than 60 seconds and (2) depature_id and return_id are the same")
print(f"Combined anoamlies found: {combined_anomalies_num_rows}; {combined_share}")

Total bike rides: 12138008
Bike rides with no distance recorded: 211897; 0.017457312600222374 || where (1) the distance = 0 meters and (2) the departure_id and return_id are not the same
Bikes quickly returned: 243110; 0.020028821862697735 || where (1) the duration is less than 60 seconds and (2) depature_id and return_id are the same
Combined anoamlies found: 455007; 0.03748613446292011


# Encode Data

Prepare dataset for training and testing.

In [8]:
# Replace station IDs, station names, and the departure/return timestamps with integer codes.
# A single LabelEncoder instance is re-fitted on each column in turn, so every column
# gets its own independent value-to-code mapping.
encoder = LabelEncoder()
columns_to_encode = [
    'departure_id',
    'return_id',
    'departure_name',
    'return_name',
    'departure',
    'return',
]
for column in columns_to_encode:
    data[column] = encoder.fit_transform(data[column])

# One-hot encode whatever categorical columns are still present in the frame
data = pd.get_dummies(data)

# Training and Testing Models

In [9]:
# Create training and validation/testing datasets.
# Features: every column except the three anomaly label columns and the encoded
# 'departure' / 'return' timestamp columns.
bike_rides_data_attributes = data.drop(['bike_malfunction_recording_distance', 'bike_malfunction_no_rides', 'combined_anomalies', 'departure', 'return'], axis = 1)

# Label vectors: the combined label plus one per individual anomaly
bike_rides_anomaly_data_labels = data['combined_anomalies']
bike_rides_anomaly_malfunction_recording_distance_data_labels = data['bike_malfunction_recording_distance']
bike_rides_anomaly_malfunction_no_rides_data_labels = data['bike_malfunction_no_rides']

# 70/30 split for the combined anomaly label
X_train, X_test, y_train, y_test = train_test_split(bike_rides_data_attributes, bike_rides_anomaly_data_labels, test_size = 0.3, random_state = 42)

# 70/30 split for the "distance not recorded" label
X_train_dist, X_test_dist, y_train_dist, y_test_dist = train_test_split(bike_rides_data_attributes, bike_rides_anomaly_malfunction_recording_distance_data_labels, test_size = 0.3, random_state = 42)

# 70/30 split for the "bike quickly returned" label.
# Fix: this split previously reused the distance-recording labels, which made
# y_train_rides / y_test_rides duplicates of y_train_dist / y_test_dist.
X_train_rides, X_test_rides, y_train_rides, y_test_rides = train_test_split(bike_rides_data_attributes, bike_rides_anomaly_malfunction_no_rides_data_labels, test_size = 0.3, random_state = 42)

## Model: Gaussian Naive Bayes

Train and test the Gaussian Naive Bayes model with training and testing datasets.
Splitting the date values into their date components resulted in different accuracy scores.
- Date values left as single value resulted in an accuracy score of 0.9624424981250359
- Date values split into year, month, day, hour, minute, and second (as well as any finer components) resulted in an accuracy score of 0.9624441458415891

In [10]:
# Fit Gaussian Naive Bayes on the training split, then predict the combined anomaly label
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_gnb = gnb.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
print(accuracy_score(y_test, pred_gnb))

0.9624441458415891


## Model: Decision Tree Classifier

Train and test the Decision Tree Classifier model with training and testing datasets.
Splitting the date values into their date components resulted in different accuracy scores.
- Date values left as single value resulted in an accuracy score of 0.9994285169754625
- Date values split into year, month, day, hour, minute, and second resulted in an accuracy score of 0.9993843032479514
- Date values split into year, month, day, hour, minute, second, microsecond resulted in an accuracy score of 0.9993966611221005
- Date values split into year, month, day, hour, minute, second, microsecond, and nanosecond resulted in an accuracy score of 0.9993972103609515

In [11]:
# Fit a decision tree on the training split, then predict the combined anomaly label.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features randomly at each split; without a seed the fitted tree, and
# therefore the reported accuracy, can differ from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
print(accuracy_score(y_test, pred_clf))

0.9993972103609515


In [12]:
# Optional visualization of the fitted decision tree (clf); currently disabled (commented out).
# NOTE(review): each plt.figure() call below creates a separate figure, so the width and
# the height would be applied to two different figures. A single
# plt.figure(figsize=(7, 150)) is probably what was intended - confirm before re-enabling.
# from sklearn import tree

# plt.figure().set_figwidth(7)
# plt.figure().set_figheight(150)
# tree.plot_tree(clf, rounded=True, fontsize=14)
# plt.show()