In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

# model building
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Masking
from tensorflow.keras.metrics import RootMeanSquaredError
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Flatten
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dropout
from keras import layers
from tensorflow.keras import callbacks

2022-03-03 17:06:26.025249: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-03 17:06:26.025309: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Read in data

In [2]:
!pwd

/home/jakob/code/hmichinaka/berlin-bike-theft-forecasting/notebooks


In [4]:
df_feat = pd.read_csv("../raw_data/daily_features_theft.csv")

In [7]:
df_feat.columns

Index(['Unnamed: 0', 'date', 'min_temp', 'max_temp', 'the_temp',
       'weather_state_name', 'weather_state_abbr', 'wind_speed',
       'wind_direction', 'air_pressure', 'humidity', 'visibility',
       'daylight_hours', 'isholidays', 'weekday', 'weeknum'],
      dtype='object')

In [8]:
df_feat.drop(columns = 'Unnamed: 0', inplace=True )

In [9]:
df_feat.head()

Unnamed: 0,date,min_temp,max_temp,the_temp,weather_state_name,weather_state_abbr,wind_speed,wind_direction,air_pressure,humidity,visibility,daylight_hours,isholidays,weekday,weeknum
0,2021-01-01,0.005,2.395,2.07,Sleet,sl,3.020726,227.61664,1007.0,93,6.03165,07:50:24,1,4,53
1,2021-01-02,-1.135,3.045,2.465,Heavy Cloud,hc,2.666541,193.699868,1015.0,89,12.863316,07:51:41,0,5,53
2,2021-01-03,-1.315,1.955,0.985,Snow,sn,9.161659,55.677273,1015.0,96,2.144352,07:53:05,0,6,53
3,2021-01-04,0.03,1.78,1.355,Sleet,sl,6.271227,54.331796,1017.0,96,2.927901,07:54:33,0,0,1
4,2021-01-05,-0.03,2.73,1.27,Light Rain,lr,6.009528,26.500356,1017.0,94,7.021805,07:56:08,0,1,1


In [48]:
df_theft = pd.read_csv("../raw_data/bike_theft_cleaned.csv")

In [49]:
df_theft.head()

Unnamed: 0.1,Unnamed: 0,date_reported,date_theft_start,hour_theft_start,date_theft_end,hour_theft_end,LOR,estimated_value,attempt,type_bike,theft_type,theft_type_detail,datetime_theft_start,datetime_theft_end,Timedelta_theft_end_start,datetime_theft_mean,BZR,PGR,Bezirk
0,0,2022-02-28,2022-02-26,14:00:00,2022-02-26,14:00:00,7100103,580.0,No,bike,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,2022-02-26 14:00:00,2022-02-26 14:00:00,0.0,2022-02-26 14:00:00,71001,710,7
1,1,2022-02-28,2022-02-28,16:00:00,2022-02-28,16:00:00,2100106,0.0,No,man's bike,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,2022-02-28 16:00:00,2022-02-28 16:00:00,0.0,2022-02-28 16:00:00,21001,210,2
2,2,2022-02-28,2022-02-28,20:00:00,2022-02-28,20:00:00,8300935,0.0,No,other bike,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern,2022-02-28 20:00:00,2022-02-28 20:00:00,0.0,2022-02-28 20:00:00,83009,830,8
3,3,2022-02-28,2022-02-28,8:00:00,2022-02-28,8:00:00,4400727,1000.0,No,other bike,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,2022-02-28 08:00:00,2022-02-28 08:00:00,0.0,2022-02-28 08:00:00,44007,440,4
4,4,2022-02-28,2022-02-25,5:00:00,2022-02-25,17:00:00,9100305,499.0,No,man's bike,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern,2022-02-25 05:00:00,2022-02-25 17:00:00,12.0,2022-02-25 11:00:00,91003,910,9


# Combine datasets

## Group theft data by Bezirk and date

In [50]:
# Group dataframe by date and LOR
df_theft = df_theft.pivot_table(index = "date_theft_start", columns = "Bezirk", values = "type_bike", aggfunc= "count")

df_theft.fillna(value = 0, inplace=True)

In [51]:
# create "total" column which sums up data for each day
df_theft["total"] = df_theft.sum(axis=1)


df_theft.shape

(424, 13)

In [52]:
df_feat.columns

Index(['date', 'min_temp', 'max_temp', 'the_temp', 'weather_state_name',
       'weather_state_abbr', 'wind_speed', 'wind_direction', 'air_pressure',
       'humidity', 'visibility', 'daylight_hours', 'isholidays', 'weekday',
       'weeknum'],
      dtype='object')

In [53]:
df_theft.columns

Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'total'], dtype='object', name='Bezirk')

## Merge datasets

We merge the "total" column from the theft dataset with the features from the features dataset

In [54]:
# First we have to reset the index in the df_theft to create a column to merge on 

df_theft.reset_index( inplace=True)

In [55]:
# as the last step we rename the date column to match the name in df_feat
df_theft = df_theft.rename(columns={"date_theft_start": "date"})

In [56]:
df_feat.columns

Index(['date', 'min_temp', 'max_temp', 'the_temp', 'weather_state_name',
       'weather_state_abbr', 'wind_speed', 'wind_direction', 'air_pressure',
       'humidity', 'visibility', 'daylight_hours', 'isholidays', 'weekday',
       'weeknum'],
      dtype='object')

In [57]:
df_theft.columns

Index(['date', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'total'], dtype='object', name='Bezirk')

In [58]:
# combine the datasets
df_combined = pd.merge(df_feat, df_theft, on = "date")

In [59]:
df_combined.head()

Unnamed: 0,date,min_temp,max_temp,the_temp,weather_state_name,weather_state_abbr,wind_speed,wind_direction,air_pressure,humidity,...,4,5,6,7,8,9,10,11,12,total
0,2021-01-01,0.005,2.395,2.07,Sleet,sl,3.020726,227.61664,1007.0,93,...,4.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,16.0
1,2021-01-02,-1.135,3.045,2.465,Heavy Cloud,hc,2.666541,193.699868,1015.0,89,...,3.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,16.0
2,2021-01-03,-1.315,1.955,0.985,Snow,sn,9.161659,55.677273,1015.0,96,...,2.0,0.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,17.0
3,2021-01-04,0.03,1.78,1.355,Sleet,sl,6.271227,54.331796,1017.0,96,...,4.0,0.0,2.0,4.0,3.0,1.0,0.0,1.0,2.0,33.0
4,2021-01-05,-0.03,2.73,1.27,Light Rain,lr,6.009528,26.500356,1017.0,94,...,5.0,2.0,1.0,5.0,0.0,4.0,0.0,0.0,1.0,29.0


## Select relevant features

In [60]:
df_combined.columns

Index([              'date',           'min_temp',           'max_temp',
                 'the_temp', 'weather_state_name', 'weather_state_abbr',
               'wind_speed',     'wind_direction',       'air_pressure',
                 'humidity',         'visibility',     'daylight_hours',
               'isholidays',            'weekday',            'weeknum',
                          1,                    2,                    3,
                          4,                    5,                    6,
                          7,                    8,                    9,
                         10,                   11,                   12,
                    'total'],
      dtype='object')

- For now we will drop the Bezirke columns, and the weather_state_name as well as weather_state_abbr
- We will also drop min_temp and max_temp and just keep the_temp 
- We will further drop wind_direction

In [68]:
cols_select = ["date", "the_temp", "wind_speed", "air_pressure", "humidity", "visibility", 
              "daylight_hours", "isholidays", "weekday", "weeknum", 
              "total"]

In [69]:
df_combined = df_combined[cols_select]

KeyError: "['date'] not in index"

In [64]:
df_combined.head()

Unnamed: 0,date,the_temp,wind_speed,air_pressure,humidity,visibility,daylight_hours,daylight_hours.1,isholidays,weekday,weeknum,total
0,2021-01-01,2.07,3.020726,1007.0,93,6.03165,07:50:24,07:50:24,1,4,53,16.0
1,2021-01-02,2.465,2.666541,1015.0,89,12.863316,07:51:41,07:51:41,0,5,53,16.0
2,2021-01-03,0.985,9.161659,1015.0,96,2.144352,07:53:05,07:53:05,0,6,53,17.0
3,2021-01-04,1.355,6.271227,1017.0,96,2.927901,07:54:33,07:54:33,0,0,1,33.0
4,2021-01-05,1.27,6.009528,1017.0,94,7.021805,07:56:08,07:56:08,0,1,1,29.0


In [66]:
df_combined = df_combined.set_index("date")

# Train-test-split

In [67]:
len(df_combined)

424

In [None]:
df_train = df_theft_model["total"][:len_]

In [None]:
df_test = df_theft_model["total"][len_:]

In [None]:
def get_X_y(window_size, future_horizon, dataset):

  X = []
  y = []

  for i in range(0, dataset.shape[0] - window_size - future_horizon):
    X.append(dataset[i: i + window_size])
    y.append(dataset[i + window_size: i + window_size + future_horizon])

  return np.array(X), np.array(y)

In [None]:
window_size = 31
future_horizon = 1

X_train, y_train = get_X_y(window_size,future_horizon, df_train)
X_test, y_test = get_X_y(window_size, future_horizon, df_test)