In [8]:
# URL Download
import csv
import io
import urllib.request
import requests

import pandas as pd
import numpy as np

In [1]:
# Read in data from the Berlin Polizei URL
def load_data():
    url = "https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv"
    download = requests.get(url)
    decoded_content = download.content.decode('ISO-8859-1')
    file = decoded_content.splitlines()

    cr = csv.DictReader(file, delimiter=',')
    my_list = list(cr)
    df  = pd.DataFrame(my_list)
    return df

In [2]:
###################################
###  Clean data ######

#dict to translate from German to English
eng_col_names = {
    "ANGELEGT_AM": "date_reported",
    "TATZEIT_ANFANG_DATUM": "date_theft_start",
    "TATZEIT_ANFANG_STUNDE": "hour_theft_start",
    "TATZEIT_ENDE_DATUM": "date_theft_end",
    "TATZEIT_ENDE_STUNDE": "hour_theft_end",
    "LOR": "LOR",
    "SCHADENSHOEHE": "estimated_value",
    "VERSUCH": "attempt",
    "ART_DES_FAHRRADS": "type_bike", 
    "DELIKT": "theft_type",
    "ERFASSUNGSGRUND": "theft_type_detail"    
}

# define function for renaming the categories
def rename_type_bike(x): 
    if x == "Herrenfahrrad": 
        return "man's bike"
    if x == "Damenfahrrad":
        return "woman's bike"
    if x == "Fahrrad":
        return "bike"
    if x == "Kinderfahrrad":
        return "child's bike"
    else: 
        return "other bike"

# dictionary for "attempt"
attempt_dict = {
    "Ja": "Yes", 
    "Nein": "No", 
    "Unbekannt": "Unknown"
}

In [3]:
# Concatenates translation of column and category names, 
#  conversion of dtypes, drop duplicates and create 
#  higher regional levels from LOR
def clean_theft_data(d):
    """returns a clean dataframe"""
    #translate columns to English
    d.rename(columns= eng_col_names, inplace=True)
    
    #translate bike type to English
    d["type_bike"] = d["type_bike"].apply(rename_type_bike)
    
    #translate attempt type to English
    d["attempt"] = d["attempt"].map(attempt_dict)
    
    # convert the date columns to format='%d.%m.%Y
    d["date_reported"] = pd.to_datetime(d["date_reported"], format='%d.%m.%Y')
    d["date_theft_start"] = pd.to_datetime(d["date_theft_start"], format='%d.%m.%Y')
    d["date_theft_end"] = pd.to_datetime(d["date_theft_end"], format='%d.%m.%Y')
    
    # convert the time columns to int
    d["hour_theft_start"] = d["hour_theft_start"].astype(int)
    d["hour_theft_end"] = d["hour_theft_end"].astype(int)
    
    #convert value column to float
    d["estimated_value"] = d["estimated_value"].astype(float)
    
    #drop duplicates
    d = d.drop_duplicates()
    
    # BZR (first six numbers)
    d["BZR"] = d["LOR"].str[:6]
    
    # PGR (first four numbers)
    d["PGR"] = d["LOR"].str[:4]
    
    # Bezirk (first four numbers)
    d["Bezirk"] = d["LOR"].str[:2]
    
    return d

In [4]:
# Group by bezirk and sum up
def pivot_theft_data(d):
    d = d.pivot_table(index = "date_theft_start", columns = "Bezirk", values = "type_bike", aggfunc= "count")
    d.fillna(value = 0, inplace=True)
    return d

In [5]:
# Calculate percentage theft by Bezirk
def perc_split_bezirk(d):
    """returns df showing % split of bikes stolen over the last 2 weeks per Bezirk in Berlin"""
    d = d[-15:]
    d.loc['perc_split']= d.sum()
    d = d.div(d.sum(axis=1), axis=0)
    d = d.iloc[-1]
    return pd.DataFrame(d)

In [35]:
def calculate_rolling_average(df, window_size):
    fill_value = df["total"][-window_size:].mean()
    df["total_moving_average"] = df["total"].rolling(window = window_size, center = True).mean().fillna(fill_value)

In [114]:
# Create the dataframe for the modelling
def create_modelling_dataframe():
    df = load_data()
    df = clean_theft_data(df)
    df = pivot_theft_data(df)

    # add "total column"
    df["total"] = df.sum(axis = 1)

    # cut-off the last three days
    df.drop(df.tail(3).index,inplace=True)

    # calculate rolling average
    calculate_rolling_average(df, window_size = 3)
    
    # select relevant columns for modelling
    cols_list =  ["total", "total_moving_average"]
    df = df[cols_list]
    
    return df

In [102]:
df = load_data()

In [103]:
df = clean_theft_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [104]:
df= pivot_theft_data(df)

In [105]:
df["total"] = df.sum(axis = 1)
df.head()

Bezirk,01,02,03,04,05,06,07,08,09,10,11,12,total
date_theft_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-01,1.0,4.0,1.0,4.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,16.0
2021-01-02,0.0,0.0,5.0,3.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,16.0
2021-01-03,1.0,1.0,3.0,2.0,0.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,17.0
2021-01-04,6.0,7.0,3.0,4.0,0.0,2.0,4.0,3.0,1.0,0.0,1.0,2.0,33.0
2021-01-05,7.0,2.0,2.0,5.0,2.0,1.0,5.0,0.0,4.0,0.0,0.0,1.0,29.0


In [106]:
df.shape

(430, 13)

In [107]:
df.drop(df.tail(3).index,inplace=True)

In [108]:
df.shape

(427, 13)

In [109]:
calculate_rolling_average(df, window_size=3)

In [110]:
df.head()

Bezirk,01,02,03,04,05,06,07,08,09,10,11,12,total,total_moving_average
date_theft_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-01-01,1.0,4.0,1.0,4.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,16.0,33.0
2021-01-02,0.0,0.0,5.0,3.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,16.0,16.333333
2021-01-03,1.0,1.0,3.0,2.0,0.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,17.0,22.0
2021-01-04,6.0,7.0,3.0,4.0,0.0,2.0,4.0,3.0,1.0,0.0,1.0,2.0,33.0,26.333333
2021-01-05,7.0,2.0,2.0,5.0,2.0,1.0,5.0,0.0,4.0,0.0,0.0,1.0,29.0,33.0


In [111]:
cols_list = ["total", "total_moving_average"]

In [112]:
df = df[cols_list]

In [113]:
df.head()

Bezirk,total,total_moving_average
date_theft_start,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,16.0,33.0
2021-01-02,16.0,16.333333
2021-01-03,17.0,22.0
2021-01-04,33.0,26.333333
2021-01-05,29.0,33.0


In [115]:
test = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [117]:
test.head()

Bezirk,total,total_moving_average
date_theft_start,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,16.0,33.0
2021-01-02,16.0,16.333333
2021-01-03,17.0,22.0
2021-01-04,33.0,26.333333
2021-01-05,29.0,33.0


In [120]:
!pwd

/home/jakob/code/hmichinaka/berlin-bike-theft-forecasting


In [122]:
!ls

berlin-bike-theft-forecasting  MANIFEST.in  README.md	      setup.py
bike_theft_total_daily.csv     notebooks    requirements.txt  tests
Makefile		       raw_data     scripts


SyntaxError: invalid syntax (4249665749.py, line 1)

In [125]:
df = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [127]:
model = create_opt_model()

NameError: name 'Sequential' is not defined

In [130]:
def get_X_y(dataset, window_size= 31, future_horizon =1):

  X = []
  y = []

  for i in range(0, dataset.shape[0] - window_size - future_horizon):
    X.append(dataset[i: i + window_size])
    y.append(dataset[i + window_size: i + window_size + future_horizon])

  return np.array(X), np.array(y)

In [133]:
import numpy as np
from keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential, callbacks

def create_opt_model(): 
    """
    Creates the finetuned model selected in B_Theft_Modelling
    Outputs models, that needs to be compiled and fit"""
    model = Sequential()
    # first LSTM layer
    model.add(LSTM(units = 70, activation = "tanh", return_sequences = True))
    model.add(Dropout(0.2))
    # second LSTM layer
    model.add(LSTM(units= 30, activation= "tanh", return_sequences= False))
    model.add(Dropout(0.2))

    # output layer to predict one value
    model.add(Dense(1, activation= "linear"))
    return model

2022-03-08 11:23:41.262559: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-08 11:23:41.262641: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
window_size = 31
future_horizon = 1

X_train, y_train = get_X_y(window_size,future_horizon, df_train)
X_test, y_test = get_X_y(window_size, future_horizon, df_test)

X_test = np.expand_dims(X_test, 2)
X_train = np.expand_dims(X_train, 2)

In [129]:
df = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [134]:
model = create_opt_model()

2022-03-08 11:23:44.506656: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-08 11:23:44.506719: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-08 11:23:44.506796: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-C0GM00LC): /proc/driver/nvidia/version does not exist
2022-03-08 11:23:44.507791: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [135]:
X, y = get_X_y(df)

In [137]:
es = callbacks.EarlyStopping(patience = 5, restore_best_weights=True)

In [136]:
X.shape, y.shape

((395, 31, 2), (395, 1, 2))

In [None]:
model.fit()