In [24]:
# URL Download
import csv
import io
import urllib.request
import requests

import pandas as pd
import numpy as np
import joblib

In [2]:
# Read in data from the Berlin Polizei URL
def load_data():
    url = "https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv"
    download = requests.get(url)
    decoded_content = download.content.decode('ISO-8859-1')
    file = decoded_content.splitlines()

    cr = csv.DictReader(file, delimiter=',')
    my_list = list(cr)
    df  = pd.DataFrame(my_list)
    return df

In [3]:
###################################
###  Clean data ######

#dict to translate from German to English
eng_col_names = {
    "ANGELEGT_AM": "date_reported",
    "TATZEIT_ANFANG_DATUM": "date_theft_start",
    "TATZEIT_ANFANG_STUNDE": "hour_theft_start",
    "TATZEIT_ENDE_DATUM": "date_theft_end",
    "TATZEIT_ENDE_STUNDE": "hour_theft_end",
    "LOR": "LOR",
    "SCHADENSHOEHE": "estimated_value",
    "VERSUCH": "attempt",
    "ART_DES_FAHRRADS": "type_bike", 
    "DELIKT": "theft_type",
    "ERFASSUNGSGRUND": "theft_type_detail"    
}

# define function for renaming the categories
def rename_type_bike(x): 
    if x == "Herrenfahrrad": 
        return "man's bike"
    if x == "Damenfahrrad":
        return "woman's bike"
    if x == "Fahrrad":
        return "bike"
    if x == "Kinderfahrrad":
        return "child's bike"
    else: 
        return "other bike"

# dictionary for "attempt"
attempt_dict = {
    "Ja": "Yes", 
    "Nein": "No", 
    "Unbekannt": "Unknown"
}

In [4]:
# Concatenates translation of column and category names, 
#  conversion of dtypes, drop duplicates and create 
#  higher regional levels from LOR
def clean_theft_data(d):
    """returns a clean dataframe"""
    #translate columns to English
    d.rename(columns= eng_col_names, inplace=True)
    
    #translate bike type to English
    d["type_bike"] = d["type_bike"].apply(rename_type_bike)
    
    #translate attempt type to English
    d["attempt"] = d["attempt"].map(attempt_dict)
    
    # convert the date columns to format='%d.%m.%Y
    d["date_reported"] = pd.to_datetime(d["date_reported"], format='%d.%m.%Y')
    d["date_theft_start"] = pd.to_datetime(d["date_theft_start"], format='%d.%m.%Y')
    d["date_theft_end"] = pd.to_datetime(d["date_theft_end"], format='%d.%m.%Y')
    
    # convert the time columns to int
    d["hour_theft_start"] = d["hour_theft_start"].astype(int)
    d["hour_theft_end"] = d["hour_theft_end"].astype(int)
    
    #convert value column to float
    d["estimated_value"] = d["estimated_value"].astype(float)
    
    #drop duplicates
    d = d.drop_duplicates()
    
    # BZR (first six numbers)
    d["BZR"] = d["LOR"].str[:6]
    
    # PGR (first four numbers)
    d["PGR"] = d["LOR"].str[:4]
    
    # Bezirk (first four numbers)
    d["Bezirk"] = d["LOR"].str[:2]
    
    return d

In [5]:
# Group by bezirk and date reported sum up
def pivot_theft_data(d):
    d = d.pivot_table(index = "date_reported", columns = "Bezirk", values = "type_bike", aggfunc= "count")
    d.fillna(value = 0, inplace=True)
    return d

In [6]:
# Calculate percentage theft by Bezirk
def perc_split_bezirk(d):
    """returns df showing % split of bikes stolen over the last 2 weeks per Bezirk in Berlin"""
    d = d[-15:]
    d.loc['perc_split']= d.sum()
    d = d.div(d.sum(axis=1), axis=0)
    d = d.iloc[-1]
    return pd.DataFrame(d)

In [7]:
def calculate_rolling_average(df, window_size):
    fill_value = df["total"][-window_size:].mean()
    df["total_moving_average"] = df["total"].rolling(window = window_size, center = False).mean().fillna(fill_value)

In [8]:
# Create the dataframe for the modelling
def create_modelling_dataframe():
    df = load_data()
    df = clean_theft_data(df)
    df = pivot_theft_data(df)

    # add "total column"
    df["total"] = df.sum(axis = 1)

    # calculate rolling average
    calculate_rolling_average(df, window_size = 3)
    
    # select relevant columns for modelling
    cols_list =  ["total", "total_moving_average"]
    df = df[cols_list]
    
    return df

In [9]:
def bikes_stolen_365():
    """returns total bikes stolen in the last 365 days in Berlin"""
    df = load_data()
    df = clean_theft_data(df)
    df = pivot_theft_data(df)
    df['Total'] = df.sum(axis=1)
    df = pd.DataFrame(df["Total"])
    df =df[-365:]
    total_stolen_365=df.sum().values[0]
    return int(total_stolen_365)

In [10]:
def theft_frequency():
    """returns frequency (in minutes) of bikes stolen in Berlin in the last 365 days"""
    minutes_day=1440
    minutes_year=1440*365
    return round(minutes_year/bikes_stolen_365())

In [11]:
def get_X_y(dataset, window_size= 31, future_horizon =1):

  X = []
  y = []

  for i in range(0, dataset.shape[0] - window_size - future_horizon):
    X.append(dataset[i: i + window_size])
    y.append(dataset["total"][i + window_size: i + window_size + future_horizon])

  return np.array(X), np.array(y)

In [12]:
import numpy as np
from keras.layers import Dense, LSTM, Dropout
from tensorflow.keras import Sequential, callbacks

def create_opt_model(): 
    """
    Creates the finetuned model selected in B_Theft_Modelling
    Outputs models, that needs to be compiled and fit"""
    model = Sequential()
    # first LSTM layer
    model.add(LSTM(units = 70, activation = "tanh", return_sequences = True))
    model.add(Dropout(0.2))
    # second LSTM layer
    model.add(LSTM(units= 30, activation= "tanh", return_sequences= False))
    model.add(Dropout(0.2))

    # output layer to predict one value
    model.add(Dense(1, activation= "linear"))
    return model

2022-03-10 15:16:22.600577: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-10 15:16:22.600614: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [13]:
def get_pred_array():
    # read in dataset from URL
    df = load_data()
    # clean dataset
    clean_theft_data(df)
    # group by Bezirk and date reported
    pivot_theft_data(df)
    # add "total column"
    df["total"] = df.sum(axis = 1)
    # calculate rolling average
    calculate_rolling_average(df, window_size = 3)
    # drop not-needed columns
    cols_list =  ["total", "total_moving_average"]
    df = df[cols_list]
    # only keep last 31 days

# Start work

In [16]:
df = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [18]:
df

Bezirk,total,total_moving_average
date_reported,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,4.0,50.000000
2021-01-02,7.0,50.000000
2021-01-03,3.0,4.666667
2021-01-04,21.0,10.333333
2021-01-05,31.0,18.333333
...,...,...
2022-03-05,36.0,42.000000
2022-03-06,22.0,32.666667
2022-03-07,53.0,37.000000
2022-03-08,41.0,38.666667


In [None]:

window_size = 31
future_horizon = 1

X_train, y_train = get_X_y(window_size,future_horizon, df_train)
X_test, y_test = get_X_y(window_size, future_horizon, df_test)

X_test = np.expand_dims(X_test, 2)
X_train = np.expand_dims(X_train, 2)

In [19]:
model = create_opt_model()

2022-03-10 15:18:02.266852: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-10 15:18:02.266965: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-10 15:18:02.267021: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-C0GM00LC): /proc/driver/nvidia/version does not exist
2022-03-10 15:18:02.268112: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Predict on the last 31 days

In [20]:
pred_df = df[-31:]

In [21]:
pred_df.head()

Bezirk,total,total_moving_average
date_reported,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-07,46.0,39.666667
2022-02-08,51.0,44.333333
2022-02-09,53.0,50.0
2022-02-10,67.0,57.0
2022-02-11,54.0,58.0


 1.1_B_Theft_DataCleaning.ipynb
 1.2_B_Theft_AllocationToBezirke.ipynb
 1.2_B_Theft_EDA.ipynb
 1.3.1_B_Theft_Modelling.ipynb
 1.3.2_B_Theft_Modelling_Added_Features.ipynb
 1.3.3_B_Theft_Modelling_Clean_Reported_Date.ipynb
 1.4_B_Theft_Production_Test.ipynb
 2_B_Sharing_Location_EDA.ipynb
 4_B_Accident_EDA.ipynb
'4 Bike Accident.ipynb'
 5_Streamlit_mapping.ipynb
 B_Theft_Modelling_OLD_IterativeApproach.ipynb


In [22]:
model.predict()

TypeError: predict() missing 1 required positional argument: 'x'

In [19]:
test = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [21]:
test.tail()

Bezirk,total,total_moving_average
date_reported,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-03,50.0,45.0
2022-03-04,37.0,42.333333
2022-03-05,34.0,40.333333
2022-03-06,21.0,30.666667
2022-03-07,42.0,32.333333


In [24]:
model = create_opt_model()

2022-03-09 10:50:56.336654: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-09 10:50:56.336713: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-09 10:50:56.336729: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-C0GM00LC): /proc/driver/nvidia/version does not exist
2022-03-09 10:50:56.336937: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
model

<keras.engine.sequential.Sequential at 0x7fc8dd8a6c70>

In [135]:
X, y = get_X_y(df)

In [137]:
es = callbacks.EarlyStopping(patience = 5, restore_best_weights=True)

In [136]:
X.shape, y.shape

((395, 31, 2), (395, 1, 2))

In [None]:
model.fit()

In [11]:
df = create_modelling_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [13]:
df.tail()

Bezirk,total,total_moving_average
date_theft_start,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-28,30.0,32.333333
2022-03-01,40.0,36.0
2022-03-02,38.0,37.0
2022-03-03,33.0,35.666667
2022-03-04,36.0,35.666667


In [33]:
X, y = get_X_y(df, window_size=31, future_horizon=1)

In [34]:
X.shape

(396, 31, 2)

In [35]:
y.shape

(396, 1)

In [50]:
# read in dataset from URL
df = create_modelling_dataframe()
#only keep the last 31 days
df = df[-31:]
# create input array
X_input = np.expand_dims(df, axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["BZR"] = d["LOR"].str[:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["PGR"] = d["LOR"].str[:4]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["Bezirk"] = d["LOR"].str[:2]


In [54]:
model.predict(X_input)[0][0]

-0.0467952

In [44]:
df.shape

(31, 2)

In [46]:
df.shape

(31, 2)

In [60]:
import datetime

In [66]:
date = df.index[-1] 
pred_date =  df.index[-1] +  datetime.timedelta(days = 1) 
print(pred_date.date())

2022-03-08


In [59]:
df.index[-1] 

Timestamp('2022-03-07 00:00:00')

In [49]:
np.expand_dims(df, axis = 0).shape

(1, 31, 2)

In [69]:
d = {"date": [pred_date.date()], "total": [13]}

In [80]:
pd.DataFrame(d, index=[0]).set_index("date")

Unnamed: 0_level_0,total
date,Unnamed: 1_level_1
2022-03-08,13


In [78]:
df

Bezirk,total,total_moving_average
date_reported,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-05,37.0,37.0
2022-02-06,36.0,35.666667
2022-02-07,46.0,39.666667
2022-02-08,51.0,44.333333
2022-02-09,53.0,50.0
2022-02-10,67.0,57.0
2022-02-11,54.0,58.0
2022-02-12,41.0,54.0
2022-02-13,50.0,48.333333
2022-02-14,53.0,48.0
