# Bicycle Thefts in Toronto

In [None]:
# Suggested Tasks:

# Data Exploration: Familiarize yourself with the dataset. Identify key variables such as date, location, bike type, etc.

# Temporal Analysis: Analyze temporal trends in bicycle thefts. Are there seasons or times of day when thefts are more frequent?

# Geographical Analysis: Map the locations of bicycle thefts. Are there particular areas that are more heavily affected?

# Profiles of Stolen Bikes: Examine characteristics of stolen bikes (brand, model, color). Are there specific types of bikes that are targeted more often?

# Theft Network: Explore the possibility of links between bicycle thefts. Are there patterns indicating organized operations?

# Predictive Factors: Identify potential predictive factors for bicycle thefts. This could include variables such as weather, holidays, etc.

# Recommendations: Formulate recommendations based on your analyses to help the police prevent bicycle thefts and improve recovery.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide display settings:
# - never truncate the column list when a wide frame is rendered
# - render floats with two decimal places
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Load the raw theft records. `df_bicycle` stays untouched as the raw frame;
# every later step works on the copy `df`.
# The path uses a forward slash: a backslash separator only resolves on Windows
# and '\B' is an invalid escape sequence that newer Python versions warn about.
df_bicycle = pd.read_csv('data/Bicycle_Thefts_Open_Data.csv')
df = df_bicycle.copy()
# Preview the first 5 rows.
df.head()

In [None]:
# Structural overview of the frame: row count, column names, non-null counts,
# dtypes and memory usage (more than just the shape).
df.info()

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# column of the data becomes one row of the summary table.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Share of missing values per column, in percent.
# Columns with more than 50% missing values are not needed for the analysis
# (this cell only reports the percentages; nothing is dropped here).
df.isna().mean().mul(100)

In [None]:
# Count rows that are exact duplicates of an earlier row
# (all columns are compared; the first occurrence is not counted).
df.duplicated().sum()

# Temporal Analysis

In [None]:
# Restrict the analysis to occurrences from 2014 onwards.
# `.copy()` detaches the filtered frame from the unfiltered one, so later
# column assignments on `df` cannot trigger SettingWithCopyWarning.
df = df[df['OCC_YEAR'] >= 2014].copy()
# Alternative that also excludes 2024 (enable it if the partial year should be left out):
# df = df[(df['OCC_YEAR'] >= 2014) & (df['OCC_YEAR'] < 2024)].copy()

In [None]:
# Number of thefts per year.
# Observations from this chart:
# - thefts grow from 2014 to 2018
# - the number of thefts has been decreasing over the last few years
# - the data for 2024 is not complete, so that year cannot be analyzed yet
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=df, x='OCC_YEAR', ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_ylabel('Count')
ax.set_title('Bicycle Thefts by Year')
plt.show()


In [None]:
# Number of thefts per month.
# Observation: thefts are seasonal; they increase in summer with a peak in July.
#
# The bars are put in calendar order explicitly; otherwise seaborn infers the
# order from the data, which is not calendar order for string labels.
# Assumes OCC_MONTH holds full English month names -- TODO confirm. If none of
# the names match, `order` is None and seaborn's default ordering is used.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
months_present = set(df['OCC_MONTH'].dropna().unique())
order = [month for month in month_order if month in months_present] or None

plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='OCC_MONTH', order=order)
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.title('Bicycle Thefts by Month')
plt.show()

In [None]:
# Number of thefts per month, split by year (one bar colour per year).
# Observation: from December 2023 to February 2024 there is an increase in the
# number of thefts compared to the previous years.
#
# The months are put in calendar order explicitly; otherwise seaborn infers the
# order from the data, which is not calendar order for string labels.
# Assumes OCC_MONTH holds full English month names -- TODO confirm. If none of
# the names match, `order` is None and seaborn's default ordering is used.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
months_present = set(df['OCC_MONTH'].dropna().unique())
order = [month for month in month_order if month in months_present] or None

plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='OCC_MONTH', hue='OCC_YEAR', order=order, palette='viridis')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.title('Bicycle Thefts by Year and Month')
plt.show()

In [None]:
# Number of thefts per day of the week, Monday through Sunday.
# The axis label and title now describe the day-of-week breakdown; they
# previously read "Occurrence Year" / "Bicycle Thefts by Year".
# NOTE(review): `order` assumes OCC_DOW contains exactly these day names
# (no padding or abbreviations) -- confirm against the data.
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='OCC_DOW', 
              order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            )
plt.xticks(rotation=45)
plt.xlabel('Day of Week')
plt.ylabel('Count')
plt.title('Bicycle Thefts by Day of Week')
plt.show()

In [None]:
# Compact version of the yearly theft count chart, with an explicit x-axis label.
# NOTE(review): this shows the same counts as the earlier "Bicycle Thefts by Year"
# plot, only at a smaller figure size.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='OCC_YEAR', ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel('Occurrence Year')
ax.set_ylabel('Count')
ax.set_title('Bicycle Thefts by Year')
plt.show()

Ideas:
- Create pivot tables: (year / stolen_count) and (year / city); maybe group by index: year, city, type of bike. Maybe also premises_type / stolen_count.
- Check the correlation between cost and stolen_count.


In [None]:
# # Define the order of the months
# month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# # Convert 'OCC_MONTH' to a categorical type with the specified order
# df['OCC_MONTH'] = pd.Categorical(df['OCC_MONTH'], categories=month_order, ordered=True)

# # Now, create the plot
# df.groupby(['OCC_YEAR', 'OCC_MONTH']).size().unstack().plot(kind='bar', stacked=True, figsize=(12, 4), colormap='viridis')