# Bicycle Thefts in Toronto

In [None]:
# Suggested Tasks:

# Data Exploration: Familiarize yourself with the dataset. Identify key variables such as date, location, bike type, etc.

# Temporal Analysis: Analyze temporal trends in bicycle thefts. Are there seasons or times of day when thefts are more frequent?

# Geographical Analysis: Map the locations of bicycle thefts. Are there particular areas that are more heavily affected?

# Profiles of Stolen Bikes: Examine characteristics of stolen bikes (brand, model, color). Are there specific types of bikes that are targeted more often?

# Theft Network: Explore the possibility of links between bicycle thefts. Are there patterns indicating organized operations?

# Predictive Factors: Identify potential predictive factors for bicycle thefts. This could include variables such as weather, holidays, etc.

# Recommendations: Formulate recommendations based on your analyses to help the police prevent bicycle thefts and improve recovery.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook display preferences: no cap on the number of columns shown, no
# fixed display width, and floats rendered with two decimal places.
for _option, _value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(_option, _value)


In [None]:
# Load the raw CSV, keep the untouched frame in df_bicycle and work on a copy.
# The path uses a forward slash: the previous 'data\B...' spelling relied on
# "\B" being an unrecognised escape sequence (which newer Python versions warn
# about) and on a Windows-only path separator.
df_bicycle = pd.read_csv('data/Bicycle_Thefts_Open_Data.csv')
df = df_bicycle.copy()
df.head()  # preview the first 5 rows

In [None]:
# Print a summary of the frame: row count, column names, non-null counts,
# dtypes and memory usage.
df.info()

In [None]:
# Convert the occurrence date column to pandas datetime values so it can be
# used in date-based operations.
df['OCC_DATE'] = pd.to_datetime(df['OCC_DATE'])

In [None]:
# Summary statistics, transposed so that each described column is one row.
summary_stats = df.describe()
summary_stats.transpose()

In [None]:
# Number of missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Share of missing values per column, in percent.
# Columns with more than 50% missing values are not needed for the analysis.
missing_share = df.isna().mean()
missing_share * 100

In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

In [None]:
# Preview the first five rows again.
df.head()

# Temporal Analysis

In [None]:
# Keep occurrences from 2014 onward. The explicit .copy() makes df an
# independent frame, so later column assignments and in-place edits do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['OCC_YEAR'] >= 2014].copy()
# Alternative that also drops the incomplete 2024 data:
# df = df[(df['OCC_YEAR'] >= 2014) & (df['OCC_YEAR'] < 2024)].copy()

In [None]:
# Thefts per year.
# Observations: counts grow from 2014 to 2018 and have been decreasing over
# the last few years. The 2024 data is incomplete, so that bar cannot be
# compared directly; forecasting the 2024 total is a possible follow-up.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(data=df, x='OCC_YEAR', ax=ax)
year_bars = ax.containers[0]
ax.bar_label(year_bars, fmt='%.0f', label_type='center', color='white')
ax.set_ylabel('Count')
ax.set_title('How has the number of thefts changed over the years?')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Yearly theft counts with year-over-year change; 2024 is left out.
complete_years = df.loc[df['OCC_YEAR'] != 2024, 'OCC_YEAR']
yearly_counts = complete_years.value_counts().sort_index()

# Change relative to the previous year, in percent (the first entry is NaN).
percentage_change = yearly_counts.pct_change() * 100

plt.figure(figsize=(10, 6))
plt.plot(yearly_counts.index, yearly_counts.values, marker='o', linestyle='-', label='Number of Thefts')

# Label every point except the first, which has no previous year to compare to.
labelled_points = zip(
    yearly_counts.index[1:],
    yearly_counts.values[1:],
    percentage_change.values[1:],
)
for year, thefts, change in labelled_points:
    plt.annotate(f'{change:.1f}%', (year, thefts), textcoords="offset points", xytext=(0, 10), ha='right')

plt.title('How has the number of thefts changed over the years?')
plt.xlabel('Year')
plt.ylabel('Number of Thefts')
plt.xticks(yearly_counts.index, rotation=45)
plt.grid(False)
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Thefts per month.
# Observation: thefts are seasonal - they increase in summer and peak in July.
#
# When OCC_MONTH holds English month names, seaborn orders the bars by first
# appearance in the data rather than by calendar, so the calendar order is
# passed explicitly (the weekday plot for OCC_DOW does the same with day
# names). For any other encoding (e.g. month numbers) month_order stays None
# and seaborn's default ordering is used.
calendar_months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
observed_months = set(df['OCC_MONTH'].dropna().unique())
if observed_months and observed_months.issubset(calendar_months):
    month_order = [month for month in calendar_months if month in observed_months]
else:
    month_order = None

plt.figure(figsize=(8, 3))
ax = sns.countplot(data=df, x='OCC_MONTH', order=month_order)
ax.bar_label(ax.containers[0], fmt='%.0f', label_type='center', color='white')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.title('Did bicycle thefts occur more frequently in certain months?')
plt.show()

In [None]:
# Thefts per month, split by year.
# Observation: from December 2023 to February 2024 there are more thefts than
# in the same months of previous years.
#
# When OCC_MONTH holds English month names, the calendar order is passed
# explicitly so the month groups are not laid out in order of first
# appearance. For any other encoding (e.g. month numbers) month_order stays
# None and seaborn's default ordering is used.
calendar_months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
observed_months = set(df['OCC_MONTH'].dropna().unique())
if observed_months and observed_months.issubset(calendar_months):
    month_order = [month for month in calendar_months if month in observed_months]
else:
    month_order = None

plt.figure(figsize=(8, 3))
sns.countplot(data=df, x='OCC_MONTH', hue='OCC_YEAR', palette='viridis', order=month_order)
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.title('Bicycle Thefts by Year and Month')
plt.show()

In [None]:
# Thefts per day of the week.
# Observation: thefts are higher on weekdays than on weekends, with a peak on
# Friday.
# The bars are forced into Monday-to-Sunday order; the x-axis label now names
# the weekday axis (it previously read 'Occurrence Year').
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

plt.figure(figsize=(8, 3))
ax = sns.countplot(data=df, x='OCC_DOW', order=weekday_order)
ax.bar_label(ax.containers[0], fmt='%.0f', label_type='center', color='white')
plt.xticks(rotation=45)
plt.xlabel('Day of the Week')
plt.ylabel('Count')
plt.title('Did bicycle thefts occur more frequently in certain days of the week?')
plt.show()

In [None]:
# Thefts per day of the month; bar labels are rotated and shrunk to fit the
# narrow bars.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(data=df, x='OCC_DAY', ax=ax)
day_bars = ax.containers[0]
ax.bar_label(day_bars, fmt='%.0f', label_type='center', color='white', rotation=90, size=8)
ax.set_ylabel('Count')
ax.set_title('Did bicycle thefts occur more frequently in certain days of the month?')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Thefts per hour of the day.
# Observation: thefts increase from 12:00 and peak between 17:00 and 18:00.
#
# The x-axis starts at hour 5 and wraps hours 0-4 to the end: the sort key
# puts every hour below 5 after the others. The order is handed to countplot
# directly instead of overwriting OCC_HOUR with a Categorical, so the column
# keeps its original dtype for later analysis and the filtered frame is not
# assigned into.
hours = sorted(df['OCC_HOUR'].unique(), key=lambda hour: (hour < 5, hour))

plt.figure(figsize=(8, 3))
ax = sns.countplot(data=df, x='OCC_HOUR', order=hours)
ax.bar_label(ax.containers[0], fmt='%.0f', label_type='center', color='white', rotation=90, size=9)
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.title('Did bicycle thefts occur more frequently at certain times of the day?')
plt.show()

# Geographical Analysis

In [None]:
# NOTE(review): disabled draft of an interactive theft map. It adds one
# folium.Marker per row of df; presumably that is too heavy for the full
# dataset - consider clustering or a heat map before re-enabling, and confirm
# how rows with missing coordinates should be handled.

# import folium

# # Initialize the map centered around Toronto
# m = folium.Map(location=[43.7, -79.4], zoom_start=11)

# # Add markers to the map
# for index, row in df.iterrows():
#     folium.Marker(
#         location=[row['LAT_WGS84'], row['LONG_WGS84']],
#         popup=f'Latitude: {row["LAT_WGS84"]}, Longitude: {row["LONG_WGS84"]}'
#     ).add_to(m)

# # Save the map as an HTML file
# m.save("map.html")

# Profiles of Stolen Bikes

In [None]:
# Replace the two-letter BIKE_TYPE codes with readable labels. Only the
# BIKE_TYPE column is touched; values that are not in the mapping are left
# unchanged.
# The result is bound back to df rather than replaced in place, so the
# filtered frame is not mutated. 'Recumbent' fixes the earlier 'Recumbant'
# misspelling of the label.
bike_type_labels = {
    'BM': 'BMX',
    'EL': 'Electric',
    'FO': 'Folding',
    'MT': 'Mountain',
    'OT': 'Other',
    'RC': 'Racer',
    'RE': 'Recumbent',
    'RG': 'Regular',
    'SC': 'Scooter',
    'TA': 'Tandem',
    'TO': 'Touring',
    'TR': 'Tricycle',
    'UN': 'Unicycle',
}
df = df.replace({'BIKE_TYPE': bike_type_labels})

In [None]:
# Five most frequently stolen bike types, with each type's share of all
# thefts that have a BIKE_TYPE value.
# Observation: leaving aside the uncategorized and unknown types, mountain,
# regular and racer bikes are stolen most often. (The code itself does not
# filter any type out.)
bike_type_counts = df['BIKE_TYPE'].value_counts()
top_bike_types = bike_type_counts.nlargest(5)
total_thefts = bike_type_counts.sum()
percentage_thefts = top_bike_types / total_thefts * 100

# Horizontal bars with the raw count centred inside each bar.
ax = sns.barplot(x=top_bike_types.values, y=top_bike_types.index, orient='h')
ax.bar_label(ax.containers[0], fmt='%.0f', label_type='center', color='white')

# Percentage share, right-aligned just inside the end of each bar.
for position, bar in enumerate(ax.patches):
    share = percentage_thefts.iloc[position]
    bar_end = bar.get_x() + bar.get_width() - 3
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'{share:.0f}%', (bar_end, bar_middle), ha='right', va='top', color='yellow')

plt.title('Did certain types of bikes get stolen more often? (Top 5)')
plt.xlabel('Count')
plt.ylabel('Bike Type')
plt.show()

# Theft Network

# Predictive Factors

# Recommendations