# Bicycle Thefts in Toronto

In [1]:
# Suggested Tasks:

# Data Exploration: Familiarize yourself with the dataset. Identify key variables such as date, location, bike type, etc.

# Temporal Analysis: Analyze temporal trends in bicycle thefts. Are there seasons or times of day when thefts are more frequent?

# Geographical Analysis: Map the locations of bicycle thefts. Are there particular areas that are more heavily affected?

# Profiles of Stolen Bikes: Examine characteristics of stolen bikes (brand, model, color). Are there specific types of bikes that are targeted more often?

# Theft Network: Explore the possibility of links between bicycle thefts. Are there patterns indicating organized operations?

# Predictive Factors: Identify potential predictive factors for bicycle thefts. This could include variables such as weather, holidays, etc.

# Recommendations: Formulate recommendations based on your analyses to help the police prevent bicycle thefts and improve recovery.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide display settings: show every column of wide frames and
# render floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

In [3]:
# Read the theft records from the CSV export; `df_bicycle` holds the frame as loaded.
df_bicycle = pd.read_csv(filepath_or_buffer='Bicycle_Thefts_Open_Data.csv')
# Work on a deep copy so changes to `df` do not touch `df_bicycle`.
df = df_bicycle.copy(deep=True)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,X,Y,OBJECTID,EVENT_UNIQUE_ID,PRIMARY_OFFENCE,OCC_DATE,OCC_YEAR,OCC_MONTH,OCC_DOW,OCC_DAY,OCC_DOY,OCC_HOUR,REPORT_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DOW,REPORT_DAY,REPORT_DOY,REPORT_HOUR,DIVISION,LOCATION_TYPE,PREMISES_TYPE,BIKE_MAKE,BIKE_MODEL,BIKE_TYPE,BIKE_SPEED,BIKE_COLOUR,BIKE_COST,STATUS,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,-8838282.52,5409902.51,1,GO-20141263544,B&E,2013/12/26 05:00:00+00,2013,December,Thursday,26,360,19,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,17,D14,Other Commercial / Corporate Places (For Profi...,Commercial,FELT,F59,RC,21.0,SILRED,1300.0,STOLEN,165,Harbourfront-CityPlace,77,Waterfront Communities-The Island (77),-79.4,43.64
1,-8843626.12,5409538.96,2,GO-20141261431,THEFT UNDER,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,7,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,7,D14,"Apartment (Rooming House, Condo)",Apartment,SUPERCYCLE,,MT,10.0,,,STOLEN,85,South Parkdale,85,South Parkdale (85),-79.44,43.64
2,-8840398.79,5413057.03,3,GO-20141263784,PROPERTY - FOUND,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,18,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,18,D14,"Single Home, House (Attach Garage, Cottage, Mo...",House,TREK,SOHO S,RG,1.0,BLK,,RECOVERED,80,Palmerston-Little Italy,80,Palmerston-Little Italy (80),-79.41,43.66
3,-8838072.94,5410325.76,4,GO-20149000090,THEFT UNDER,2014/01/01 05:00:00+00,2014,January,Wednesday,1,1,12,2014/01/02 05:00:00+00,2014,January,Thursday,2,2,20,D52,"Apartment (Rooming House, Condo)",Apartment,GI,TCX2 (2010),OT,9.0,BLU,1019.0,STOLEN,165,Harbourfront-CityPlace,77,Waterfront Communities-The Island (77),-79.39,43.64
4,-8835727.45,5411803.31,5,GO-20149000074,THEFT UNDER,2013/12/25 05:00:00+00,2013,December,Wednesday,25,359,16,2014/01/02 05:00:00+00,2014,January,Thursday,2,2,16,D51,"Apartment (Rooming House, Condo)",Apartment,CA,RZ 120 1,MT,21.0,WHI,1500.0,STOLEN,168,Downtown Yonge East,75,Church-Yonge Corridor (75),-79.37,43.65


In [4]:
df.info() # 35 columns: float64(6), int64(9), object(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34985 entries, 0 to 34984
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   X                  34985 non-null  float64
 1   Y                  34985 non-null  float64
 2   OBJECTID           34985 non-null  int64  
 3   EVENT_UNIQUE_ID    34985 non-null  object 
 4   PRIMARY_OFFENCE    34985 non-null  object 
 5   OCC_DATE           34985 non-null  object 
 6   OCC_YEAR           34985 non-null  int64  
 7   OCC_MONTH          34985 non-null  object 
 8   OCC_DOW            34985 non-null  object 
 9   OCC_DAY            34985 non-null  int64  
 10  OCC_DOY            34985 non-null  int64  
 11  OCC_HOUR           34985 non-null  int64  
 12  REPORT_DATE        34985 non-null  object 
 13  REPORT_YEAR        34985 non-null  int64  
 14  REPORT_MONTH       34985 non-null  object 
 15  REPORT_DOW         34985 non-null  object 
 16  REPORT_DAY         349

In [8]:
# Summary statistics of the numeric columns, transposed so each column is one row.
# NOTE(review): the output shows X max = 0.0, Y min = 0.0 and OCC_YEAR min = 1975;
# these look like placeholder coordinates / outlier dates - confirm before mapping or trend analysis.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X,34985.0,-8756050.86,847920.43,-8863505.63,-8840815.56,-8837719.75,-8835692.53,0.0
Y,34985.0,5364885.41,519544.4,0.0,5411478.78,5413318.98,5416550.43,5440288.68
OBJECTID,34985.0,17493.0,10099.44,1.0,8747.0,17493.0,26239.0,34985.0
OCC_YEAR,34985.0,2018.39,2.76,1975.0,2016.0,2018.0,2021.0,2023.0
OCC_DAY,34985.0,15.66,8.63,1.0,8.0,16.0,23.0,31.0
OCC_DOY,34985.0,201.95,77.38,1.0,152.0,205.0,259.0,366.0
OCC_HOUR,34985.0,13.29,6.54,0.0,9.0,14.0,18.0,23.0
REPORT_YEAR,34985.0,2018.41,2.74,2014.0,2016.0,2018.0,2021.0,2023.0
REPORT_DAY,34985.0,15.97,8.6,1.0,9.0,16.0,23.0,31.0
REPORT_DOY,34985.0,203.04,77.71,1.0,153.0,206.0,260.0,366.0


In [9]:
# Number of missing values in each column.
df.isna().sum()

X                        0
Y                        0
OBJECTID                 0
EVENT_UNIQUE_ID          0
PRIMARY_OFFENCE          0
OCC_DATE                 0
OCC_YEAR                 0
OCC_MONTH                0
OCC_DOW                  0
OCC_DAY                  0
OCC_DOY                  0
OCC_HOUR                 0
REPORT_DATE              0
REPORT_YEAR              0
REPORT_MONTH             0
REPORT_DOW               0
REPORT_DAY               0
REPORT_DOY               0
REPORT_HOUR              0
DIVISION                 0
LOCATION_TYPE            0
PREMISES_TYPE            0
BIKE_MAKE              174
BIKE_MODEL           13067
BIKE_TYPE                0
BIKE_SPEED             557
BIKE_COLOUR           3045
BIKE_COST             2378
STATUS                   0
HOOD_158                 0
NEIGHBOURHOOD_158        0
HOOD_140                 0
NEIGHBOURHOOD_140        0
LONG_WGS84               0
LAT_WGS84                0
dtype: int64

In [10]:
# Percentage of missing values in each column.
# Columns with more than 50% missing values are not needed.
df.isna().mean().mul(100)

X                    0.00
Y                    0.00
OBJECTID             0.00
EVENT_UNIQUE_ID      0.00
PRIMARY_OFFENCE      0.00
OCC_DATE             0.00
OCC_YEAR             0.00
OCC_MONTH            0.00
OCC_DOW              0.00
OCC_DAY              0.00
OCC_DOY              0.00
OCC_HOUR             0.00
REPORT_DATE          0.00
REPORT_YEAR          0.00
REPORT_MONTH         0.00
REPORT_DOW           0.00
REPORT_DAY           0.00
REPORT_DOY           0.00
REPORT_HOUR          0.00
DIVISION             0.00
LOCATION_TYPE        0.00
PREMISES_TYPE        0.00
BIKE_MAKE            0.50
BIKE_MODEL          37.35
BIKE_TYPE            0.00
BIKE_SPEED           1.59
BIKE_COLOUR          8.70
BIKE_COST            6.80
STATUS               0.00
HOOD_158             0.00
NEIGHBOURHOOD_158    0.00
HOOD_140             0.00
NEIGHBOURHOOD_140    0.00
LONG_WGS84           0.00
LAT_WGS84            0.00
dtype: float64

In [10]:
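# TODO: fill the missing BIKE_COST and BIKE_SPEED values with group-wise medians.
# The template below uses names from another dataset (`data`, `income_type`,
# `days_employed`) that do not exist here; adapt it to `df` and its columns first.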
# for t in data['income_type'].unique():
#     data.loc[(data['income_type'] == t) & (data['days_employed'].isna()), 'days_employed'] = \
#     data.loc[(data['income_type'] == t), 'days_employed'].median()

Ideas:
- Create pivot tables: year vs. stolen_count and year vs. city; maybe group by an index of year, city, and type of bike; maybe also premises_type vs. stolen_count.
- Look at the correlation between bike cost and stolen_count.
