In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import folium
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.cm as cm
from matplotlib import colors as mcolors
from sklearn.decomposition import PCA

In [None]:
# Location of the Major Crime Indicators CSV export, relative to the notebook
FILEPATH = './Major_Crime_Indicators_Open_Data_-3805566126367379926.csv'

# `data` keeps the frame exactly as read (first CSV column used as the index);
# every later cleaning step works on the independent copy `df`
data = pd.read_csv(filepath_or_buffer=FILEPATH, index_col=0)
df = data.copy(deep=True)

# Preview the first five rows
df.head(5)

In [None]:
# Print pandas' concise summary of df (columns, non-null counts, dtypes)
df.info()

In [None]:
# Tally the missing entries in each column
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value ...
df = df.dropna(axis=0, how='any')

# ... and confirm that no column reports any remaining nulls
df.isna().sum()

In [None]:
# Columns that are removed from the working frame
dropped_columns = ['HOOD_140', 'NEIGHBOURHOOD_140', 'x', 'y']

# Exclude rows whose NEIGHBOURHOOD_158 is 'NSA', then drop the unused columns.
# Written as one chained expression that returns a new frame instead of an
# in-place drop on a filtered slice; errors='ignore' keeps the cell safe to
# re-run after the columns have already been removed (the in-place version
# raised KeyError on a second execution).
df = (
    df[df['NEIGHBOURHOOD_158'] != 'NSA']
    .drop(columns=dropped_columns, errors='ignore')
)

In [None]:
# Extent of the remaining records in the two WGS84 coordinate columns
longitudes = df['LONG_WGS84']
latitudes = df['LAT_WGS84']

long_min = longitudes.min()
long_max = longitudes.max()
lat_min = latitudes.min()
lat_max = latitudes.max()

print(f"Longitude range: {long_min} to {long_max}")
print(f"Latitude range: {lat_min} to {lat_max}")

In [None]:
# Span of reporting years present in the cleaned data
year_min, year_max = df['REPORT_YEAR'].min(), df['REPORT_YEAR'].max()
print(f"Original year range: {year_min} to {year_max}")

# Crime trend from 2014 - 2024
# Keep exactly the years the trend plots advertise. Filtering on an inclusive
# range (rather than only excluding 2025) stops any other out-of-window year
# from leaking into the "2014-2024" charts.
TREND_FIRST_YEAR, TREND_LAST_YEAR = 2014, 2024
df_filtered = df[df['REPORT_YEAR'].between(TREND_FIRST_YEAR, TREND_LAST_YEAR)]

In [None]:
# Number of incidents per reporting year (one row per year)
yearly_cases = (
    df_filtered
    .groupby('REPORT_YEAR')
    .size()
    .reset_index(name='Yearly_Case_Count')
)

# Line chart of the yearly totals, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_cases, x='REPORT_YEAR', y='Yearly_Case_Count', marker='o', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Crime Cases')
ax.set_title('Crime Cases by Year')

# One tick per year present in the data, with tilted labels
ax.set_xticks(yearly_cases['REPORT_YEAR'])
plt.setp(ax.get_xticklabels(), rotation=45)

# Start the y-axis at zero
ax.set_ylim(bottom=0)

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

**Key findings:** The crime trend from 2014 to 2024 shows a steady increase in incidents up to a peak in 2019, followed by a sharp decline in 2020 and 2021, likely due to the pandemic. Since then, crime cases have risen again, with a notable surge in 2023.

In [None]:
# Incident counts for every (reporting year, MCI category) pair
yearly_cases_category = (
    df_filtered
    .groupby(['REPORT_YEAR', 'MCI_CATEGORY'])
    .size()
    .reset_index(name='Case_Count')
)

# Grouped bars: one cluster per year, one bar per category
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=yearly_cases_category, x='REPORT_YEAR', y='Case_Count', hue='MCI_CATEGORY', ax=ax)
ax.set_title("Total Crime Cases by Category over Time (2014-2024)")
ax.set_xlabel('Year')
ax.set_ylabel('Number of Cases')

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

**Key findings:** The plot shows that Assault consistently has the highest number of cases from 2014 to 2024, with a sharp increase in 2023. Auto Theft shows a notable increase, while Robbery and Theft Over remain relatively stable and lower in frequency.

In [None]:
# Crime trend from 2021 - 2024
RECENT_YEARS = [2021, 2022, 2023, 2024]

# Take an explicit copy: later cells assign into df_recent, and writing to a
# boolean-filtered slice of df can trigger pandas' SettingWithCopyWarning.
df_recent = df[df['REPORT_YEAR'].isin(RECENT_YEARS)].copy()

In [None]:
# Bar per MCI category counting the rows of df_recent; hue repeats the
# x variable so each category gets its own colour
ax = sns.countplot(x='MCI_CATEGORY', hue='MCI_CATEGORY', data=df_recent)
ax.set_title('Total Crime Cases by Category (2021-2024)')
ax.set_xlabel('MCI Category')
ax.set_ylabel('Number of Cases')

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
# Share of df_recent incidents recorded for each premises type, largest first
premised_cases_category = (
    df_recent
    .groupby('PREMISES_TYPE')
    .size()
    .reset_index(name='Case_Count')
)
total_cases = premised_cases_category['Case_Count'].sum()
premised_cases_category['Percentage'] = (premised_cases_category['Case_Count'] / total_cases) * 100
# sort by percentage
premised_cases_category = premised_cases_category.sort_values(by='Percentage', ascending=False)

# Pin the bar order to the sorted frame so that bar position i always
# corresponds to row i when the labels are placed below
premise_order = premised_cases_category['PREMISES_TYPE'].tolist()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x='Percentage',
    y='PREMISES_TYPE',
    data=premised_cases_category,
    order=premise_order,
    palette='viridis',
    hue='PREMISES_TYPE',
    legend=False,
    ax=ax,
)

# Label every bar with its percentage. Wide bars carry a white label inside
# the bar; narrow bars get a dark label just past the bar end. A white label
# anchored at `value - 2` would sit on the white background (or at a negative
# x position) for small percentages and be unreadable.
LABEL_PAD = 0.3                # gap between bar end and label, in percentage points
MIN_PCT_FOR_INSIDE_LABEL = 5   # bars narrower than this cannot hold the label
for bar_pos, pct in enumerate(premised_cases_category['Percentage']):
    if pct >= MIN_PCT_FOR_INSIDE_LABEL:
        ax.text(pct - LABEL_PAD, bar_pos, f'{pct:.2f}%', color='white', va='center', ha='right')
    else:
        ax.text(pct + LABEL_PAD, bar_pos, f'{pct:.2f}%', color='black', va='center', ha='left')

ax.set_title('Crime Distribution by Premise Type (2021-2024)')
ax.set_xlabel('Percentage of Total Crime Cases (%)')
ax.set_ylabel('Type of Premise')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Number of df_recent incidents for each value of OCC_HOUR
hour_cases = (
    df_recent
    .groupby('OCC_HOUR')
    .size()
    .reset_index(name='Case_Count')
)

# Line chart of incident counts across the hours
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=hour_cases, x='OCC_HOUR', y='Case_Count', marker='o', ax=ax)
ax.set_title('Crime Distribution by Time of Day (2021-2024)')
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Cases')

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

**Key findings:** The plot shows crime peaking at midnight, followed by a sharp decline, with the lowest activity in the early morning hours. Crime gradually increases from 6:00 AM, with another rise in the afternoon and evening, remaining high between 3:00 PM and 9:00 PM.

In [None]:
# Define the logical order of the days of the week
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Normalize the day names in the 'OCC_DOW' column by stripping surrounding
# whitespace. assign() builds a new frame, so this never writes into a slice
# of another DataFrame (no SettingWithCopyWarning) and is safe to re-run.
df_recent = df_recent.assign(OCC_DOW=df_recent['OCC_DOW'].str.strip())

# Count incidents per day and put them in calendar order; a day that never
# occurs in the data is shown as 0 instead of becoming NaN
day_of_week_cases = (
    df_recent['OCC_DOW']
    .value_counts()
    .reindex(days_order, fill_value=0)
)

# Plotting the bar plot for crime cases by day of the week
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=day_of_week_cases.index,
    y=day_of_week_cases.values,
    palette='viridis',
    hue=day_of_week_cases.index,
    legend=False,
    ax=ax,
)

ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Crime Cases')
ax.set_title('Crime Cases by Day of the Week')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

**Key findings:** The distribution of crime cases across the days of the week shows no significant trend; counts remain roughly equal throughout the week.

In [None]:
# Incident count per neighbourhood in df_recent, busiest first
NEIGHBOURHOOD_crime = (
    df_recent
    .groupby('NEIGHBOURHOOD_158')
    .size()
    .reset_index(name='Case_Count')
    .sort_values(by='Case_Count', ascending=False)
)

# Slice the ten busiest neighbourhoods once and reuse the result. A bare
# `.head(10)` expression in the middle of a cell displays nothing, so it is
# replaced by this named frame that the plot consumes.
top_neighbourhoods = NEIGHBOURHOOD_crime.head(10)

ax = sns.barplot(x='Case_Count', y='NEIGHBOURHOOD_158', data=top_neighbourhoods)
ax.set_title('Top 10 Neighbourhoods by Crime Frequency(2021-2024)')
ax.set_xlabel('Number of Cases')
ax.set_ylabel('Neighbourhood')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Base map centred on a rough midpoint of Toronto
map_center = [43.7, -79.4]
crime_map = folium.Map(location=map_center, zoom_start=11)

# [lat, lon] pairs for every df_recent row that has both coordinates
heat_data = (
    df_recent
    .loc[:, ['LAT_WGS84', 'LONG_WGS84']]
    .dropna()
    .to_numpy()
    .tolist()
)

# Build the heat layer and attach it to the map
heat_layer = HeatMap(heat_data, radius=10)
heat_layer.add_to(crime_map)

# Write a standalone HTML copy, then render the map as the cell output
crime_map.save('crime_heatmap.html')
crime_map

In [None]:
# Two-column array (latitude, longitude) for every row of df_recent
coordinates = df_recent[['LAT_WGS84', 'LONG_WGS84']].to_numpy()

# Standardize both columns; the fitted scaler is kept so that values can be
# mapped back to the original scale later
scaler = StandardScaler()
coordinates_scaled = scaler.fit_transform(X=coordinates)

In [None]:
# Elbow method: fit KMeans for k = 1..19 on the scaled coordinates and record
# the inertia of each fit
range_n_clusters = range(1, 20)
inertia = []

for k in range_n_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(coordinates_scaled)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend in the curve hints at a reasonable cluster count
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_n_clusters, inertia, marker='o', linestyle='-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method to Find Optimal Number of Clusters')

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

While the elbow method suggests a number of clusters by trading off model complexity against the reduction in inertia, that number does not necessarily translate into an ideal number of police divisions to oversee the city.

In [None]:
# Final clustering of the scaled coordinates with the chosen number of clusters
optimal_k = 16
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
labels = kmeans.fit_predict(coordinates_scaled)

# Attach the cluster labels on a fresh frame so df_recent itself is untouched
df_new = df_recent.assign(Cluster=labels)

# Create a folium map centered at Toronto
map_center = [43.7, -79.4]
crime_cluster_map = folium.Map(location=map_center, zoom_start=11)

# Qualitative colormap whose swatches are used as per-cluster colors
colormap = plt.colormaps['tab20']

# Draw one circle per incident, colored by its cluster.
# - The swatch is picked by integer index (wrapping if there are more clusters
#   than swatches). Dividing by (optimal_k - 1) would raise ZeroDivisionError
#   for a single cluster and samples the swatches unevenly.
# - groupby walks the frame once instead of re-filtering it for every label.
# NOTE(review): this adds one marker per row of df_new; for a large frame the
# saved HTML can become very big — consider sampling if the map is slow.
for label, members in df_new.groupby('Cluster'):
    color = mcolors.to_hex(colormap(int(label) % colormap.N))
    for lat, lon in members[['LAT_WGS84', 'LONG_WGS84']].to_numpy():
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6
        ).add_to(crime_cluster_map)

# Save and display the map
crime_cluster_map.save('crime_cluster_map_k16.html')
crime_cluster_map

In [None]:
# Project the scaled coordinates onto two principal components
pca = PCA(n_components=2)
coordinates_2d = pca.fit_transform(coordinates_scaled)

# First component on x, negated second component on y (vertical flip)
component_1 = coordinates_2d[:, 0]
component_2_flipped = -coordinates_2d[:, 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=component_1, y=component_2_flipped, hue=labels, palette='tab20', alpha=0.6, ax=ax)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2 (flipped)')
ax.set_title('Cluster Visualization in 2D Using PCA (Vertically Flipped)')

# Anchor the legend past the right edge of the axes
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
plt.show()

In [None]:
# Rows per cluster label, most populated cluster first
cluster_counts = df_new.loc[:, 'Cluster'].value_counts()
print(cluster_counts)

In [None]:
# Bar per cluster label showing how many rows were assigned to it
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=cluster_counts.index,
    y=cluster_counts.values,
    hue=cluster_counts.index,
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Number of Points')
ax.set_title('Cluster Size Distribution')

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
# Cluster centres as found by KMeans (in standardized units)
centroids = kmeans.cluster_centers_

# Undo the standardization so the centres are expressed in the same units as
# the columns the scaler was fitted on
original_centroids = scaler.inverse_transform(centroids)

# One row per cluster, indexed by cluster number
centroid_df = pd.DataFrame(data=original_centroids, columns=['Latitude', 'Longitude'])
print(centroid_df)

In [None]:
# Drop a blue info marker on the existing cluster map at each centroid.
# Markers are appended to crime_cluster_map, so running this cell again adds
# another set on top of the first.
for centroid in centroid_df.itertuples():
    marker = folium.Marker(
        location=[centroid.Latitude, centroid.Longitude],
        popup=f'Cluster {centroid.Index} Centroid',
        icon=folium.Icon(color='blue', icon='info-sign')
    )
    marker.add_to(crime_cluster_map)

# Write the annotated map to its own HTML file and show it as the cell output
crime_cluster_map.save('crime_cluster_map_with_centroids.html')
crime_cluster_map