# Time slot of crime occuring prediction

Police patrol usually runs in shift to cover the 24 hours. Hence, we designed our prediction model to predict in which time slot a crime will occur at a location. Where the location is given by latitude and longitude. 

## Import the necessary libraries

In [1]:
from pyspark.sql import functions as F
from sklearn.cluster import DBSCAN, KMeans
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

import folium
from folium.plugins import HeatMap, MarkerCluster, AntPath
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import gaussian_kde

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [2]:
%load_ext autoreload
%autoreload 2
import sys; 
import sys; sys.path.insert(0, 'lib/')
from libs import utils, genetic_algo

## Load the data
Load the data and do the necessary cleaning and preprocessing.

In [3]:
data_path = "data/Crime_Data_from_2020_to_Present_20241013.csv"
# Read and Clean Data
spark = utils.start_spark()
df = utils.prepare_data(spark, data_path)
# Additional feature engineering - add more features
df = utils.feature_engineering(df)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/09 10:38:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/09 10:38:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [4]:
df.show()

24/11/09 10:38:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+----------+--------+----+-----------+-----------+--------+------+--------------------+--------------------+--------+--------+--------------------+---------+--------------------+--------------+--------------------+------+------------+--------------------+-------+---------+-------------------+----+---------+--------+---------+-------+-----------+------------------+
| Date Rptd|  DATE OCC|TIME OCC|AREA|  AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|        Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status| Status Desc|            LOCATION|    LAT|      LON|       Occ DateTime|hour|time_slot|Year OCC|Month OCC|Day OCC|  age_group|          Category|
+----------+----------+--------+----+-----------+-----------+--------+------+--------------------+--------------------+--------+--------+--------------------+---------+--------------------+--------------+--------------------+------+------------

                                                                                

In [5]:
columns_drop = ["AREA NAME", 
                "Rpt Dist No", 
                "Part 1-2", 
                "Crm Cd", 
                "Crm Cd Desc", 
                "Mocodes", 
                "Premis Cd",
                "Premis Desc",
                "Weapon Used Cd",
                "Weapon Desc",
                "Status Desc",
                "LOCATION",
                "age_group",
                "Date Rptd",
                "DATE OCC",
                "TIME OCC",
                "Vict Age",
                "Vict Descent",
                "Status",
                "Occ DateTIme",
                "Year OCC", 
                "hour",
                "Category",
                "Vict Sex"]
df_dropped = utils.drop_columns(df, columns_drop)
df_dropped.show()

+----+-------+---------+---------+---------+-------+
|AREA|    LAT|      LON|time_slot|Month OCC|Day OCC|
+----+-------+---------+---------+---------+-------+
|   1|34.0444|-118.2628|        3|        2|      7|
|   3| 34.021|-118.3002|        3|       11|      4|
|   9|34.1576|-118.4387|        3|        3|      3|
|   6|34.0944|-118.3277|        2|        8|      2|
|  18|33.9467|-118.2463|        3|       12|      3|
|   1|34.0415| -118.262|        2|        7|      6|
|   3|34.0335|-118.3537|        2|        5|      3|
|  13|33.9911|-118.2521|        2|       12|      4|
|  19|34.2336|-118.4535|        2|       12|      5|
|  18|33.9492|-118.2739|        2|        7|      4|
|  19|34.2285|-118.4258|        1|       10|      5|
|   2|34.0868|-118.2991|        2|        2|      7|
|   3|34.0103|-118.3456|        1|        2|      1|
|  18|33.9551|-118.2814|        2|        7|      4|
|  13|34.0112|-118.2716|        2|        1|      5|
|   7|34.0452|-118.3351|        3|       11|  

24/11/09 10:38:51 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Train the model with Logistic Regression

In [7]:
df_pd = df_dropped.toPandas()
df_pd.head()

Unnamed: 0,AREA,LAT,LON,time_slot,Month OCC,Day OCC
0,1,34.0444,-118.2628,3,2,7
1,3,34.021,-118.3002,3,11,4
2,9,34.1576,-118.4387,3,3,3
3,6,34.0944,-118.3277,2,8,2
4,18,33.9467,-118.2463,3,12,3


### With the whole dataset

In [14]:
features = ["LAT", "LON", "Month OCC", "Day OCC"]
X = df_pd[features]
y = df_pd["time_slot"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Logistic Regression model
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
lr.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = lr.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_pred, y_test))



Accuracy: 0.40820571127488026
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.11      0.41      0.17     15395
           3       0.89      0.41      0.56    128846

    accuracy                           0.41    144241
   macro avg       0.34      0.27      0.25    144241
weighted avg       0.81      0.41      0.52    144241



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### With area-specific data

In [13]:
# Using data from area 1
data = df_pd[df_pd["AREA"]==1]
features = ["LAT", "LON", "Month OCC", "Day OCC"]
X = data[features]
y = data["time_slot"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Logistic Regression model
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
lr.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = lr.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_pred, y_test))

Accuracy: 0.4273470594030145
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.00      0.38      0.00         8
           3       1.00      0.43      0.60     10143

    accuracy                           0.43     10151
   macro avg       0.33      0.27      0.20     10151
weighted avg       1.00      0.43      0.60     10151



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train the model with Ensemble Methods

In [18]:
def train_time_slot_prediction(data, use_cluster=False):
    if use_cluster:
        features = ['Month OCC', 'Day OCC', 'LAT', 'LON', "cluster"]
    else:
        features = ['Month OCC', 'Day OCC', 'LAT', 'LON']
    X = data[features] # Features
    y = data['time_slot']  # Target variable
    
    # Convert 'Day OCC' to categorical encoding (one-hot encoding or ordinal encoding)
    X = pd.get_dummies(X, columns=['Month OCC', "Day OCC"], drop_first=True)
    # Encode the target variable (time_slot) to numeric values
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    # Split data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define a pipeline for each model (Random Forest, Gradient Boosting, XGBoost)
    def create_pipeline(model):
        return Pipeline([
            ('scaler', StandardScaler()),  # Scaling features
            ('classifier', model)
        ])

    # Random Forest Classifier
    rf_model = RandomForestClassifier(random_state=42)
    rf_pipeline = create_pipeline(rf_model)

    # Gradient Boosting Classifier
    gb_model = GradientBoostingClassifier(random_state=42)
    gb_pipeline = create_pipeline(gb_model)

    # XGBoost Classifier
    xgb_model = XGBClassifier(random_state=42)
    xgb_pipeline = create_pipeline(xgb_model)

    # Prepare the parameter grids for hyperparameter tuning
    param_grid_rf = {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
        'classifier__class_weight': ['balanced', None]
    }

    param_grid_gb = {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_weight_fraction_leaf': [0.0, 0.01]
    }

    param_grid_xgb = {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__max_depth': [3, 5, 7]
    }

    # Cross-validation using StratifiedKFold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform Grid Search for Random Forest
    grid_search_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search_rf.fit(X_train, y_train)

    # Perform Grid Search for Gradient Boosting
    grid_search_gb = GridSearchCV(gb_pipeline, param_grid_gb, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search_gb.fit(X_train, y_train)

    # Perform Grid Search for XGBoost
    grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search_xgb.fit(X_train, y_train)

    # Best Model Evaluation (Random Forest)
    best_rf_model = grid_search_rf.best_estimator_
    rf_pred = best_rf_model.predict(X_test)
    print("Random Forest Results:")
    print(f"Accuracy: {accuracy_score(y_test, rf_pred)}")
    print(classification_report(y_test, rf_pred))

    # Best Model Evaluation (Gradient Boosting)
    best_gb_model = grid_search_gb.best_estimator_
    gb_pred = best_gb_model.predict(X_test)
    print("Gradient Boosting Results:")
    print(f"Accuracy: {accuracy_score(y_test, gb_pred)}")
    print(classification_report(y_test, gb_pred))

    # Best Model Evaluation (XGBoost)
    best_xgb_model = grid_search_xgb.best_estimator_
    xgb_pred = best_xgb_model.predict(X_test)
    print("XGBoost Results:")
    print(f"Accuracy: {accuracy_score(y_test, xgb_pred)}")
    print(classification_report(y_test, xgb_pred))
    
    # Save the best model (based on the best performance)
    best_model = None
    best_accuracy = 0

    # Choose the best model based on accuracy
    if accuracy_score(y_test, rf_pred) > best_accuracy:
        best_model = best_rf_model
        best_accuracy = accuracy_score(y_test, rf_pred)
        model_name = "Random Forest"
    if accuracy_score(y_test, gb_pred) > best_accuracy:
        best_model = best_gb_model
        best_accuracy = accuracy_score(y_test, gb_pred)
        model_name = "Gradient Boosting"
    if accuracy_score(y_test, xgb_pred) > best_accuracy:
        best_model = best_xgb_model
        best_accuracy = accuracy_score(y_test, xgb_pred)
        model_name = "XGBoost"

    # Print which model was selected as the best
    print(f"The best model is: {model_name} with accuracy: {best_accuracy}")
    return best_model
    

In [21]:
def save_model(model, filename):# Save the best model using pickle
    model_path = f"models/{filename}.pkl"
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Best model saved as '{filename}.pkl'")

In [15]:
data = df_dropped.toPandas()
train_time_slot_prediction(data)

                                                                                

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits


24/11/09 01:10:53 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 813404 ms exceeds timeout 120000 ms
24/11/09 01:10:53 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/09 01:11:01 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

Fitting 5 folds for each of 18 candidates, totalling 90 fits




Random Forest Results:
Accuracy: 0.41390450704030063
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     29376
           1       0.43      0.14      0.21     55903
           2       0.41      0.88      0.56     58962

    accuracy                           0.41    144241
   macro avg       0.28      0.34      0.26    144241
weighted avg       0.33      0.41      0.31    144241



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gradient Boosting Results:
Accuracy: 0.4232499774682649
              precision    recall  f1-score   support

           0       0.48      0.00      0.01     29376
           1       0.43      0.34      0.38     55903
           2       0.42      0.71      0.53     58962

    accuracy                           0.42    144241
   macro avg       0.44      0.35      0.31    144241
weighted avg       0.44      0.42      0.36    144241

XGBoost Results:
Accuracy: 0.4204560423180649
              precision    recall  f1-score   support

           0       0.46      0.00      0.00     29376
           1       0.42      0.35      0.38     55903
           2       0.42      0.69      0.52     58962

    accuracy                           0.42    144241
   macro avg       0.43      0.35      0.30    144241
weighted avg       0.43      0.42      0.36    144241

The best model is: Gradient Boosting with accuracy: 0.4232499774682649
Best model saved as 'best_model.pkl'


In [19]:
data_area_1 = data[data["AREA"]==1]
best_model = train_time_slot_prediction(data_area_1)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Random Forest Results:
Accuracy: 0.43887301743670576
              precision    recall  f1-score   support

           0       0.80      0.00      0.00      2096
           1       0.45      0.11      0.18      3697
           2       0.44      0.92      0.59      4358

    accuracy                           0.44     10151
   macro avg       0.56      0.35      0.26     10151
weighted avg       0.52      0.44      0.32     10151

Gradient Boosting Results:
Accuracy: 0.45305881193971037
              precision    recall  f1-score   support

           0       0.47      0.02      0.03      2096
           1       0.43      0.35      0.39      3697
           2       0.46      0.75      0.57      4358

    accuracy                           0.45     10151
   macro avg       0.45      0.37      0.33     1015

In [23]:
save_model(best_model, "best_model_area_1")

Best model saved as 'best_model_area_1.pkl'


# Train model with information from clustering

In [17]:
df = pd.read_pickle("pickles/clusters_kmeans_overall.pkl")
df.head()

Unnamed: 0,Vict Age,LAT,LON,AREA,Month OCC,Day OCC,hour,time_slot,cluster
0,47,34.0444,-118.2628,1,2,7,18,3,16
1,19,34.021,-118.3002,3,11,4,17,3,23
2,19,34.1576,-118.4387,9,3,3,20,3,23
3,28,34.0944,-118.3277,6,8,2,12,2,4
4,41,33.9467,-118.2463,18,12,3,23,3,3


In [25]:
# Using data from area 1
features = ["LAT", "LON", "Month OCC", "Day OCC", "cluster"]
X = df[features]
y = df["time_slot"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Logistic Regression model
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
lr.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = lr.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_pred, y_test))



Accuracy: 0.5179733917540782
              precision    recall  f1-score   support

           1       0.00      0.08      0.01      1645
           2       0.68      0.59      0.63     64227
           3       0.62      0.47      0.53     78369

    accuracy                           0.52    144241
   macro avg       0.43      0.38      0.39    144241
weighted avg       0.64      0.52      0.57    144241



In [23]:
best_model = train_time_slot_prediction(df, use_cluster=True)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits




Fitting 5 folds for each of 18 candidates, totalling 90 fits




Random Forest Results:
Accuracy: 0.5578233650626382
              precision    recall  f1-score   support

           0       0.70      0.13      0.22     29376
           1       0.60      0.67      0.64     55903
           2       0.51      0.67      0.58     58962

    accuracy                           0.56    144241
   macro avg       0.61      0.49      0.48    144241
weighted avg       0.59      0.56      0.53    144241

Gradient Boosting Results:
Accuracy: 0.7860663750251315
              precision    recall  f1-score   support

           0       0.81      0.78      0.79     29376
           1       0.76      0.65      0.70     55903
           2       0.79      0.92      0.85     58962

    accuracy                           0.79    144241
   macro avg       0.79      0.78      0.78    144241
weighted avg       0.78      0.79      0.78    144241

XGBoost Results:
Accuracy: 0.7859831809263663
              precision    recall  f1-score   support

           0       0.81      

In [24]:
save_model(best_model, "best_model_clustering")

Best model saved as 'best_model_clustering.pkl'
