In [14]:
%pip install xgboost imblearn

Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [16]:
data = pd.read_csv("datasets/cleaned_crime_data.csv")

In [17]:
# Predict on past data from 2020 to 2024
data = data.dropna(ignore_index=False)
data = data[data["Year"] <= 2024]
data

Unnamed: 0,ID,Case Number,Date,Primary Type,Description,Arrest,Domestic,District,Ward,Community Area,...,Month,Latitude,Longitude,primary_type_count,total_crimes_per_yr,monthly_crime_count,generalized_loc,Lat_round,Lon_round,location_crime_count
2,13203321,JG415333,2023-09-06 17:00:00,CRIMINAL DAMAGE,TO VEHICLE,False,False,1.0,42.0,32.0,...,9,41.886018,-87.633938,141930,246748,21305,Residential,41.886,-87.634,278.0
3,13204489,JG416325,2023-09-06 11:00:00,THEFT,OVER $500,False,False,1.0,4.0,32.0,...,9,41.871835,-87.626151,268613,246748,21305,Residential,41.872,-87.626,141.0
4,12419690,JE295655,2021-07-07 10:30:00,SEX OFFENSE,SEXUAL EXPLOITATION OF A CHILD,False,False,5.0,10.0,54.0,...,7,41.655116,-87.594883,6241,195093,18040,Residential,41.655,-87.595,65.0
5,12729745,JF279458,2022-06-14 14:47:00,ROBBERY,ATTEMPT STRONG ARM - NO WEAPON,True,False,16.0,30.0,15.0,...,6,41.945232,-87.766735,46438,224691,19472,Commercial,41.945,-87.767,61.0
6,12835559,JF406130,2022-09-21 22:00:00,MOTOR VEHICLE THEFT,AUTOMOBILE,True,False,3.0,6.0,69.0,...,9,41.769347,-87.615008,97077,224691,21207,Other,41.769,-87.615,168.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1160266,13805547,JJ217844,2024-10-16 12:00:00,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,False,False,25.0,30.0,19.0,...,10,41.926521,-87.769726,90187,240458,20926,Residential,41.927,-87.770,18.0
1160368,13805658,JJ218028,2024-12-01 00:00:00,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,False,False,19.0,47.0,6.0,...,12,41.955088,-87.669072,90187,240458,18073,Other,41.955,-87.669,80.0
1160413,13805913,JJ217745,2024-10-22 14:50:00,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,False,False,19.0,44.0,6.0,...,10,41.936778,-87.648107,90187,240458,20926,Commercial,41.937,-87.648,239.0
1160441,13805552,JJ217956,2024-11-15 12:00:00,DECEPTIVE PRACTICE,FORGERY,False,False,2.0,3.0,38.0,...,11,41.819360,-87.615176,90187,240458,18317,Residential,41.819,-87.615,30.0


Feature Selection

1. Use Label Encoder to handle categorical attributes
2. Use standard z-score to normalize data


In [18]:
# Encode categorical attributes
le = LabelEncoder()
data["primary_type_encoded"] = le.fit_transform(data["Primary Type"])
data["generalized_loc_encoded"] = le.fit_transform(data["generalized_loc"])
data["Arrest"] = data["Arrest"].astype(int)
data["Domestic"] = data["Domestic"].astype(int)
# data
primary_type_dict = dict(zip(data["Primary Type"], data["primary_type_encoded"]))
generalized_loc_dict = dict(zip(data["generalized_loc"], data["generalized_loc_encoded"]))
print(f"Primary Type Dict: {primary_type_dict}")
print(f"Generalized Type Dict: {generalized_loc_dict}")

Primary Type Dict: {'CRIMINAL DAMAGE': 5, 'THEFT': 22, 'SEX OFFENSE': 20, 'ROBBERY': 19, 'MOTOR VEHICLE THEFT': 14, 'BURGLARY': 3, 'BATTERY': 2, 'HOMICIDE': 9, 'CRIMINAL SEXUAL ASSAULT': 6, 'OFFENSE INVOLVING CHILDREN': 16, 'WEAPONS VIOLATION': 23, 'DECEPTIVE PRACTICE': 8, 'STALKING': 21, 'CRIMINAL TRESPASS': 7, 'ASSAULT': 1, 'PROSTITUTION': 17, 'NARCOTICS': 15, 'KIDNAPPING': 13, 'ARSON': 0, 'INTERFERENCE WITH PUBLIC OFFICER': 11, 'PUBLIC PEACE VIOLATION': 18, 'INTIMIDATION': 12, 'HUMAN TRAFFICKING': 10, 'CRIM SEXUAL ASSAULT': 4}
Generalized Type Dict: {'Residential': 5, 'Commercial': 1, 'Other': 3, 'Street/Outdoor': 6, 'Vehicle': 7, 'Public Transportation': 4, 'Institutional': 2, 'Airport': 0}


In [19]:
# Select X and y
X = data[['Year', 'District', 'generalized_loc_encoded', 'Community Area', 'Month', "Arrest", 'Domestic', 'location_crime_count', 'primary_type_count']].dropna()
# Normalize X before training
scaler = StandardScaler()
X = scaler.fit_transform(X)

y = data["primary_type_encoded"].dropna().to_numpy()

# Compare shape of both to see if they match
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (1089710, 9)
Shape of y: (1089710,)


Train for each model

In [20]:
# Function to make sure that the folder exists or else we would make a new one
def ensure_path(path):
    if not os.path.exists(path):
        os.mkdir(path)

In [21]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

In [22]:
# Create a map of the model name and the model

# Configurations for Random Forest, DecisionTree, and XGBoost. I set the configs to be similar so we can better compare the results of those models
MAX_DEPTH = 30
MAX_LEAF = 2000
IMPURITY = 0.001 #A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
LEARNING_RATE = 0.1
MIN_SAMPLES_SPLIT = 10


models = {
    "KNNClassifier": KNeighborsClassifier(n_neighbors=5, metric="cityblock"),
    "DecisionTreeClassifier": DecisionTreeClassifier(criterion="entropy", max_depth=MAX_DEPTH, max_leaf_nodes=MAX_LEAF, min_impurity_decrease=IMPURITY, random_state=42, min_samples_split=MIN_SAMPLES_SPLIT, class_weight=class_weight_dict),
    "RandomForestClassifier": RandomForestClassifier(criterion="entropy", max_depth=MAX_DEPTH, max_leaf_nodes=MAX_LEAF, min_impurity_decrease=IMPURITY, random_state=42, min_samples_split=MIN_SAMPLES_SPLIT, class_weight=class_weight_dict),
    "XGBoostClassifier": XGBClassifier(max_depth=MAX_DEPTH, max_leaves=MAX_LEAF, learning_rate=LEARNING_RATE, objective='multi:softmax')
}

model_dirs = {
    "KNNClassifier": "models/KNN/",
    "DecisionTreeClassifier": "models/Decision_Tree/",
    "RandomForestClassifier": "models/Random_Forest/",
    "XGBoostClassifier": "models/XGBoost/"
}

Training each model for each fold

In [23]:

for model_name, model in models.items():
    print(f"\n*** TRAINING {model_name} ***")
    folder = model_dirs[model_name]
    ensure_path(folder)
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y)

    # Train the model
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print(f"Prediction: {y_hat[:10]}")
    print(f"Actual: {y_test[:10]}")
    
    # Evaluate model metrics
    print("\n*** EVALUATING ***")
    

    # Outputs the result
    print(classification_report(y_test, y_hat))

    # Save the model 
    file_name = os.path.join(folder, f"{model_name.lower()}.bin")
    joblib.dump(model, file_name)
    print(f"Saved {model_name} to {folder}")





*** TRAINING KNNClassifier ***
Prediction: [19  1 16 22  3 14  2 23 22  1]
Actual: [19  1 20 22  3 14  2 19 22  1]

*** EVALUATING ***
              precision    recall  f1-score   support

           0       0.26      0.28      0.27       506
           1       0.88      0.92      0.90     21032
           2       0.99      0.99      0.99     42516
           3       0.68      0.88      0.76      7733
           4       0.00      0.00      0.00        15
           5       0.99      0.97      0.98     27016
           6       0.34      0.35      0.35      1427
           7       0.61      0.54      0.58      4274
           8       0.85      0.87      0.86     16091
           9       0.40      0.33      0.36       713
          10       0.00      0.00      0.00        10
          11       0.48      0.30      0.37       526
          12       0.41      0.09      0.14       170
          13       0.16      0.03      0.05       111
          14       0.88      0.85      0.87     18520

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction: [19  2  1 22 14  3 14  2  5  5]
Actual: [19  2  1 22 14  3 14  2  5  5]

*** EVALUATING ***
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       506
           1       1.00      1.00      1.00     21032
           2       1.00      1.00      1.00     42516
           3       1.00      1.00      1.00      7733
           4       1.00      1.00      1.00        15
           5       1.00      1.00      1.00     27016
           6       1.00      1.00      1.00      1427
           7       1.00      1.00      1.00      4274
           8       1.00      1.00      1.00     16091
           9       1.00      1.00      1.00       712
          10       1.00      1.00      1.00        10
          11       1.00      1.00      1.00       526
          12       1.00      1.00      1.00       170
          13       1.00      1.00      1.00       111
          14       1.00      1.00      1.00     18520
          15       1.00      1.