# Crime Classification – CS 9548 Project
**Goal:** Exploring Machine Learning Techniques for Image Classification

## Import Libraries

In [2]:
%pip install -r "requirements.txt"

Note: you may need to restart the kernel to use updated packages.


In [4]:
import kagglehub
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.utils import to_categorical


## Download and Extract Dataset

In [7]:
# Download latest version
import kagglehub
# Kaggle handle of the dataset to fetch; the returned value is printed below
# as the local location of the downloaded files.
DATASET_HANDLE = "odins0n/ucf-crime-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\jacki\.cache\kagglehub\datasets\odins0n\ucf-crime-dataset\versions\1


## Create DataFrame of Images

In [8]:
import pathlib
import pandas as pd

# Resolve the dataset root and its "Train" / "Test" split folders
data_dir = pathlib.Path(path)
train_dir, test_dir = (data_dir / split for split in ("Train", "Test"))

# Echo the resolved locations as a quick sanity check
print("Data dir:", data_dir)
print(train_dir, test_dir, sep="\n")

# Function to build dataframe
def build_image_df(root_dir):
    """Index a class-per-folder image directory as a DataFrame.

    Every ``*.png`` file located exactly one level below ``root_dir`` becomes
    one row; the name of the folder that contains the file is used as its
    label.

    Parameters
    ----------
    root_dir : str or pathlib.Path
        Directory whose immediate sub-folders each hold the images of one class.

    Returns
    -------
    pandas.DataFrame
        Columns ``image`` (file path as ``str``) and ``label`` (parent folder
        name), ordered by path. Both columns are present even when no image
        matched, so downstream ``df['label']`` lookups do not raise ``KeyError``.
    """
    root_dir = pathlib.Path(root_dir)

    # glob() gives no ordering guarantee; sort so the row order -- and therefore
    # any later seeded .sample() call -- is reproducible across machines.
    image_paths = sorted(root_dir.glob("*/*.png"))

    rows = [{"image": str(p), "label": p.parent.name} for p in image_paths]

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["image", "label"])


# Index both splits into DataFrames (one row per image file)
train, test = (build_image_df(split_dir) for split_dir in (train_dir, test_dir))

# Per-class image counts: training split first, then test split
print(train["label"].value_counts(), test["label"].value_counts(), sep="\n")

Data dir: C:\Users\jacki\.cache\kagglehub\datasets\odins0n\ucf-crime-dataset\versions\1
C:\Users\jacki\.cache\kagglehub\datasets\odins0n\ucf-crime-dataset\versions\1\Train
C:\Users\jacki\.cache\kagglehub\datasets\odins0n\ucf-crime-dataset\versions\1\Test
label
NormalVideos     947768
Stealing          44802
Robbery           41493
Burglary          39504
Arrest            26397
Shoplifting       24835
Fighting          24684
Arson             24421
RoadAccidents     23486
Abuse             19076
Explosion         18753
Vandalism         13626
Assault           10360
Shooting           7140
Name: count, dtype: int64
label
NormalVideos     64952
Burglary          7657
Shooting          7630
Shoplifting       7623
Explosion         6510
Arrest            3365
Arson             2793
RoadAccidents     2663
Assault           2657
Stealing          1984
Fighting          1231
Vandalism         1111
Robbery            835
Abuse              297
Name: count, dtype: int64


## Encode Label as Integer

In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the class-name -> integer mapping from the training labels only
le = LabelEncoder().fit(train["label"])
train["label_idx"] = le.transform(train["label"])

# Reuse the fitted encoder so test labels receive the same integer ids
test["label_idx"] = le.transform(test["label"])

# Report how many classes were found and which integer each one was assigned
num_classes = len(le.classes_)
print("Number of classes:", num_classes)
print("Class name vs label_idx:")
for label_id, class_name in enumerate(le.classes_):
    print(label_id, class_name)

Number of classes: 14
Class name vs label_idx:
0 Abuse
1 Arrest
2 Arson
3 Assault
4 Burglary
5 Explosion
6 Fighting
7 NormalVideos
8 RoadAccidents
9 Robbery
10 Shooting
11 Shoplifting
12 Stealing
13 Vandalism


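The class counts above show a severe imbalance: `NormalVideos` has 947,768 training images, while the smallest class (`Shooting`) has only 7,140. To mitigate this, we undersample `NormalVideos` and oversample the smaller classes (sampling with replacement) in the training set.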

## Under/Oversampling

In [None]:
import numpy as np

# Target sizes for rebalancing the training set
TARGET_NORMAL = 45000   # maximum size for NormalVideos undersampling
TARGET_OTHER  = 40000   # minimum size for other classes oversampling

# Undersample NormalVideos to at most TARGET_NORMAL rows
normal_mask = train['label'] == "NormalVideos"

# Sampling without replacement raises ValueError when n exceeds the number of
# available rows, so cap n at the size of the class.
normal_df = train[normal_mask].sample(
    n=min(TARGET_NORMAL, int(normal_mask.sum())),
    random_state=42,
)

# Oversample every other class up to at least TARGET_OTHER rows.
# balanced_dfs collects one frame per class, starting with NormalVideos.
balanced_dfs = [normal_df]

for cls in train['label'].unique():
    if cls == "NormalVideos":
        continue

    # Rows currently available for this class
    cls_df = train[train['label'] == cls]
    n_current = len(cls_df)

    # Classes already at or above the target are kept unchanged
    if n_current >= TARGET_OTHER:
        balanced_dfs.append(cls_df)

    else:
        # Top up with rows drawn with replacement; the added rows are exact
        # duplicates of existing rows (same image path), not new images.
        extra = cls_df.sample(
            n=TARGET_OTHER - n_current,
            replace=True,
            random_state=42
        )

        cls_balanced = pd.concat([cls_df, extra], ignore_index=True)
        balanced_dfs.append(cls_balanced)

# Put into new balanced training dataframe.
# NOTE: rows stay grouped class by class (no shuffle happens here); shuffle
# downstream if the consumer is sensitive to row order.
train_balanced = pd.concat(balanced_dfs).reset_index(drop=True)

# Recompute the integer labels with the already-fitted encoder
train_balanced['label_idx'] = le.transform(train_balanced['label'])

In [14]:
# Check the per-class row counts after rebalancing
balanced_counts = train_balanced["label"].value_counts()
print(balanced_counts)

label
NormalVideos     45000
Stealing         44802
Robbery          41493
Arson            40000
Arrest           40000
Abuse            40000
Burglary         40000
Assault          40000
Fighting         40000
Explosion        40000
RoadAccidents    40000
Shooting         40000
Shoplifting      40000
Vandalism        40000
Name: count, dtype: int64


In [15]:
# Preview the first five rows of the balanced training frame
train_balanced.head(n=5)

Unnamed: 0,image,label,label_idx
0,C:\Users\jacki\.cache\kagglehub\datasets\odins...,NormalVideos,7
1,C:\Users\jacki\.cache\kagglehub\datasets\odins...,NormalVideos,7
2,C:\Users\jacki\.cache\kagglehub\datasets\odins...,NormalVideos,7
3,C:\Users\jacki\.cache\kagglehub\datasets\odins...,NormalVideos,7
4,C:\Users\jacki\.cache\kagglehub\datasets\odins...,NormalVideos,7
