# Grouping Categorical Variables

## Setup

In [1]:
#import libraries
import pandas as pd
import numpy as np
import janitor 
from sentence_transformers import SentenceTransformer

In [2]:
#load the input data, keeping time_str as text and parsing both date columns
df = pd.read_csv(
    "../data/df_for_grouping.csv",
    dtype={"time_str": str},
    parse_dates=["date", "datetime"],
)

## Demographics

The `vict_sex` variable requires no grouping, as it has only three categories, none of which is low-frequency.

However, there are several low-frequency values in `vict_descent`. Knowing that this will be a feature in my model, and to avoid anonymisation issues, I will combine some of the lower-frequency ethnicities. I have chosen to combine all groups of Asian descent into a single "Asian" category, and to reassign the remaining ethnicities (all of which represented less than 1% of the dataset) to "Other/Unknown". This decision was made with recognition of the sensitivities involved and acknowledgement that different Asian subgroups — such as Indian, Vietnamese and Korean — have distinct cultural/socioeconomic backgrounds. For another type of project, this aggregation would likely be inappropriate (UK Gov, 2020), but I deemed that grouping was most suitable to ensure the robustness of my model and avoid overly granular categories that could lead to unreliable output.

In [3]:
#descents that are merged into the single "Asian" group
asian_descents = [
    "Other Asian",
    "Chinese",
    "Cambodian",
    "Filipino",
    "Japanese",
    "Korean",
    "Laotian",
    "Vietnamese",
    "Asian Indian",
]

#descents that are recoded to "Other/Unknown"
other_descents = [
    "Guamanian",
    "American Indian/Alaskan Native",
    "Pacific Islander",
    "Samoan",
    "Hawaiian",
    "Other",
    "Unknown",
]

#build the descent -> group lookup from the two lists
vict_descent_map_2 = {
    **dict.fromkeys(asian_descents, "Asian"),
    **dict.fromkeys(other_descents, "Other/Unknown"),
}

#recode listed descents; values not in the lookup (including missing values) are left unchanged
df["vict_descent"] = df["vict_descent"].map(
    lambda descent: vict_descent_map_2.get(descent, descent)
)

## Weapon Type

There are 80 unique weapon types in the dataset. I feel that this is a small enough number to handle mostly manually, so I will define categories that I consider to be logical. For efficiency, I will ask ChatGPT to categorise each of the weapon types into my pre-defined categories (OpenAI, 2025a).

In [4]:
#weapon descriptions, listed under the category each one is assigned to
weapons_by_category = {
    "Bodily Force": [
        "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",
        "PHYSICAL PRESENCE",
    ],
    "Verbal Threat": [
        "VERBAL THREAT",
        "BOMB THREAT",
        "DEMAND NOTE",
    ],
    "Vehicle": [
        "VEHICLE",
    ],
    "Gun/Firearm": [
        "HAND GUN",
        "SEMI-AUTOMATIC PISTOL",
        "UNKNOWN FIREARM",
        "REVOLVER",
        "AIR PISTOL/REVOLVER/RIFLE/BB GUN",
        "SIMULATED GUN",
        "OTHER FIREARM",
        "SHOTGUN",
        "RIFLE",
        "STUN GUN",
        "TOY GUN",
        "HECKLER & KOCH 93 SEMIAUTOMATIC ASSAULT RIFLE",
        "ASSAULT WEAPON/UZI/AK47/ETC",
        "AUTOMATIC WEAPON/SUB-MACHINE GUN",
        "SAWED OFF RIFLE/SHOTGUN",
        "STARTER PISTOL/REVOLVER",
        "SEMI-AUTOMATIC RIFLE",
        "UNK TYPE SEMIAUTOMATIC ASSAULT RIFLE",
        "UZI SEMIAUTOMATIC ASSAULT RIFLE",
        "RELIC FIREARM",
        "HECKLER & KOCH 91 SEMIAUTOMATIC ASSAULT RIFLE",
        "ANTIQUE FIREARM",
        "MAC-10 SEMIAUTOMATIC ASSAULT WEAPON",
        "MAC-11 SEMIAUTOMATIC ASSAULT WEAPON",
        "M1-1 SEMIAUTOMATIC ASSAULT RIFLE",
        "M-14 SEMIAUTOMATIC ASSAULT RIFLE",
    ],
    "Knife/Blade/Sharp Object": [
        "KNIFE WITH BLADE 6INCHES OR LESS",
        "OTHER KNIFE",
        "FOLDING KNIFE",
        "KITCHEN KNIFE",
        "KNIFE WITH BLADE OVER 6 INCHES IN LENGTH",
        "OTHER CUTTING INSTRUMENT",
        "SCREWDRIVER",
        "MACHETE",
        "UNKNOWN TYPE CUTTING INSTRUMENT",
        "SCISSORS",
        "SWITCH BLADE",
        "AXE",
        "RAZOR BLADE",
        "SWORD",
        "RAZOR",
        "ICE PICK",
        "DIRK/DAGGER",
        "CLEAVER",
        "BOWIE KNIFE",
        "STRAIGHT RAZOR",
        "SYRINGE",
    ],
    "Blunt/Hitting Object": [
        "BOTTLE",
        "STICK",
        "ROCK/THROWN OBJECT",
        "CLUB/BAT",
        "BLUNT INSTRUMENT",
        "PIPE/METAL PIPE",
        "BELT FLAILING INSTRUMENT/CHAIN",
        "HAMMER",
        "CONCRETE BLOCK/BRICK",
        "FIXED OBJECT",
        "BOARD",
        "GLASS",
        "BRASS KNUCKLES",
        "TIRE IRON",
        "BLACKJACK",
        "MARTIAL ARTS WEAPONS",
    ],
    "Burning/Toxic Substance": [
        "MACE/PEPPER SPRAY",
        "FIRE",
        "CAUSTIC CHEMICAL/POISON",
        "SCALDING LIQUID",
        "EXPLOXIVE DEVICE",
    ],
    "Other/Unknown/No Weapon Used": [
        "UNKNOWN WEAPON/OTHER WEAPON",
        "LIQUOR/DRUGS",
        "ROPE/LIGATURE",
        "DOG/ANIMAL (SIC ANIMAL ON)",
        "BOW AND ARROW",
    ],
}

#flatten into a weapon description -> category lookup
weapon_map = {
    weapon: category
    for category, weapons in weapons_by_category.items()
    for weapon in weapons
}

#recode weapon descriptions; missing or unlisted descriptions fall into the catch-all category
mapped_weapons = df["weapon_type"].map(weapon_map)
df["weapon_group"] = mapped_weapons.fillna("Other/Unknown/No Weapon Used")

#the raw column is no longer needed once the grouped column exists
df.drop(columns=["weapon_type"], inplace=True)

I edited the `weapon_map` dictionary slightly to tweak the decisions made by ChatGPT, to ensure that the categorisation was logical, e.g. changing "SYRINGE" from "Burning/Toxic Substance" to "Knife/Blade/Sharp Object".

## Crime Type

There are 142 crime types in the dataset, which I feel is too many to deal with completely manually, but I have noticed that many of them have repeating words (e.g. "THEFT"), so I will write a function to group them by keyword.

In [5]:
#define function to group crimes
def crime_grouping(crime):
    """Assign a crime description to a broad crime group by keyword.

    The description is upper-cased and tested against each keyword list in
    order; the first list with a keyword appearing anywhere in the description
    decides the group.

    Parameters:
        crime: crime description string, or a missing value.

    Returns:
        The group label as a string. Missing values and descriptions that
        match no keyword return "Other".
    """
    #ordered (group, keywords) rules - earlier rules take precedence over later ones
    keyword_rules = (
        ("Assault/Violence", ("ASSAULT", "BRANDISH", "SHOTS", "BATTERY", "BOMB", "HOMICIDE", "MANSLAUGHTER", "LYNCHING", "KIDNAPPING", "IMPRISONMENT", "TRAFFICKING")),
        ("Theft-Related", ("THEFT", "BURGLARY", "ROBBERY", "STOLEN", "EXTORTION", "PICKPOCKET", "SNATCHING", "BUNCO", "FRAUD", "COUNTERFEIT")),
        ("Property Damage", ("VANDALISM", "ARSON")),
        ("Public Order/Threatening Behaviour", ("VIOLATION", "TRESPASSING", "DISTURBING", "CONTEMPT", "THROWING", "RESISTING", "STALKING", "PROWLER", "THREAT")),
        ("Sexual Offence", ("LEWD", "SEX", "RAPE", "PENETRATION", "INDECENT", "COPULATION", "PEEPING", "PIMPING", "AGNST")),
    )

    if pd.isna(crime):
        return "Other"

    description = crime.upper()
    for group, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in description:
                return group
    return "Other"

#derive the crime group for every row, then discard the raw description column
crime_groups = df["crime_type"].map(crime_grouping)
df["crime_group"] = crime_groups.fillna("Other")
df.drop(columns=["crime_type"], inplace=True)

There is one crime type that will likely be extremely relevant to my model, but for which a definition is not provided in the metadata: “CRM AGNST CHLD (13 OR UNDER) (14‑15 & SUSP 10 YRS OLDER)”. I asked ChatGPT to find a definition for this crime type; it found that this refers to 'a class of sexual offences against minors' (OpenAI, 2025b). 'In California, the term “crime against a child” typically refers to lewd or lascivious acts on a minor' (Legal Clarity California, 2024). As such, I combined this into the "Sexual Offence" category.

I deemed two categories to be low-frequency: "Kidnapping/Trafficking" and "Murder/Manslaughter". Both contained fewer than 5,000 instances (0.2% of the dataset), so I combined them into "Assault/Violence". In a different context, it would be useful to retain the granularity of these categories, but for the purposes of my machine learning model it is better to combine them to reduce dimensionality.

## Premises Type

There are 319 premises types, with very little possibility for grouping using the same methods as above, as the vast majority have unique names with few repeating words. As such, I decided to use semantic similarity clustering to create meaningful categories for the model. I used a sentence transformer model (all-MiniLM-L6-v2) to group premises types into 12 predefined categories. The model did an acceptable job, but I followed up the clustering with manual corrections to ensure that the final groupings were logical and appropriate. For these corrections, I used the same keyword-matching approach as I did to categorise `crime_type`, which allowed me to retain human oversight whilst saving time.

In [6]:
#define preferred clusters
categories = [
    "Residence/Private Outdoor Space",
    "Street/Public Outdoor Space",
    "Transport Hub/Vehicle",
    "Restaurant/Eatery",
    "Store/Mall/Business",
    "Education",
    "Public Services/Healthcare",
    "Place of Worship",
    "Leisure/Entertainment/Sport",
    "Online",
    "Financial",
    "Other",
]

#embed every distinct premises description and every category label
model = SentenceTransformer("all-MiniLM-L6-v2")
unique_premises = df["premises_type"].dropna().unique()
premises_embeddings = model.encode(unique_premises)
category_embeddings = model.encode(categories)

#assign each premises description to the category with the highest dot-product score
type_clusters = {
    premise: categories[np.argmax(np.dot(embedding, category_embeddings.T))]
    for premise, embedding in zip(unique_premises, premises_embeddings)
}

#rows with a missing premises_type get no group at this stage
df["premises_group"] = df["premises_type"].map(type_clusters)

#define function to regroup incorrect clusters
def premises_grouping(premises, current_group):
    """Override the assigned premises group when the description contains a known keyword.

    The description is upper-cased and tested against each keyword list in
    order; the first list with a keyword appearing anywhere in the description
    decides the group.

    Parameters:
        premises: premises description string, or a missing value.
        current_group: the group already assigned to this description.

    Returns:
        "Other" for a missing description, the group of the first matching
        keyword list otherwise, or current_group unchanged when nothing matches.
    """
    #ordered (group, keywords) rules - earlier rules take precedence over later ones
    override_rules = (
        ("Financial", ("BANK",)),
        ("Store/Mall/Business", ("PUBLIC STORAGE", "DIY", "VALET", "OFFICE", "RADIO", "FACTORY", "MARKET", "OTHER BUSINESS", "CONNECTION", "SALES", "BMW", "CAR WASH", "GROVE", "EQUIPMENT", "COURIER")),
        ("Residence/Private Outdoor Space", ("HOME", "DRIVEWAY", "PATIO", "PORCH", "FOSTER", "GARAGE", "MOBILE", "BALCONY", "PROJECT")),
        ("Public Services/Healthcare", ("FIRE", "SEWAGE", "CLINIC", "LIBRARY", "HOSPITAL", "MORTUARY", "HOSPICE", "ENERGY", "CARE", "WATER", "JAIL", "POLICE", "DENTAL", "RECYCLING")),
        ("Transport Hub/Vehicle", ("HARBOR", "LINE", "PARKING", "TRAM", "AIRCRAFT", "CHARTER", "MTA")),
        ("Leisure/Entertainment/Sport", ("RINK", "BASKETBALL", "ARCADE", "COCKTAIL", "MUSEUM", "STAPLES", "STADIUM", "BEVERLY", "VACATION", "HOTEL", "MOTEL", "BOWLING")),
        ("Street/Public Outdoor Space", ("ALLEY", "TRASH", "TUNNEL", "PAYPHONE", "FREEWAY", "GATHERING", "TRANSIENT", "BEACH", "RESERVOIR", "RIVER", "BRIDGE", "OTHER/OUTSIDE")),
        ("Restaurant/Eatery", ("COFFEE",)),
        ("Other", ("SWAP", "ESCALATOR", "STAIR", "ELEVATOR", "ABATEMENT", "TACTICAL", "RETIRED", "SHED")),
    )

    if pd.isna(premises):
        return "Other"

    description = premises.upper()
    for group, keywords in override_rules:
        for keyword in keywords:
            if keyword in description:
                return group

    #no keyword matched, so the existing assignment stands
    return current_group

#apply function to dataframe
#the two input columns are iterated directly rather than via df.apply(axis=1),
#which constructs a Series for every row and is much slower on a large dataframe;
#the values passed to premises_grouping, and therefore the results, are the same
corrected_groups = [
    premises_grouping(premises, current_group)
    for premises, current_group in zip(df["premises_type"], df["premises_group"])
]
#dtype=object keeps the column well-defined even when the dataframe has no rows
df["premises_group"] = pd.Series(corrected_groups, index=df.index, dtype=object).fillna("Other")
df.drop("premises_type", axis=1, inplace=True)

In [8]:
#preview the first rows of the dataframe after grouping
df.head()

Unnamed: 0,date,time_str,area,vict_age,vict_sex,vict_descent,lat,lon,time,datetime,weapon_group,crime_group,premises_group
0,2010-02-20,1350,Newton,48,Male,Hispanic/Latin/Mexican,33.9825,-118.2695,13:50:00,2010-02-20 13:50:00,Other/Unknown/No Weapon Used,Public Order/Threatening Behaviour,Residence/Private Outdoor Space
1,2010-01-05,150,Hollywood,47,Female,White,34.1016,-118.3295,01:50:00,2010-01-05 01:50:00,Gun/Firearm,Public Order/Threatening Behaviour,Street/Public Outdoor Space
2,2010-01-02,2100,Central,47,Female,Hispanic/Latin/Mexican,34.0387,-118.2488,21:00:00,2010-01-02 21:00:00,Bodily Force,Sexual Offence,Street/Public Outdoor Space
3,2010-01-04,1650,Central,23,Male,Black,34.048,-118.2577,16:50:00,2010-01-04 16:50:00,Other/Unknown/No Weapon Used,Theft-Related,Store/Mall/Business
4,2010-01-07,2005,Central,46,Male,Hispanic/Latin/Mexican,34.0389,-118.2643,20:05:00,2010-01-07 20:05:00,Other/Unknown/No Weapon Used,Theft-Related,Street/Public Outdoor Space


## Dataframe Export

In [7]:
#NOTE(review): the export below is commented out, so running this notebook writes no file;
#uncomment it to save the grouped dataframe - confirm whether a later step expects this file
# df.to_csv("../data/df_after_grouping.csv", index=False, encoding="utf-8")