In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning

Source data: Computer-Aided Dispatch data from EPD dispatch between January 1, 2014 and April 18, 2025.

### Steps for cleaning

1. Studying the format, entries, and each variable’s data type, inconsistencies, and formatting errors.
2. Recoding the dataset by: (1) stripping all columns with string values, (2) using the datetime package to convert the calltime variable to datetime objects, and (3) creating a new variable, agency, using boolean logic that indicates whether CAHOOTS or EPD responds to a call.
3. Conducting visual and statistical analysis of the dataset to make sure I understand variable distributions, as well as to find outliers and missing values.
4. Fixing identified outliers by: (1) determining if each outlier is a true outlier or an error in date entry, (2) removing/correcting outliers when appropriate, and (3) re-plotting distributions and/or computing updated summary statistics.
5. Doing a final check for consistency in the dataframe by: (1) checking that each variable is of the correct data type, (2) making sure missing values are intentional or properly handled, (3) using .describe to make sure that numerical variables are within the expected ranges, and (4) checking for consistent categorical and date/time values.

### Import CAD data from 2014-2025
Import each year's .csv file into a separate, corresponding DataFrame. Concatenate the individual DataFrames into the master DataFrame: `cad`.

In [2]:
# dtype overrides applied to every yearly file: the entries keyed 4, 16 and 18
# are read as `str` (presumably column positions -- confirm against the CSV header).
cad_dtypes = {4: str, 16: str, 18: str}


def read_cad_year(year):
    """Read one yearly CAD export, ``../data/class_data_<year>.csv``, into a DataFrame.

    Parameters
    ----------
    year : int
        Four-digit year that appears in the file name (e.g. 2014).

    Returns
    -------
    pandas.DataFrame
        The file contents, read with the shared ``cad_dtypes`` overrides.
    """
    return pd.read_csv(f"../data/class_data_{year}.csv", dtype=cad_dtypes)


# One DataFrame per year, 2014 through 2025. The per-year names are kept so any
# cell that refers to an individual year keeps working.
yearly_frames = [read_cad_year(year) for year in range(2014, 2026)]
(cad14, cad15, cad16, cad17, cad18, cad19,
 cad20, cad21, cad22, cad23, cad24, cad25) = yearly_frames

# Stack the yearly frames into one master frame with a fresh 0..n-1 index.
cad = pd.concat(yearly_frames, ignore_index=True)

#### Begin data cleaning steps by creating `clean`, a copy of the original DataFrame.

In [3]:
# Work on an independent (deep) copy so the raw `cad` frame is left untouched.
clean = cad.copy(deep=True)

### Recoding

#### Strip str variables of extra spaces/blanks.

In [4]:
# Trim leading/trailing whitespace from each text column, one column at a time.
for text_col in ('closecode', 'service', 'callsource', 'nature', 'closed_as',
                 'beat', 'primeunit', 'priority', 'zip'):
    clean.loc[:, text_col] = clean[text_col].str.strip()

#### Convert numbered variables to `int` values.

In [5]:
# Cast the identifier/count columns to `int`.
# `DataFrame.astype` returns a frame whose columns actually carry the new dtype.
# Writing the converted values back with `clean.loc[:, col] = ...` sets them into
# the existing column in place, so (pandas >= 2.0) a float64 column can stay
# float64 and the conversion silently has no effect.
clean = clean.astype({'inci_id': int, 'units_dispd': int, 'units_arrived': int})

#### Convert time-based variables to datetime objects.

In [6]:
# Parse the call timestamps (format "YYYY-MM-DD HH:MM:SS.ffffff") into datetimes.
# Plain column assignment replaces the column together with its dtype. Assigning
# through `clean.loc[:, 'calltime']` writes into the existing column in place, so
# (pandas >= 2.0) the column can remain `object` dtype instead of datetime64.
clean['calltime'] = pd.to_datetime(clean['calltime'], format="%Y-%m-%d %H:%M:%S.%f")

#### Create agency column using `primeunit`. If `primeunit` matches one of the CAHOOTS units, then agency is assigned to `CAHOOTS`, otherwise it is `EPD`.

In [7]:
# Unit identifiers treated as CAHOOTS; each alternative tolerates trailing whitespace.
cahoots_pattern = r"1J77\s*|3J79\s*|3J78\s*|3J77\s*|4J79\s*|3J81\s*|3J76\s*|2J28\s*|2J29\s*|CAHOOT\s*|CAHOT\s*|CAHO\s*"

def in_cahoots(unit):
    """Label a unit identifier as "CAHOOTS" or "EPD".

    Missing values (anything ``pd.isna`` reports as missing) yield ``np.nan``.
    Otherwise the value is converted to ``str`` and labelled "CAHOOTS" only when
    the *entire* string matches ``cahoots_pattern``; every other value is "EPD".
    """
    # Guard clause: propagate missing units as missing agency.
    if pd.isna(unit):
        return np.nan
    is_cahoots_unit = re.fullmatch(cahoots_pattern, str(unit)) is not None
    return "CAHOOTS" if is_cahoots_unit else "EPD"

# Label every row's prime unit element-wise, then store the labels as `agency`.
agency_labels = clean['primeunit'].map(in_cahoots)
clean.loc[:, 'agency'] = agency_labels

#### Create time-based variables in seconds, minutes, and hours to allow for easier analyses later on.

In [8]:
# Seconds between the two recorded offsets: `secs_to_arrv` minus `secs_to_disp`.
disp_to_arrv_secs = clean['secs_to_arrv'] - clean['secs_to_disp']
disp_to_arrv_mins = disp_to_arrv_secs / 60

clean['secs_disp_to_arrv'] = disp_to_arrv_secs
clean['mins_disp_to_arrv'] = disp_to_arrv_mins
# Dividing the minutes by 60 is the same two-step division (secs / 60 / 60).
clean['hrs_disp_to_arrv'] = disp_to_arrv_mins / 60

# Hour-scaled versions of the raw second counts.
for secs_col, hours_col in (('secs_to_disp', 'hours_to_disp'),
                            ('secs_to_arrv', 'hours_to_arrv')):
    clean[hours_col] = clean[secs_col] / 60 / 60

### Fix outliers & missing values

#### Remove rows with missing `secs_to_arrv`, `secs_to_disp`, `secs_to_close`, or `agency`. Keep only calls where at least one unit arrived.

In [9]:
# A row is kept only if all four of these columns are present ...
required_cols = ['secs_to_arrv', 'secs_to_disp', 'secs_to_close', 'agency']
has_required_values = clean[required_cols].notna().all(axis=1)
# ... and at least one unit arrived.
any_unit_arrived = clean['units_arrived'] > 0

clean_subset = clean[has_required_values & any_unit_arrived]

#### Remove duplicate row(s).

In [10]:
# Flag every repeat of an `inci_id` after its first occurrence and drop those rows.
is_repeat_incident = clean_subset.duplicated(subset=['inci_id'], keep='first')
idx_drop = clean_subset.index[is_repeat_incident]

clean_subset = clean_subset.drop(index=idx_drop)

#### Correct zipcode and replace empty values with `NA`.

In [11]:
# Rewrite the zip value '94701' as '97401'.
zip_is_transposed = clean_subset['zip'] == '94701'
clean_subset.loc[zip_is_transposed, 'zip'] = '97401'

# Empty-string zips become explicit missing values.
zip_is_blank = clean_subset['zip'] == ''
clean_subset.loc[zip_is_blank, 'zip'] = pd.NA

#### Replace empty closecodes with `NA`.

In [12]:
# Empty-string close codes become explicit missing values.
closecode_is_blank = clean_subset['closecode'] == ''
clean_subset.loc[closecode_is_blank, 'closecode'] = pd.NA

#### Subset to only non-negative `secs_to_arrv` and `secs_to_disp`.

In [13]:
# Keep rows where both second counts are zero or greater.
arrv_non_negative = clean_subset['secs_to_arrv'].ge(0)
disp_non_negative = clean_subset['secs_to_disp'].ge(0)
clean_subset = clean_subset[arrv_non_negative & disp_non_negative]

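#### Drop calls whose dispatch and arrival times (or arrival and close times) are identical, then keep dispatch-to-arrival durations that are non-negative and at most 24 hours.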

In [14]:
# Rows where two consecutive second counts are exactly equal are discarded.
disp_differs_from_arrv = clean_subset['secs_to_disp'].ne(clean_subset['secs_to_arrv'])
arrv_differs_from_close = clean_subset['secs_to_arrv'].ne(clean_subset['secs_to_close'])

# The derived duration must be non-negative and no longer than 24 hours.
duration_non_negative = clean_subset['secs_disp_to_arrv'].ge(0)
duration_within_24h = clean_subset['hrs_disp_to_arrv'].le(24)

clean_subset = clean_subset[
    disp_differs_from_arrv
    & arrv_differs_from_close
    & duration_non_negative
    & duration_within_24h
]

#### Subset to calls taking place between 2014 and 2024 (inclusive).

In [15]:
# Keep rows whose `yr` is 2024 or earlier.
year_in_range = clean_subset['yr'].le(2024)
clean_subset = clean_subset[year_in_range]

#### Subset to only those calls that *both* agencies can handle.

In [16]:
#clean_subset[clean_subset['agency'] == 'CAHOOTS']['nature'].unique()
#clean_subset[clean_subset['agency'] == 'EPD']['nature'].unique()

Assess the `nature` variable *visually* and subset the data to only the call types that both agencies can handle.

In [17]:
# Call types (values of the `nature` column) where EPD may be responding to
# CAHOOTS-like situations. The list is used as the allow-list for the `isin`
# filter applied to `clean_subset['nature']`.
cahoots_like_epd_calls = [
    "TRANSPORT", "SUBJECT DOWN", "ASSIST PUBLIC- POLICE", "CHECK WELFARE", "TRAFFIC HAZARD",
    "CRIMINAL TRESPASS", "DISPUTE", "FOUND SYRINGE", "DISORDERLY SUBJECT", "INTOXICATED SUBJECT",
    "SUICIDAL SUBJECT", "SUSPICIOUS CONDITIONS", "ASSAULT", "SUSPICIOUS SUBJECT", 
    "ASSIST FIRE DEPARTMENT", "DISORIENTED SUBJECT", "ASSIST OUTSIDE AGENCY", "LOUD NOISE",
    "HAZARD SAFETY", "MENTAL SUBJECT", "OVERDOSE", "DEATH MESSAGE", "ASSAULT WITH INJURY", 
    "HARASSMENT", "MOTOR VEH ACC NO INJURY", "MISSING JUVENILE", "EMERGENCY MESSAGE", 
    "DISORDERLY MEDICAL TRANSPORT", "FOUND CONTRABAND", "BEAT INFORMATION", "UNKNOWN PROBLEM", 
    "ACCIDENT BIKE", "ILLEGAL CAMPING", "SUBJECT SCREAMING", "DRUG INFO", "DISORDERLY JUVENILES",
    # NOTE(review): "RECKLESS DRIVIING" (double I) is listed in addition to
    # "RECKLESS DRIVING" further down -- presumably it mirrors a misspelling
    # present in the raw data; confirm before removing.
    "RECKLESS DRIVIING", "CRIMINAL MISCHIEF", "DOG BITE", "INFORMATION - POLICE", "NUDE SUBJECT",
    "MENTAL TRANSPORT", "INDECENT EXPOSURE", "INCOMPLETE CALL", "ATTEMPT TO LOCATE", "FOLLOW UP",
    "TRAFFIC COMPLAINT", "POISONING", "MISSING PERSON", "DISPUTE FAMILY", "INJURED SUBJECT",
    "THEFT", "ACCIDENT VEHICLE BIKE", "SEIZURES", "RESPIRATORY ARREST", "WALKAWAY", 
    "VIOLATION OF CITY ORDINANCE", "PATROL CHECK", "SEX ABUSE", "FRAUD", "OPEN CONTAINER",
    "PUBLIC INDECENCY", "ASSIST SHERIFFS OFFICE", "UNATTENDED CHILDREN", "ANIMAL CRUELTY", 
    "BURGLARY", "ANIMAL COMPLAINT", "INJURED ANIMAL", "THEFT FROM VEHICLE", "ROBBERY", 
    "PERSON STOP", "RAPE", "AUDIBLE ALARM", "ATTEMPT TO LOCATE DRUNK DRIVER", "SHOPLIFT 3050 PROGRAM",
    "BLOCKED SIDEWALK", "SUSPICIOUS VEHICLE", "CIVIL STANDBY", "VEHICLE/PEDESTRIAN CRASH", 
    "MOTOR VEH ACC UNKNOWN INJ", "CARDIAC ARREST", "JUVENILE PROBLEM", "DISABLED VEHICLE", 
    "DOWN LINE", "GUNSHOT WOUND", "LOCATE MISSING PERSON", "REQUEST COVER", "ARMED SUBJECT", 
    "UNAUTHORIZED USE OF VEHICLE", "CHILD ABUSE", "THEFT OF SERVICES", "RECKLESS BURNING", 
    "DOG AT LARGE", "DETOXIFICATION", "ILL SUBJECT", "LOST SUBJECT", "ASSIST OREGON STATE POLICE", 
    "ILLEGAL BURNING", "OFFICER SAFETY INFO", "LOCATION RUNAWAY", "PROWLER", "BLOCKED ALLEY", 
    "UNLAWFUL VEHICLE ENTRY", "CITIZEN CONTACT", "GAS LEAK", "HIT AND RUN", "RUNAWAY JUVENILE", 
    "STAB WOUND", "MENACING", "SHOTS FIRED", "SUICIDE", "RECKLESS DRIVING", "ANIMAL ATTACK/BITE", 
    "DECEASED SUBJECT", "REQUEST ASSISTANCE", "FLAGGED DOWN", "INFORMATION- BOTH PD AND FD", 
    "WATER RESCUE", "ANIMAL AT LARGE", "LIVESTOCK AT LARGE", "ELDERLY ABUSE", 
    # Call natures carrying a ", CAHOOTS" suffix
    "FOUND SYRINGE, CAHOOTS", "TRANSPORT, CAHOOTS", "PUBLIC ASSIST, CAHOOTS", 
    "SUICIDAL SUBJECT, CAHOOTS", "DISORDERLY SUBJECT, CAHOOTS", "CHECK WELFARE, CAHOOTS", 
    "ASSIST PD, CAHOOTS", "INTOXICATED SUBJECT, CAHOOTS", "DISORIENTED SUBJECT, CAHOOTS", 
    "ASSIST FD, CAHOOTS", "TRAFFIC HAZARD, CAHOOTS", "INFO / ATL, CAHOOTS",
    # Every entry below repeats a value already listed above. Repeats do not
    # change the result of a membership (`isin`) test; they are retained as-is
    # (original note: "if needed for consistency in external source").
    "CRIMINAL MISCHIEF", "CRIMINAL TRESPASS", "DEATH MESSAGE", "DISORDERLY JUVENILES",
    "DISORDERLY MEDICAL TRANSPORT", "DISORDERLY SUBJECT", "DISORIENTED SUBJECT", "DISPUTE", 
    "DOG BITE", "FOLLOW UP", "FOUND SYRINGE", "HARASSMENT", "INDECENT EXPOSURE", 
    "INJURED SUBJECT", "INTOXICATED SUBJECT", "LOUD NOISE", "MENTAL TRANSPORT", 
    "MOTOR VEH ACC NO INJURY", "OVERDOSE", "ROBBERY", "RUNAWAY JUVENILE", "SUBJECT DOWN", 
    "SUICIDAL SUBJECT", "SUICIDE", "SUSPICIOUS CONDITIONS", "SUSPICIOUS SUBJECT", "THEFT", 
    "TRAFFIC HAZARD", "TRANSPORT"
]

# Keep only the rows whose `nature` value appears in the call-type list.
is_shared_call_type = clean_subset['nature'].isin(cahoots_like_epd_calls)
clean_subset = clean_subset[is_shared_call_type]

### Quick checkpoint to see entries left in cleaned DataFrame

In [18]:
# Row counts before and after cleaning, and the share of raw rows retained.
n_clean_rows = len(clean_subset)
n_raw_rows = len(cad)
pct_used = n_clean_rows / n_raw_rows * 100

print(f'Percentage of CAD data used: {pct_used:.2f}%')
print(f'Number of entries in cleaned data: {n_clean_rows}')
print(f'Number of entries in raw CAD data: {n_raw_rows}')

Percentage of CAD data used: 32.56%
Number of entries in cleaned data: 484732
Number of entries in raw CAD data: 1488577


In [19]:
# Renumber the surviving rows 0..n-1 (discarding the old index) and take an
# independent copy as the final cleaned frame.
renumbered = clean_subset.reset_index(drop=True)
complete_clean = renumbered.copy()

In [26]:
# Display a column subset of the cleaned frame for a quick visual check.
preview_cols = ['yr', 'inci_id', 'calltime', 'nature', 'priority', 'secs_to_arrv', 'agency']
complete_clean.loc[:, preview_cols]

Unnamed: 0,yr,inci_id,calltime,nature,priority,secs_to_arrv,agency
0,2014,14000006,2014-01-01 00:05:17,DISPUTE,3,596.0,EPD
1,2014,14000011,2014-01-01 00:10:21,THEFT,3,380.0,EPD
2,2014,14000014,2014-01-01 00:12:38,ILLEGAL BURNING,1,1262.0,EPD
3,2014,14000051,2014-01-01 00:51:42,ASSAULT,3,623.0,EPD
4,2014,14000067,2014-01-01 01:16:15,DISPUTE,P,907.0,EPD
...,...,...,...,...,...,...,...
484727,2024,24349418,2024-12-31 22:40:25,ILLEGAL BURNING,5,720.0,EPD
484728,2024,24349438,2024-12-31 23:23:53,SUBJECT SCREAMING,3,257.0,EPD
484729,2024,24349443,2024-12-31 23:28:56,UNKNOWN PROBLEM,3,1016.0,EPD
484730,2024,24349461,2024-12-31 23:46:01,DISPUTE,3,1171.0,EPD


### Load complete, cleaned DataFrame into csv file named `cleaned_full_class_data.csv`

In [21]:
# Write the cleaned frame to disk without the row index.
cleaned_csv_path = '../data/cleaned_full_class_data.csv'
complete_clean.to_csv(cleaned_csv_path, index=False)