In [82]:
import pandas as pd
pd.set_option('display.max_rows', None)

encounters_df = pd.read_parquet('ice_data/proc_data/encounters.parquet')

In [31]:
NUM_ROWS = encounters_df.shape[0]
NUM_COLS = encounters_df.shape[1]

print("num rows: %d" % NUM_ROWS)
print("num cols: %d" % NUM_COLS)
print("Column headers: %s" % encounters_df.columns)

num rows: 1193516
num cols: 20
Column headers: Index(['Event Date', 'Responsible AOR', 'Responsible Site', 'Lead Event Type',
       'Lead Source', 'Event Type', 'Final Program', 'Encounter Criminality',
       'Processing Disposition', 'Case Status', 'Case Category',
       'Departed Date', 'Departure Country', 'Final Order Yes No',
       'Final Order Date', 'Birth Year', 'Citizenship Country', 'Gender',
       'Event Landmark', 'Unique Identifier'],
      dtype='object')


In [None]:
# Event Date
# ~0% missing values/invalid dates. No cleaning required.
# GOOD
event_dates = encounters_df['Event Date']

valid_dates_mask = (pd.to_datetime('2023-9-1') <= event_dates) & (event_dates <= pd.to_datetime('2025-7-31'))
print("%f%% valid dates" % (100 * valid_dates_mask.size / NUM_ROWS))
encounters_df.loc[~valid_dates_mask, 'Event Date'] = pd.NaT
missingPct = (event_dates.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)

100.000000% valid dates
0.000084% missing values


In [None]:
# Responsible AOR
# ~0% missing, no duplicates
# GOOD
responsible_AOR = encounters_df['Responsible AOR']

missingPct = (responsible_AOR.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % responsible_AOR.unique().size)
print(responsible_AOR.value_counts())

0.122747% missing values
27 unique values
Responsible AOR
HQ Area of Responsibility                285900
Miami Area of Responsibility             134041
New York City Area of Responsibility      88002
Chicago Area of Responsibility            68394
Dallas Area of Responsibility             60670
Atlanta Area of Responsibility            58911
New Orleans Area of Responsibility        50109
San Antonio Area of Responsibility        43393
Houston Area of Responsibility            42487
St. Paul Area of Responsibility           32750
Boston Area of Responsibility             31383
Newark Area of Responsibility             28068
Seattle Area of Responsibility            26292
Denver Area of Responsibility             25729
Phoenix Area of Responsibility            25544
San Francisco Area of Responsibility      24486
Salt Lake City Area of Responsibility     24182
Los Angeles Area of Responsibility        24119
Washington Area of Responsibility         21892
Harlingen Area of Responsibili

In [83]:
# Responsible Site
# No null values
# GOOD

responsible_site = encounters_df['Responsible Site']

print("responsible site null values: %s" % responsible_site.isnull().sum())
print("%d unique values" % responsible_site.unique().size)
print(responsible_site.value_counts())

# Assuming "... SUB-OFFICE" and "ERO - ... Sub Office" are the same
responsible_site.replace('SACRAMENTO, CA, SUB-OFFICE', 'ERO - Sacramento, CA Sub Office', inplace=True)
responsible_site.replace('SAN BERNADINO, CA, SUB-OFFICE', 'ERO - San Bernardino, CA Sub Office', inplace=True)  # Also fixes spelling
responsible_site.replace('SANTA ANA, CA, SUB-OFFICE', 'ERO - Santa Ana, CA Sub-Office', inplace=True)
responsible_site.replace('ST. LOUIS, MO, SUB-OFFICE', 'ERO - St. Louis, MO Sub-Office', inplace=True)
responsible_site.replace('SAVANNAH, GA, SUB-OFFICE', 'ERO - Savannah, GA Sub-Office', inplace=True)
responsible_site.replace('LOUISVILLE, KY, SUB-OFFICE', 'ERO - Louisville, KY Sub-Office', inplace=True)
responsible_site.replace('MILWAUKEE, WI, SUB-OFFICE', 'ERO - Milwaukee, WI Sub-Office', inplace=True)
responsible_site.replace('FRESNO, CA, SUB-OFFICE', 'ERO - Fresno, CA Sub Office', inplace=True)
responsible_site.replace('YAKIMA, WA, SUB-OFFICE', 'ERO - Yakima, WA Sub-Office', inplace=True)
responsible_site.replace('GRAND JUNCTION, CO, SUB-OFFICE', 'ERO - Grand Junction, CO Sub-Office', inplace=True)
responsible_site.replace('WICHITA, KS, SUB-OFFICE', 'ERO - Wichita, KS Sub-Office', inplace=True)
responsible_site.replace('MEDFORD, OR, SUB-OFFICE', 'ERO - Medford, OR Sub Office', inplace=True)
responsible_site.replace('BAKERSFIELD CA IHP', 'ERO - Bakersfield, CA IHP Sub Office', inplace=True)
responsible_site.replace('RALEIGH/DURHAM, NC, SUB-OFFICE', 'ERO - Raleigh/Durham, NC Sub-Office', inplace=True)

print("num categories in responsible sites after cleaning: %s" % responsible_site.nunique())
# print(responsible_site.value_counts())


responsible site null values: 0
403 unique values
Responsible Site
ERO - Pacific Enforcement Response Center                          277384
VARRICK, NY, SERVICE PROCESSING CENTER (DOCKET CONTROL OFFICE)      55639
DALLAS, TX, DOCKET CONTROL OFFICE                                   40020
ERO - San Antonio, TX Field Office                                  33057
ERO - Montgomery County, TX Sub Office                              32800
ATLANTA, GA, DOCKET CONTROL OFFICE                                  26760
ERO - Bradenton, FL Sub Office                                      25739
KROME, MIAMI, FL, DOCKET CONTROL OFFICE                             24629
NEWARK, NJ, DOCKET CONTROL OFFICE                                   22922
NEW YORK, NY, DOCKET CONTROL OFFICE                                 20338
MIAMI, FL, DOCKET CONTROL OFFICE                                    18291
CHICAGO, IL, DOCKET CONTROL OFFICE                                  18140
PHOENIX, AZ, DOCKET CONTROL OFFICE           

In [None]:
# Lead Event Type
# ~90% are "Not applicable"
# POOR
# lead_event_type = encounters_df['Lead Event Type']
# print(lead_event_type.value_counts())

Lead Event Type
Not applicable                        1024908
CAP Local                              101448
287(g) Program Activities (Lead)        17925
Fugitive Operations (Lead)              13424
CAP Federal                             12456
Non-Detained Docket Control             11594
CAP State                                7133
Alternatives to Detention                3114
ERO Criminal Prosecutions                1083
Detained Docket Control                   284
Joint Criminal Alien Response Team        147
Name: count, dtype: int64


In [None]:
# Lead Source
# Categorical 82% missing, 41 unique
# note: these are NOT the same 82% missing departed date (i.e., not deported yet)
# POOR
lead_source = encounters_df['Lead Source']
missingPct = (lead_source.isnull().sum() / len(lead_source)) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % lead_source.unique().size)
pct_missing_departed_date_and_lead = 100 * (encounters_df['Departed Date'].isnull() | encounters_df['Lead Source'].isnull()).sum() / NUM_ROWS
print("%f%% missing both departed date and lead source" % pct_missing_departed_date_and_lead)
# print(lead_source.value_counts())

82.837013% missing values
41 unique values
96.022760% missing both departed date and country


In [6]:
# Event Type
# 46 categories, ~0% missing
# GOOD
event_type = encounters_df['Event Type']
missingPct = (event_type.isnull().sum() / len(event_type)) * 100
print("%f%% entries are 'Unknown'" % (100 * event_type.value_counts()['Unknown'] / len(event_type)))
print("%f%% missing values" % missingPct)
print("%d unique values" % event_type.unique().size)
# print(event_type.value_counts())

0.009635% entries are 'Unknown'
0.000000% missing values
46 unique values


In [12]:
# Final Program
# 0% missing, 14 categories
# GOOD
final_program = encounters_df['Final Program']
missingPct = (final_program.isnull().sum() / len(final_program)) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % final_program.unique().size)
# print(final_program.value_counts())

final_program.replace('287g Task Force', '287G Program', inplace=True)
final_program.replace('Joint Criminal Alien Response Team', 'Mobile Criminal Alien Team', inplace=True)

0.000000% missing values
14 unique values


In [None]:
# Encounter Criminality
# 3 categories, 0% missing
# GOOD
# Note: "3 Other Immigration Violator means no criminal history"
encounter_criminality = encounters_df['Encounter Criminality']
missingPct = (encounter_criminality.isnull().sum() / len(encounter_criminality)) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % encounter_criminality.unique().size)
print(encounter_criminality.value_counts())

0.000000% missing values
3 unique values
Encounter Criminality
3 Other Immigration Violator    726367
2 Pending Criminal Charges      279500
1 Convicted Criminal            187649
Name: count, dtype: int64


In [None]:
# Processing Disposition
# 1% missing values, 20% 'Other'
# GOOD
processing_disposition = encounters_df['Processing Disposition']
missingPct = (processing_disposition.isnull().sum() / len(processing_disposition)) * 100
print("%f%% missing values" % missingPct)
print("%f%% 'Other' category" % (100 * processing_disposition.value_counts()['Other'] / NUM_ROWS))
print("%d unique values" % processing_disposition.unique().size)
# print(processing_disposition.value_counts())

0.100041% missing values
18.359452% 'Other' category
40 unique values


In [None]:
# Case Status
# 14 categories, 56% missing
# POOR
case_status = encounters_df['Case Status']
missingPct = (case_status.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % case_status.unique().size)
# print(case_status.value_counts())

56.428988% missing values
14 unique values
Case Status
ACTIVE                                          294829
8-Excluded/Removed - Inadmissibility            141056
6-Deported/Removed - Deportability               49926
3-Voluntary Departure Confirmed                  12728
E-Charging Document Canceled by ICE               7628
A-Proceedings Terminated                          5353
9-VR Witnessed                                    4878
B-Relief Granted                                  1477
L-Legalization - Permanent Residence Granted      1419
7-Died                                             474
5-Title 50 Expulsion                               143
0-Withdrawal Permitted - I-275 Issued              100
Z-SAW - Permanent Residence Granted                 16
Name: count, dtype: int64


In [18]:
# Case Category
# 30 categories, 56% missing
# POOR
case_category = encounters_df['Case Category']
missingPct = (case_category.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % case_category.unique().size)
# print(case_category.value_counts())

56.428988% missing values
30 unique values


In [None]:
# Departed Date
# 82% missing BUT no departed date is information (not deported yet)
# GOOD
departed_date = encounters_df['Departed Date']
missingPct = (departed_date.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
valid_dates = (pd.to_datetime('2023-9-1') <= departed_date) & (departed_date <= pd.to_datetime('2025-7-31'))

82.463243% missing values
min event date 1923-09-14 00:00:00:
max event date 2025-07-28 00:00:00:


In [None]:
# Departure Country
# 82% missing, but these are the same missing as departed date
# GOOD
departure_country = encounters_df['Departure Country']
missingPct = (departure_country.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % departure_country.unique().size)

mask_date_null = encounters_df['Departed Date'].isnull()
mask_country_null = encounters_df['Departure Country'].isnull()
overlap_count = (mask_date_null & mask_country_null).sum()
union_count = (mask_date_null | mask_country_null).sum()
jaccard = overlap_count / union_count
print("%f jaccard index of null values for date and country. 1 means they share all the same null rows" % jaccard)

82.502958% missing values
192 unique values
82.502958% missing both departed date and country
0.999519 jaccard index of null values for date and country. 1 means they share all the same null rows


In [None]:
# Final Order Yes No
# boolean, but 56% missing. unclear if we can assume missing value means no ... 
# BUT, this is always present for the 18% of departed rows, so valueable in that case
# GOOD
final_order_yes_no = encounters_df['Final Order Yes No']
missingPct = (final_order_yes_no.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values" % final_order_yes_no.unique().size)
print(final_order_yes_no.value_counts())

# this shows that this colum has a value for all non-null departed date rows
mask_date_null = encounters_df['Departed Date'].isnull()
mask_order_null = encounters_df['Final Order Yes No'].isnull()
overlap_count = (mask_date_null & mask_country_null).sum()
union_count = (mask_date_null | mask_order_null).sum()
print("overlap count: %d" % overlap_count)
print("union count: %d" % union_count)

56.428988% missing values
3 unique values
Final Order Yes No
NO     264000
YES    256027
Name: count, dtype: int64
overlap count: 984212
union count: 984212
1.461363140303702


In [None]:
# Final Order Date
# 78% missing, but high jaccard score with deported date
# GOOD

final_order_date = encounters_df['Final Order Date']
missingPct = (final_order_date.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)

mask_date_null = encounters_df['Departed Date'].isnull()
mask_final_null = encounters_df['Final Order Date'].isnull()
overlap_count = (mask_date_null & mask_final_null).sum()
union_count = (mask_date_null | mask_final_null).sum()
print("jaccard with deported date: %f" % (overlap_count / union_count))

78.550183% missing values
jaccard with deported date: 0.922904


In [None]:
# Birth Year
# few missing, no non-physical
# GOOD
birth_year = encounters_df['Birth Year']
missingPct = (birth_year.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)

invalid_year_mask = 2025 < birth_year
print("num birth years after 2025: %d" % invalid_year_mask.sum())
invalid_year_mask = birth_year < 1900
print("num birth years before 1900: %d" % invalid_year_mask.sum())

0.004189% missing values
num birth years after 2025: 0
num birth years before 1900: 0


In [84]:
# Citizenship Country
# no missing values. Though there are former entities like CZECHOSLOVAKIA and USSR, and Yugoslavia (Serbia & montenegro)
# which I've chosen to ignore. However, I have rolled up all territories and soviergn states.
# GOOD
citizenship_country = encounters_df['Citizenship Country']
missingPct = (citizenship_country.isnull().sum() / NUM_ROWS) * 100
print("%f%% missing values" % missingPct)
print("%d unique values before replacements" % citizenship_country.unique().size)

# miscellaneous
citizenship_country.replace('PALESTINE BORN BEFORE 1948', 'PALESTINE', inplace=True)
citizenship_country.replace('Palestine', 'PALESTINE', inplace=True)

# Soveirgn states and territories
citizenship_country.replace('CHINA, PEOPLES REPUBLIC OF', 'CHINA', inplace=True)
citizenship_country.replace('HONG KONG', 'CHINA', inplace=True)
citizenship_country.replace('MACAU', 'CHINA', inplace=True)
citizenship_country.replace('BRITISH VIRGIN ISLANDS', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('BERMUDA', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('NETHERLANDS ANTILLES', 'NETHERLANDS', inplace=True)
citizenship_country.replace('SAINT MARTIN(FRENCH)', 'FRANCE', inplace=True)
citizenship_country.replace('FRENCH POLYNESIA', 'FRANCE', inplace=True)
citizenship_country.replace('GUADELOUPE', 'FRANCE', inplace=True)
citizenship_country.replace('SAINT BARTHELEMY', 'FRANCE', inplace=True)
citizenship_country.replace('NEW CALEDONIA', 'FRANCE', inplace=True)
citizenship_country.replace('REUNION', 'FRANCE', inplace=True)
citizenship_country.replace('FRENCH GUIANA', 'FRANCE', inplace=True)
citizenship_country.replace('SINT MAARTEN(DUTCH)', 'NETHERLANDS', inplace=True)
citizenship_country.replace('ARUBA', 'NETHERLANDS', inplace=True)
citizenship_country.replace('SINT EUSTATIUS', 'NETHERLANDS', inplace=True)
citizenship_country.replace('SABA', 'NETHERLANDS', inplace=True)
citizenship_country.replace('BONAIRE', 'NETHERLANDS', inplace=True)
citizenship_country.replace('CURACAO', 'NETHERLANDS', inplace=True)
citizenship_country.replace('TURKS AND CAICOS ISLANDS', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('CAYMAN ISLANDS', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('MONTSERRAT', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('ANGUILLA', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('ST. HELENA', 'UNITED KINGDOM', inplace=True)
citizenship_country.replace('CHRISTMAS ISLAND', 'AUSTRALIA', inplace=True)

print("%d unique values after replacements" % citizenship_country.unique().size)

# citizenship_country.value_counts()

0.000000% missing values
225 unique values before replacements
200 unique values after replacements


In [None]:
gender = encounters_df['Gender']

In [None]:
event_landmark = encounters_df['Event Landmark']

In [None]:
unique_identifier = encounters_df['Unique Identifier']