In [1]:
import pandas as pd
import numpy as np
import pathlib
import re
from datetime import datetime
from detentions_columns import DetentionsColumns

In [2]:
proc_dir = pathlib.Path('./proc_data/')
d = pd.read_parquet(proc_dir / 'detentions.parquet')
use_cols = [v.value[0] for k, v in DetentionsColumns.__members__.items() if not v.value[2] and not v.value[3]]
print(f'{use_cols=}')
d = d[use_cols].copy()
d.head()

use_cols=['Stay Book In Date Time', 'Book In Date Time', 'Detention Facility Code', 'Detention Book Out Date Time', 'Stay Book Out Date Time', 'Detention Release Reason', 'Stay Book Out Date', 'Stay Release Reason', 'Gender', 'Marital Status', 'Birth Year', 'Ethnicity', 'Entry Status', 'Felon', 'Case Status', 'Case Category', 'Final Order Yes No', 'Final Order Date', 'Case Threat Level', 'Book In Criminality', 'Final Charge', 'Departed Date', 'Departure Country', 'Initial Bond Set Amount', 'Citizenship Country', 'Final Program', 'MSC Charge', 'Unique Identifier']


Unnamed: 0,Stay Book In Date Time,Book In Date Time,Detention Facility Code,Detention Book Out Date Time,Stay Book Out Date Time,Detention Release Reason,Stay Book Out Date,Stay Release Reason,Gender,Marital Status,...,Case Threat Level,Book In Criminality,Final Charge,Departed Date,Departure Country,Initial Bond Set Amount,Citizenship Country,Final Program,MSC Charge,Unique Identifier
0,2024-07-04 02:15:00,2024-07-04 02:15:00,FLO,2024-07-04 10:00:00,2024-07-04 10:00:00,Removed,2024-07-04,Removed,Male,Single,...,,3 Other Immigration Violator,PREVIOUSLY ORDERED REMOVED AND ENTERED OR ATTE...,2024-07-04,MEXICO,,MEXICO,Border Patrol,,000031423837d9b430050e9298882932f74ee3dd
1,2023-09-09 03:20:00,2023-09-09 04:30:00,FLO,2023-09-09 09:04:00,2023-09-09 09:04:00,Removed,2023-09-09,Removed,Female,Single,...,,3 Other Immigration Violator,PREVIOUSLY ORDERED REMOVED AND ENTERED OR ATTE...,2023-09-09,MEXICO,,MEXICO,Border Patrol,,00007be9691d4682465db5bd1a5a0cca4af9019c
2,2023-09-09 03:20:00,2023-09-09 03:20:00,FSF,2023-09-09 04:19:00,2023-09-09 09:04:00,Transferred,2023-09-09,Removed,Female,Single,...,,3 Other Immigration Violator,PREVIOUSLY ORDERED REMOVED AND ENTERED OR ATTE...,2023-09-09,MEXICO,,MEXICO,Border Patrol,,00007be9691d4682465db5bd1a5a0cca4af9019c
3,2024-06-16 13:18:00,2024-06-16 13:18:00,RGRNDTX,2024-06-25 03:00:00,2024-06-25 03:00:00,Removed,2024-06-25,Removed,Male,Single,...,,3 Other Immigration Violator,IMMIGRANT WITHOUT AN IMMIGRANT VISA,2024-06-25,COLOMBIA,,COLOMBIA,Border Patrol,,0000cecbf0050979574b60e7d2f2fefdb2c8eb6c
4,2024-07-05 11:58:00,2024-07-05 14:50:00,NYCHOLD,2024-07-05 15:00:00,2024-09-04 19:18:00,Transferred,2024-09-04,Removed,Female,Single,...,1.0,1 Convicted Criminal,"Aggravated Felony: 101(a)(43)(A) Murder, Rape ...",2024-09-05,GUYANA,,GUYANA,ERO Criminal Alien Program,Sex Assault - Carnal Abuse,0000f265b1ff8abbfbec54888dd2cddb62e21625


In [3]:
def missing_percent_print(d, col_name):
    print(f"{col_name}, {(d[col_name].isnull().sum() / len(d)).item() * 100:.4}% missing values")

In [4]:
missing_percent_print(d, DetentionsColumns.STAY_BOOK_IN_DATE_TIME.column_name)

Stay Book In Date Time, 0.0% missing values


In [5]:
# dates filtering
print(f'before detentions date filtering: {d.shape=}')
d = d[d[DetentionsColumns.STAY_BOOK_IN_DATE_TIME.column_name].isna() |
      ((pd.to_datetime('2023-9-1') <= d[DetentionsColumns.STAY_BOOK_IN_DATE_TIME.column_name]) & 
      (d[DetentionsColumns.STAY_BOOK_IN_DATE_TIME.column_name] <= pd.to_datetime('2025-7-31')))].copy()
d = d[d[DetentionsColumns.BOOK_IN_DATE_TIME.column_name].isna() |
      ((pd.to_datetime('2023-9-1') <= d[DetentionsColumns.BOOK_IN_DATE_TIME.column_name]) & 
      (d[DetentionsColumns.BOOK_IN_DATE_TIME.column_name] <= pd.to_datetime('2025-7-31')))].copy()
d = d[d[DetentionsColumns.STAY_BOOK_OUT_DATE_TIME.column_name].isna() |
      ((pd.to_datetime('2023-9-1') <= d[DetentionsColumns.STAY_BOOK_OUT_DATE_TIME.column_name]) & 
      (d[DetentionsColumns.STAY_BOOK_OUT_DATE_TIME.column_name] <= pd.to_datetime('2025-7-31')))].copy()
d = d[d[DetentionsColumns.STAY_BOOK_OUT_DATE.column_name].isna() |
      ((pd.to_datetime('2023-9-1') <= d[DetentionsColumns.STAY_BOOK_OUT_DATE.column_name]) & 
      (d[DetentionsColumns.STAY_BOOK_OUT_DATE.column_name] <= pd.to_datetime('2025-7-31')))].copy()
print(f'after detentions date filtering: {d.shape=}')

before detentions date filtering: d.shape=(1322499, 28)
after detentions date filtering: d.shape=(1260522, 28)


In [6]:
missing_percent_print(d, DetentionsColumns.DETENTION_FACILITY_CODE.column_name)

Detention Facility Code, 0.0% missing values


In [7]:
# d[DetentionsColumns.DETENTION_FACILITY_CODE.column_name].value_counts()

In [8]:
missing_percent_print(d, DetentionsColumns.DETENTION_RELEASE_REASON.column_name)

Detention Release Reason, 4.473% missing values


In [9]:
def release_reason_replace(d, c):
    # c = DetentionsColumns.DETENTION_RELEASE_REASON.column_name
    # group similar categories back to other categories if its obvious
    d[c] = d[c].replace('Paroled - Fear Found', 'Paroled')
    d[c] = d[c].replace('Paroled - Public Benefit', 'Paroled')
    d[c] = d[c].replace('Paroled - Humanitarian', 'Paroled')
    d[c] = d[c].replace('Voluntary Return', 'Voluntary departure')
    d[c] = d[c].replace('Voluntary Return', 'Voluntary departure')
    # https://immigrationlawneworleans.com/immigration-law/order-of-supervision-osup-what-you-need-to-know/
    d[c] = d[c].replace('Order of supervision', 'Order of Supervision')
    d[c] = d[c].replace('Order of Supervision - Humanitarian', 'Order of Supervision')
    d[c] = d[c].replace('Order of Supervision - Re-Release', 'Order of Supervision')
    d[c] = d[c].replace('Order of Supervision - No SLRRFF', 'Order of Supervision')
    d[c] = d[c].replace('Order of recognizance', 'Order of Recognizance')
    d[c] = d[c].replace('Order of Recognizance - Humanitarian', 'Order of Recognizance')
    d[c] = d[c].replace('ORR - Office of Refugee Resettlement', 'Office of Refugee Resettlement')
    d[c] = d[c].replace('ORR-Runaway', 'Office of Refugee Resettlement')

release_reason_replace(d, DetentionsColumns.DETENTION_RELEASE_REASON.column_name)

In [10]:
d[DetentionsColumns.DETENTION_RELEASE_REASON.column_name].value_counts()

Detention Release Reason
Transferred                                                      720836
Removed                                                          308402
Paroled                                                           78506
Order of Recognizance                                             33286
Bonded Out - IJ                                                   21930
U.S. Marshals or other agency (explain in Detention Comments)     15071
Order of Supervision                                               9045
Processing Disposition Changed Locally                             6921
Voluntary departure                                                3103
Bonded Out - Field Office                                          3004
Relief Granted by IJ                                               2072
Proceedings Terminated                                             1736
Withdrawal                                                          167
Died                                   

In [11]:
missing_percent_print(d, DetentionsColumns.STAY_RELEASE_REASON.column_name)

Stay Release Reason, 11.75% missing values


In [12]:
release_reason_replace(d, DetentionsColumns.STAY_RELEASE_REASON.column_name)

In [13]:
d[DetentionsColumns.STAY_RELEASE_REASON.column_name].value_counts()

Stay Release Reason
Removed                                                          806675
Paroled                                                          131191
Bonded Out - IJ                                                   52056
Order of Recognizance                                             50759
U.S. Marshals or other agency (explain in Detention Comments)     26949
Order of Supervision                                              17395
Voluntary departure                                                8626
Transferred                                                        6220
Relief Granted by IJ                                               4714
Bonded Out - Field Office                                          4466
Proceedings Terminated                                             2813
Withdrawal                                                          264
Died                                                                 78
Processing Disposition Changed Locally      

In [14]:
# missing_percent_print(d, DetentionsColumns.RELIGION.column_name)

In [15]:
missing_percent_print(d, DetentionsColumns.MARITAL_STATUS.column_name)

Marital Status, 22.06% missing values


In [16]:
# impute missing values with 'Unknown'
d[DetentionsColumns.MARITAL_STATUS.column_name] = d[DetentionsColumns.MARITAL_STATUS.column_name].apply(lambda x: 'Unknown' if type(x) != str else x)

In [17]:
# assume 'separated' and 'divorced' are different?
d[DetentionsColumns.MARITAL_STATUS.column_name].value_counts()

Marital Status
Single       753660
Unknown      345148
Married      144303
Divorced      10655
Separated      5045
Widowed        1711
Name: count, dtype: int64

In [18]:
missing_percent_print(d, DetentionsColumns.ETHNICITY.column_name)

Ethnicity, 54.01% missing values


In [19]:
d[DetentionsColumns.ETHNICITY.column_name].value_counts()

Ethnicity
Hispanic Origin           523927
Not of Hispanic Origin     44333
Unknown                    11487
Name: count, dtype: int64

In [20]:
d[DetentionsColumns.ETHNICITY.column_name] = d[DetentionsColumns.ETHNICITY.column_name].apply(lambda x: 'Unknown' if type(x) != str else x)

In [21]:
d[DetentionsColumns.ETHNICITY.column_name].value_counts()

Ethnicity
Unknown                   692262
Hispanic Origin           523927
Not of Hispanic Origin     44333
Name: count, dtype: int64

In [22]:
missing_percent_print(d, DetentionsColumns.ENTRY_STATUS.column_name)

Entry Status, 5.342% missing values


In [23]:
c = DetentionsColumns.ENTRY_STATUS.column_name
d[c] = d[c].replace('Crew - Absconder', 'Crew')
d[c] = d[c].replace('Crew - Deserter', 'Crew')
d[c] = d[c].replace('Crew - In transit to conveyance', 'Crew')
d[c] = d[c].replace('Crew - Working', 'Crew')

d[c] = d[c].replace('False Claim with Altered Document', 'False Claim')
d[c] = d[c].replace('False Claim with Counterfeit Document', 'False Claim')
d[c] = d[c].replace('False Claim with Valid Document', 'False Claim')
d[c] = d[c].replace('ORAL FALSE CLAIMS TO OTHER THAN U.S. CITZ-', 'False Claim')
d[c] = d[c].replace('Oral False Claim to U.S. Citizenship', 'False Claim')
d[c] = d[c].replace('Fraud-Other Than False Claim to USC', 'False Claim')

d[c] = d[c].replace('United States Citizen-False Claim', 'US Citizen')

d[c] = d[c].replace('Parolee - CH - Advanced Humanitarian', 'Parolee')
d[c] = d[c].replace('Parolee - DA - Advanced Parole', 'Parolee')
d[c] = d[c].replace('Parolee - DT - Port of Entry Parole', 'Parolee')
d[c] = d[c].replace('Parolee - OP - Overseas Parole', 'Parolee')
d[c] = d[c].replace('Parolee - Parole as Asylee/Refugee', 'Parolee')
d[c] = d[c].replace('Parolee- CC - Cuban Adjustment Act', 'Parolee')
d[c] = d[c].replace('Significant Public Benefit Parole', 'Parolee')

# https://www.aila.org/library/dhs-and-dos-suspend-transit-without-visa-twov-and-
# TWOV - transit without visa (program ended in 2003? maybe outdated terminology?)
d[c] = d[c].replace('TWOV', 'No Documents')

d[c] = d[c].replace('Other Applicant for Admission', 'Other')
d[c] = d[c].replace('Other Non-Immigrant Classification', 'Other')

d[c] = d[c].replace('Refugee - Initial', 'Refugee')

d[c] = d[c].replace('Temporary Resident', 'Temporary')
d[c] = d[c].replace('Temporary Work Agriculture', 'Temporary')
d[c] = d[c].replace('Temporary Worker Other', 'Temporary')

d[c] = d[c].replace('Present Without Admission', 'PWA Other')

d[c] = d[c].replace('Lawful Permanent Resident - Seeking admission (OALICE)', 'Legal Permanent Resident')

In [24]:
# sorted(list(d[DetentionsColumns.ENTRY_STATUS.column_name].value_counts().index))

In [25]:
d[DetentionsColumns.ENTRY_STATUS.column_name].value_counts()

Entry Status
Not  Applicable                         651717
PWA Mexico                              449641
Other                                    28092
No Documents                             19488
PWA Other                                15192
PWA Canada                                9891
Non-Immigrant                             7215
Legal Permanent Resident                  3337
Parolee                                   1445
Asylum                                    1190
Visitor                                    859
Visa Waiver Program                        694
Border Crossing Card                       619
Immigrant                                  449
US Citizen                                 440
Removable                                  430
Temporary                                  377
Intercepted Attempting to EWI at POE       343
Overstay                                   337
Student                                    246
Smuggler                                   222


In [26]:
# missing_percent_print(d, DetentionsColumns.BOND_POSTED_DATE.column_name)

In [27]:
# missing_percent_print(d, DetentionsColumns.BOND_POSTED_AMOUNT.column_name)

In [28]:
missing_percent_print(d, DetentionsColumns.CASE_STATUS.column_name)

Case Status, 0.5387% missing values


In [29]:
d[DetentionsColumns.CASE_STATUS.column_name].value_counts()

Case Status
8-Excluded/Removed - Inadmissibility            628779
ACTIVE                                          410758
6-Deported/Removed - Deportability              140424
3-Voluntary Departure Confirmed                  41382
E-Charging Document Canceled by ICE              14153
9-VR Witnessed                                    8508
B-Relief Granted                                  4261
A-Proceedings Terminated                          4095
0-Withdrawal Permitted - I-275 Issued              547
5-Title 50 Expulsion                               500
7-Died                                             199
L-Legalization - Permanent Residence Granted        83
Z-SAW - Permanent Residence Granted                 43
Name: count, dtype: int64

In [30]:
d[DetentionsColumns.CASE_CATEGORY.column_name].value_counts()

Case Category
[8C] Excludable / Inadmissible - Administrative Final Order Issued             342985
[16] Reinstated Final Order                                                    235967
[8G] Expedited Removal - Credible Fear Referral                                180103
[8K] Expedited Removal Terminated due to Credible Fear Finding / NTA Issued    125037
[8B] Excludable / Inadmissible - Under Adjudication by IJ                      116297
[8F] Expedited Removal                                                          99074
[3] Deportable - Administratively Final Order                                   49820
[8A] Excludable / Inadmissible - Hearing Not Commenced                          29250
[2A] Deportable - Under Adjudication by IJ                                      23648
[11] Administrative Deportation / Removal                                       17533
[9] VR Under Safeguards                                                          8915
[1A] Voluntary Departure - Un-Expired an

In [31]:
missing_percent_print(d, DetentionsColumns.CASE_THREAT_LEVEL.column_name)

Case Threat Level, 66.16% missing values


In [32]:
d[DetentionsColumns.CASE_THREAT_LEVEL.column_name].value_counts()

Case Threat Level
1.0    188644
3.0    134020
2.0    103912
Name: count, dtype: int64

In [33]:
missing_percent_print(d, DetentionsColumns.BOOK_IN_CRIMINALITY.column_name)

Book In Criminality, 0.0% missing values


In [34]:
d[DetentionsColumns.BOOK_IN_CRIMINALITY.column_name].value_counts()

Book In Criminality
3 Other Immigration Violator    599080
1 Convicted Criminal            424673
2 Pending Criminal Charges      236769
Name: count, dtype: int64

In [35]:
missing_percent_print(d, DetentionsColumns.FINAL_CHARGE.column_name)

Final Charge, 34.96% missing values


In [36]:
c = DetentionsColumns.FINAL_CHARGE.column_name
d[c] = d[c].apply(lambda x: x.replace('** ', '') if type(x) == str else x)
d[c] = d[c].apply(lambda x: x.upper() if type(x) == str else x)

d[c] = d[c].replace('CONVICTION OF ONE CRIME INVOLVING MORAL TURPITUDE', 'CONVICTION OF CRIME INVOLVING MORAL TURPITUDE')
d[c] = d[c].replace('CONVICTION OF TWO CRIMES INVOLVING MORAL TURPITUDE', 'CONVICTION OF CRIME INVOLVING MORAL TURPITUDE')
d[c] = d[c].replace('CONVICTION OR COMMISSION OF A CRIME INVOLVING MORAL TURPITUDE', 'CONVICTION OF CRIME INVOLVING MORAL TURPITUDE')
d[c] = d[c].replace('MORAL TURPITUDE-1 CONVICTION', 'CONVICTION OF CRIME INVOLVING MORAL TURPITUDE')

d[c] = d[c].replace('FALSE CLAIM OF UNITED STATES CITIZENSHIP', 'FALSE CLAIM TO U.S. CITIZENSHIP')
d[c] = d[c].replace('TRAFFICKER IN CONTROLLED SUBSTANCE', 'CONTROLLED SUBSTANCE TRAFFICKERS')
d[c] = d[c].replace('IMMIGRANT WITHOUT AN IMMIGRANT VISA', 'IMMIGRANT WITHOUT VISA')
d[c] = d[c].replace('DRUG ABUSER OR ADDICT', 'DRUG CONVICTION')
d[c] = d[c].replace('NARCOTICS', 'DRUG CONVICTION')
d[c] = d[c].replace('CONVICTION -- VIOLATION OF NARCOTIC DRUG LAW OR REGULATION', 'DRUG CONVICTION')

d[c] = d[c].replace('ENTERED WITHOUT INSPECTION', 'ENTRY WITHOUT INSPECTION')
d[c] = d[c].replace('EWI', 'ENTRY WITHOUT INSPECTION')

d[c] = d[c].replace('SECURITY AND RELATED GROUNDS:  TERRORIST ACTIVITY', 'TERRORISM RELATED')
d[c] = d[c].replace('ENGAGED IN TERRORIST ACTIVITY', 'TERRORISM RELATED')
d[c] = d[c].replace('POLITICAL ENDORSER OF TERRORIST ORGANIZATION', 'TERRORISM RELATED')
d[c] = d[c].replace('LIKELY TO ENGAGE IN TERRORIST ACTIVITY', 'TERRORISM RELATED')   # ?????

d[c] = d[c].replace('SECURITY AND RELATED GROUNDS:  ESPIONAGE AND SABOTAGE', 'SECURITY AND RELATED GROUNDS')
d[c] = d[c].replace('UNLAWFUL ACTIVITY (SECURITY & RELATED GROUNDS)', 'SECURITY AND RELATED GROUNDS')
d[c] = d[c].replace('SECURITY AND RELATED GROUNDS:  ENDANGERING PUBLIC SAFETY OR NATIONAL SECURITY', 'SECURITY AND RELATED GROUNDS')

d[c] = d[c].replace('NONIMMIGRANT OVERSTAY: CREWMEMBER', 'NONIMMIGRANT OVERSTAY')
d[c] = d[c].replace('NONIMMIGRANT -- REMAINED LONGER', 'NONIMMIGRANT OVERSTAY')
d[c] = d[c].replace('NONIMMIGRANT STUDENT OUT OF STATUS: FAILURE TO ATTEND', 'NONIMMIGRANT STUDENT OUT OF STATUS')
d[c] = d[c].replace('NONIMMIGRANT STUDENT OUT OF STATUS:  FAILURE TO CARRY FULL COURSE OF STUDY', 'NONIMMIGRANT STUDENT OUT OF STATUS')
d[c] = d[c].replace('NONIMMIGRANT STATUS VIOLATORS: FAILED TO MAINTAIN THE NONIMMIGRANT STATUS IN WHICH THE ALIEN WAS ADMITTED', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('NONIMMIGRANT FAILURE TO MAINTAIN STATUS AFTER STATUS CHANGED', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('NONIMMIGRANT OUT OF STATUS: MEXICAN BORDER CROSSER', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('FAILURE TO MAINTAIN STATUS:  CRIME OF VIOLENCE UNDER 8 C.F.R. 214.1(G)', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('VIOLATION OF NONIMMIGRANT STATUS - FAILURE TO COMPLY WITH SPECIAL REGISTRATION REQUIREMENTS (NSEERS)', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('FAIL TO MAINTAIN NONIMM STATUS', 'NONIMMIGRANT STATUS VIOLATORS')
d[c] = d[c].replace('NON-IMMIGRANT STATUS VIOLATOR', 'NONIMMIGRANT STATUS VIOLATORS')

d[c] = d[c].replace('ALIEN PRESENT WITHOUT ADMISSION OR PAROLE - (PWAS)', 'ALIEN PRESENT WITHOUT ADMISSION OR PAROLE')
d[c] = d[c].replace('ALIEN PRESENT IN THE UNITED STATES WHO WAS NOT ADMITTED OR PAROLED OR ARRIVING IN THE UNITED STATES AT A TIME OR PLACE NOT DESIGNATED BY THE AG - (212)(A)(6)(A)', 'ALIEN PRESENT WITHOUT ADMISSION OR PAROLE')

d[c] = d[c].replace('FRAUD AND MISUSE OF VISAS, PERMITS AND OTHER DOCUMENTS:  CONVICTED UNDER 18 USC 1546', 'FRAUD OR WILLFUL MISREPRESENTATION')
d[c] = d[c].replace('ALIEN PRESENT IN THE UNITED STATES WHO WAS NOT ADMITTED OR PAROLED OR ARRIVING IN THE UNITED STATES AT A TIME OR PLACE NOT DESIGNATED BY THE AG - (212)(A)(6)(A)', 'ALIEN PRESENT WITHOUT ADMISSION OR PAROLE')

d[DetentionsColumns.FINAL_CHARGE.column_name].value_counts()

Final Charge
IMMIGRANT WITHOUT VISA                                                                                                                                                                                                  275280
ALIEN PRESENT WITHOUT ADMISSION OR PAROLE                                                                                                                                                                               270563
ALIEN PREVIOUSLY REMOVED ONCE, NOT AS AN ARRIVING ALIEN (NOT AGGRAVATED FELONS)                                                                                                                                          59888
PREVIOUSLY ORDERED REMOVED AND ENTERED OR ATTEMPTED TO ENTER WITHOUT BEING ADMITTED                                                                                                                                      48953
ALIEN PREVIOUSLY REMOVED TWO OR MORE TIMES, NOT AS AN ARRIVING ALIEN, NOT AGGRAVATED FELONS    

In [37]:
[x for x in d[DetentionsColumns.FINAL_CHARGE.column_name].unique() if type(x) == str and 'STATUS' in x]

['NONIMMIGRANT STATUS VIOLATORS',
 'NONIMMIGRANT STUDENT OUT OF STATUS',
 'INADMISSIBLE AT ENTRY OR ADJUSTMENT OF STATUS DUE TO FALSE CLAIM TO U.S. CITIZENSHIP PREDICATED ON 212A6CII',
 'TERMINATION OF CONDITIONAL PERMANENT RESIDENT STATUS']

In [38]:
missing_percent_print(d, DetentionsColumns.FINAL_PROGRAM.column_name)

Final Program, 0.0% missing values


In [39]:
c = DetentionsColumns.FINAL_PROGRAM.column_name
d[c] = d[c].replace('Inspections - Land', 'Inspections')
d[c] = d[c].replace('Inspections - Air', 'Inspections')
d[c] = d[c].replace('Inspections - Sea', 'Inspections')

d[c] = d[c].replace('287G Program', '287g Task Force')

d[c] = d[c].replace('Mobile Criminal Alien Team', 'Fugitive Operations')
d[c] = d[c].replace('Violent Criminal Alien Section', 'ERO Criminal Prosecutions')

In [40]:
d[DetentionsColumns.FINAL_PROGRAM.column_name].value_counts()

Final Program
ERO Criminal Alien Program             473172
Border Patrol                          468800
Fugitive Operations                    123852
Inspections                             55811
Non-Detained Docket Control             46799
Alternatives to Detention               26945
Detained Docket Control                 18878
287g Task Force                         17931
Homeland Security Investigations        15536
Non-User Fee Investigations              3992
Intelligence                             2432
HSI Criminal Arrest Only                 2400
ERO Criminal Prosecutions                1242
Detention and Deportation                 835
Law Enforcement Area Response Units       491
User Fee Investigations                   426
Juvenile                                  377
Joint Terrorism Task Force                341
Asylum                                     93
Adjudications                              93
Quick Response Team                        31
Joint Criminal Alien

In [41]:
missing_percent_print(d, DetentionsColumns.MSC_CHARGE.column_name)

MSC Charge, 66.16% missing values


In [42]:
d[DetentionsColumns.MSC_CHARGE.column_name].value_counts()

MSC Charge
Driving Under Influence Liquor                             54003
Illegal Entry (INA SEC.101(a)(43)(O), 8USC1325 only)       36619
Assault                                                    28590
Traffic Offense                                            22359
Illegal Re-Entry (INA SEC.101(a)(43)(O), 8USC1326 only)    21973
                                                           ...  
Theft Vehicle By Bailee                                        1
Abscond While On Probation                                     1
Arson - Residence-Defraud Insurer                              1
Frequent House Ill Fame                                        1
Transport Tools For Forgery/Counterfeiting                     1
Name: count, Length: 394, dtype: int64

In [43]:
# most of these values appear clean/normalized, but we could enrich the data with a "MSC" category
# that is more general than the specific MSCs

In [44]:
# d.to_parquet(proc_data / 'detentions_cleaned.parquet')