In [1]:
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.feature_extraction import FeatureHasher  # DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder  # lazier / easier option potentially
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold

In [2]:
data_dir = pathlib.Path('../clean_data')
d = pd.merge(pd.read_parquet(data_dir / 'detainers.parquet'),
             pd.read_parquet(data_dir / 'arrests.parquet'),
             on = 'Unique Identifier', how = 'outer', suffixes = (None, '_x'))
for c in d.columns:
    if c.endswith('_x'):
        print(c, c[:-2])
        d[c] = pd.Series([x if x is not None else y for (x, y) in zip(d[c], d[c[:-2]])], index=d.index)
d = d[[c for c in d.columns if not c.endswith('_x')]].copy()
d.head()

Apprehension Date_x Apprehension Date
Final Program_x Final Program
Apprehension Method_x Apprehension Method
Case Status_x Case Status
Departed Date_x Departed Date
Departure Country_x Departure Country
Final Order Yes No_x Final Order Yes No
Final Order Date_x Final Order Date
Birth Year_x Birth Year
Citizenship Country_x Citizenship Country
Gender_x Gender


Unnamed: 0,Detainer Prepare Date,Facility State,Facility AOR,Port of Departure,Departure Country,Departed Date,Case Status,Detainer Prepared Criminality,Detention Facility,Detention Facility Code,...,Unique Identifier,Age,Census Region,Total Sentence Days,Apprehension Month,Apprehension State,Apprehension AOR,Apprehension Criminality,Case Category,Apprehension Site Landmark
0,2023-09-26,IL,Chicago Area of Responsibility,,,NaT,Pending,Criminal,TAYLORVILLE CORRECTIONS,ILTAYOC,...,000014d475106e2713492c3f2dd11798b86b0bc6,30.0,Midwest,,,,,,,
1,2024-07-05,FL,Miami Area of Responsibility,,,NaT,Pending,Pending Criminal,COLLIER COUNTY SHERIFF,COLLIFL,...,0000352e3ef2fdd291bc0bf48bfda8b240a9b8b5,37.0,South,0.0,,,,,,
2,2024-10-01,CA,San Francisco Area of Responsibility,,,NaT,Unknown,Criminal,SAN FRANCISCO CO JAIL,SFCOJCA,...,0000508ee708538630e90af0ba0a186386fa3786,35.0,West,,,,,,,
3,2024-10-07,CA,San Francisco Area of Responsibility,,,NaT,Unknown,Pending Criminal,SACRAMENTO COUNTY JAIL,SACRACA,...,0000508ee708538630e90af0ba0a186386fa3786,35.0,West,0.0,,,,,,
4,2025-07-20,CA,San Francisco Area of Responsibility,,,NaT,Unknown,Pending Criminal,SACRAMENTO COUNTY JAIL,SACRACA,...,0000508ee708538630e90af0ba0a186386fa3786,35.0,West,0.0,,,,,,


In [3]:
d.columns

Index(['Detainer Prepare Date', 'Facility State', 'Facility AOR',
       'Port of Departure', 'Departure Country', 'Departed Date',
       'Case Status', 'Detainer Prepared Criminality', 'Detention Facility',
       'Detention Facility Code', 'Facility City',
       'Detainer Prep Threat Level', 'Gender', 'Citizenship Country',
       'Birth Country', 'Birth Year', 'Entry Status',
       'Most Serious Conviction (MSC) Charge', 'MSC Sentence Days',
       'MSC Sentence Months', 'MSC Sentence Years', 'MSC Charge Date',
       'MSC Conviction Date', 'Processing Disposition', 'Final Program',
       'Apprehension Method', 'Case Final Order Yes No', 'Final Order Date',
       'Apprehension Date', 'Entry Date', 'Deportation Ordered Yes No',
       'Order to Show Cause Served Yes No', 'Biometric Match Yes No',
       'Statements Made Yes No', 'Final Order Yes No', 'Resume Custody Yes No',
       'Detainer Lift Reason', 'Unique Identifier', 'Age', 'Census Region',
       'Total Sentence Days',

In [12]:
feature_cols = ['Apprehension Criminality', 'Apprehension AOR',
                'Entry Status', 'Case Status', 'Gender', 
                'Citizenship Country', 'Final Program', 'Facility State',
                'Birth Year', 'MSC Sentence Months']
outcome_col = 'Deportation Ordered Yes No'
dummy_cols = [c for c in feature_cols if d[c].dtype == np.dtype('O')]
print(dummy_cols)

['Apprehension Criminality', 'Apprehension AOR', 'Entry Status', 'Case Status', 'Gender', 'Citizenship Country', 'Final Program', 'Facility State']


In [38]:
d_use = d[feature_cols + [outcome_col]].copy().reset_index(drop=True)
d_use = d_use[~d_use[outcome_col].isna() & ~d_use[outcome_col].apply(np.isnan)]
d_use.head()

Unnamed: 0,Apprehension Criminality,Apprehension AOR,Entry Status,Case Status,Gender,Citizenship Country,Final Program,Facility State,Birth Year,MSC Sentence Months,Deportation Ordered Yes No
0,,,Not Applicable,Pending,Male,MEXICO,ERO Criminal Alien Program,IL,1995.0,,0.0
1,,,Not Applicable,Pending,Male,CUBA,287G Program,FL,1988.0,0.0,0.0
2,,,Not Applicable,Unknown,Male,MEXICO,ERO Criminal Alien Program,CA,1990.0,,1.0
3,,,Not Applicable,Unknown,Male,MEXICO,ERO Criminal Alien Program,CA,1990.0,0.0,1.0
4,,,Not Applicable,Unknown,Male,MEXICO,ERO Criminal Alien Program,CA,1990.0,0.0,1.0


In [39]:
d2 = pd.get_dummies(d_use, columns=dummy_cols, dummy_na=True, drop_first=True, dtype=int)
d2.head()

Unnamed: 0,Birth Year,MSC Sentence Months,Deportation Ordered Yes No,Apprehension Criminality_2 Pending Criminal Charges,Apprehension Criminality_3 Other Immigration Violator,Apprehension Criminality_nan,Apprehension AOR_Baltimore Area of Responsibility,Apprehension AOR_Boston Area of Responsibility,Apprehension AOR_Buffalo Area of Responsibility,Apprehension AOR_Chicago Area of Responsibility,...,Facility State_TX,Facility State_UT,Facility State_VA,Facility State_VI,Facility State_VT,Facility State_WA,Facility State_WI,Facility State_WV,Facility State_WY,Facility State_nan
0,1995.0,,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1988.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1990.0,,1.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1990.0,0.0,1.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1990.0,0.0,1.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
X_cols = np.array([c for c in d2.columns if c != outcome_col])
X = np.stack(d2[X_cols].fillna(0).values)
y_deported = d2[outcome_col].to_numpy()

In [53]:
pd.Series(y_deported).value_counts()

0.0    139269
1.0    102544
Name: count, dtype: int64

In [54]:
X.shape

(241813, 342)

In [47]:
# setup K-fold for cross validation
kf_1 = KFold(n_splits=5, shuffle=True, random_state=119)
kf_1.get_n_splits(X)

5

In [48]:
def try_classifier(X, y, create_classifier_func):
    fold_index_to_classifier = {}
    fold_index_to_test_metrics = {}
    for i, (train_index, test_index) in enumerate(kf_1.split(X)):
        fold_index_to_classifier[i] = create_classifier_func()
        fold_index_to_classifier[i].fit(X[train_index, :], y[train_index])
        y_pred = fold_index_to_classifier[i].predict(X[test_index, :])
        fold_index_to_test_metrics[i] = {'accuracy': accuracy_score(y[test_index], y_pred),
                                         'f1': f1_score(y[test_index], y_pred)}
    return fold_index_to_classifier, fold_index_to_test_metrics

In [49]:
%%time
dec_tree_clfs, dec_tree_test_metrics = try_classifier(X, y_deported, lambda: DecisionTreeClassifier(random_state=19))
dec_tree_test_metrics

CPU times: total: 1min 35s
Wall time: 1min 37s


{0: {'accuracy': 0.672394185637781, 'f1': 0.5974184368330114},
 1: {'accuracy': 0.6745652668362178, 'f1': 0.6025003156964264},
 2: {'accuracy': 0.6731178793705932, 'f1': 0.5965136162936117},
 3: {'accuracy': 0.6735453455192093, 'f1': 0.599655137437874},
 4: {'accuracy': 0.6784665646582028, 'f1': 0.6045470728854077}}

In [50]:
%%time
bag_dec_tree_clfs, bag_dec_tree_test_metrics = try_classifier(
    X, y_deported, lambda: BaggingClassifier(estimator=DecisionTreeClassifier(random_state=19), n_estimators=5, random_state=119))
bag_dec_tree_test_metrics

CPU times: total: 4min 11s
Wall time: 4min 16s


{0: {'accuracy': 0.6822364204040279, 'f1': 0.6184517602661502},
 1: {'accuracy': 0.6805409093728677, 'f1': 0.6176499703029104},
 2: {'accuracy': 0.6798792465314393, 'f1': 0.6149522483087942},
 3: {'accuracy': 0.679417724659857, 'f1': 0.6162186246843904},
 4: {'accuracy': 0.6816922377072908, 'f1': 0.6171027758431997}}

In [56]:
%%time
knn_clfs, knn_test_metrics = try_classifier(X, y_deported, lambda: KNeighborsClassifier(n_neighbors=5, metric='cosine')) 
knn_test_metrics

CPU times: total: 39min 15s
Wall time: 14min


{0: {'accuracy': 0.6691272253582284, 'f1': 0.5939609236234459},
 1: {'accuracy': 0.6663978661373364, 'f1': 0.5902163974398049},
 2: {'accuracy': 0.6691272253582284, 'f1': 0.5909090909090909},
 3: {'accuracy': 0.6650676150696828, 'f1': 0.586722457519008},
 4: {'accuracy': 0.6721599602994086, 'f1': 0.5959994903809402}}

In [None]:
# Other Things to classify:
# - year somebody was deported?
# - MSC
# - whether they have been deported or not