In [118]:
import pathlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#example-3-working-with-sparse-representations
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [89]:
data_dir = pathlib.Path('../clean_data')
d = pd.merge(pd.read_parquet(data_dir / 'detentions.parquet'),
             pd.read_parquet(data_dir / 'arrests.parquet'),
             on = 'Unique Identifier', how = 'outer', suffixes = (None, '_x'))
d = pd.merge(d, pd.read_parquet(data_dir / 'detainers.parquet'),
             on = 'Unique Identifier', how = 'outer', suffixes = (None, '_y'))
d = pd.merge(d, pd.read_parquet(data_dir / 'encounters.parquet'),
             on = 'Unique Identifier', how = 'outer', suffixes = (None, '_z'))

# assume duplicate column names represent duplicate information when possible
for suffix in ['_x', '_y', '_z']:
    for c in d.columns:
        if c.endswith(suffix):
            print(c, c[:-2])
            d[c] = pd.Series([x if x is not None else y for (x, y) in zip(d[c], d[c[:-2]])], index=d.index)
    d = d[[c for c in d.columns if not c.endswith(suffix)]].copy()
d.head()

Final Program_x Final Program
Case Status_x Case Status
Case Category_x Case Category
Departed Date_x Departed Date
Departure Country_x Departure Country
Final Order Yes No_x Final Order Yes No
Final Order Date_x Final Order Date
Birth Year_x Birth Year
Citizenship Country_x Citizenship Country
Gender_x Gender
Departure Country_y Departure Country
Departed Date_y Departed Date
Case Status_y Case Status
Detention Facility_y Detention Facility
Detention Facility Code_y Detention Facility Code
Gender_y Gender
Citizenship Country_y Citizenship Country
Birth Year_y Birth Year
Entry Status_y Entry Status
Final Program_y Final Program
Apprehension Method_y Apprehension Method
Final Order Date_y Final Order Date
Apprehension Date_y Apprehension Date
Final Order Yes No_y Final Order Yes No
Final Program_z Final Program
Case Status_z Case Status
Case Category_z Case Category
Departed Date_z Departed Date
Departure Country_z Departure Country
Final Order Yes No_z Final Order Yes No
Final Order Da

Unnamed: 0,Stay Book In Date Time,Book In Date Time,Detention Facility,Detention Facility Code,Detention Book Out Date Time,Stay Book Out Date Time,Detention Release Reason,Stay Book Out Date,Stay Release Reason,Religion,...,Census Region,Total Sentence Days,Apprehension Month,Event Date,Responsible AOR,Event Type,Encounter Criminality,Deported,Days After Start,Event Month
0,NaT,NaT,,,NaT,NaT,,NaT,,,...,Midwest,,,2023-09-26,Chicago Area of Responsibility,CAP State,1 Convicted Criminal,False,25.0,September
1,NaT,NaT,,,NaT,NaT,,NaT,,,...,Midwest,,,2023-10-31,Chicago Area of Responsibility,Fugitive Operations (Event),3 Other Immigration Violator,False,60.0,October
2,2024-07-04 02:15:00,2024-07-04 02:15:00,FLORENCE SPC,FLO,2024-07-04 10:00:00,2024-07-04 10:00:00,Removed,2024-07-04,Removed,,...,,,,NaT,,,,,,
3,NaT,NaT,,,NaT,NaT,,NaT,,,...,South,0.0,,2024-07-05,Miami Area of Responsibility,287(g) Program Activities,2 Pending Criminal Charges,False,308.0,July
4,NaT,NaT,,,NaT,NaT,,NaT,,,...,,,,2024-11-04,Chicago Area of Responsibility,CAP Local,3 Other Immigration Violator,False,430.0,November


In [90]:
d['STAY_TIME_DAYS'] = (d['Stay Book Out Date Time'] - d['Stay Book In Date Time']).apply(lambda x: x.days)
d = d[d['STAY_TIME_DAYS'] < 3000]   # remove erroneous outlier
d['IS_DEPORTED'] = (~d['Departure Country'].isna() & ~d['Departed Date'].isna()).apply(lambda x: 1 if x else 0)

In [91]:
d.columns

Index(['Stay Book In Date Time', 'Book In Date Time', 'Detention Facility',
       'Detention Facility Code', 'Detention Book Out Date Time',
       'Stay Book Out Date Time', 'Detention Release Reason',
       'Stay Book Out Date', 'Stay Release Reason', 'Religion', 'Gender',
       'Marital Status', 'Birth Date', 'Birth Year', 'Ethnicity',
       'Entry Status', 'Felon', 'Bond Posted Date', 'Bond Posted Amount',
       'Case Status', 'Case Category', 'Final Order Yes No',
       'Final Order Date', 'Case Threat Level', 'Book In Criminality',
       'Final Charge', 'Departed Date', 'Departure Country',
       'Initial Bond Set Amount', 'Citizenship Country', 'Final Program',
       'MSC Charge', 'Unique Identifier', 'Apprehension Date',
       'Apprehension State', 'Apprehension AOR', 'Apprehension Method',
       'Apprehension Criminality', 'Apprehension Site Landmark',
       'Detainer Prepare Date', 'Facility State', 'Facility AOR',
       'Port of Departure', 'Detainer Prepared Cr

In [107]:
%%time

# apriori_input_cols = ['IS_DEPORTED', 'Apprehension Criminality', 'Book In Criminality', 'Apprehension AOR',
#                       'Entry Status', 'Case Status', 'Marital Status', 'Departure Country',  # 'Gender', 
#                       'Citizenship Country', 'Final Program', 'MSC Charge', 'Felon']
# apriori_input_cols = ['IS_DEPORTED', 'Apprehension AOR', 'Entry Status', 'Citizenship Country', 'MSC Charge', 'Marital Status', 'Felon']
apriori_input_cols = ['Census Region', 'Entry Status', 'Citizenship Country',
                      'Most Serious Conviction (MSC) Charge', 'Apprehension Criminality', 'Deportation Ordered Yes No',
                      'Detention Facility', 'MSC Sentence Years', 'Marital Status', 'Felon']
# try to remove missing values since the results were dominated by that.
dataset_rows = [[f'{k}: {v}' for k, v in r.items()
                 if type(v) == str and str(v) != 'nan' and str(v) != 'None' and str(v) != 'Not Applicable' and str(v) != 'Not  Applicable']
                for i, r in d[apriori_input_cols].iterrows()]

CPU times: total: 3min 2s
Wall time: 3min 4s


In [108]:
len(dataset_rows)

2327205

In [109]:
%%time

te = TransactionEncoder()
arrests_detentions_df = te.fit(dataset_rows).transform(dataset_rows, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(arrests_detentions_df, columns=te.columns_)
sparse_df.head()

rules_results = apriori(sparse_df, min_support=0.01, use_colnames=True, verbose=1, max_len=5)



Processing 2085 combinations | Sampling itemset size 5
CPU times: total: 3min 1s
Wall time: 3min 4s


In [110]:
rules_results['itemsets_size'] = rules_results['itemsets'].apply(len)
# remove item sets that are a single item
rules_results = rules_results[rules_results['itemsets_size'] > 1]
rules_results['itemsets_list'] = rules_results['itemsets'].apply(lambda x: sorted(list(x)))
rules_results['first_item'] = rules_results['itemsets_list'].apply(lambda ls: ls[0])
rules_results['itemsets_list_str'] = rules_results['itemsets_list'].apply(lambda ls: ', '.join(ls))
rules_results = rules_results.sort_values(by = 'first_item', ascending=True)
rules_results.head()

Unnamed: 0,support,itemsets,itemsets_size,itemsets_list,first_item,itemsets_list_str
62,0.037541,(Apprehension Criminality: 1 Convicted Crimina...,2,[Apprehension Criminality: 1 Convicted Crimina...,Apprehension Criminality: 1 Convicted Criminal,Apprehension Criminality: 1 Convicted Criminal...
256,0.012634,(Detention Facility: MONTGOMERY PROCESSING CTR...,3,[Apprehension Criminality: 1 Convicted Crimina...,Apprehension Criminality: 1 Convicted Criminal,Apprehension Criminality: 1 Convicted Criminal...
257,0.197743,(Apprehension Criminality: 1 Convicted Crimina...,3,[Apprehension Criminality: 1 Convicted Crimina...,Apprehension Criminality: 1 Convicted Criminal,Apprehension Criminality: 1 Convicted Criminal...
258,0.039246,(Apprehension Criminality: 1 Convicted Crimina...,3,[Apprehension Criminality: 1 Convicted Crimina...,Apprehension Criminality: 1 Convicted Criminal,Apprehension Criminality: 1 Convicted Criminal...
259,0.134696,(Apprehension Criminality: 1 Convicted Crimina...,3,[Apprehension Criminality: 1 Convicted Crimina...,Apprehension Criminality: 1 Convicted Criminal,Apprehension Criminality: 1 Convicted Criminal...


In [111]:
rules_results.shape

(426, 6)

In [112]:
rules_results[['support', 'itemsets_size', 'itemsets_list_str']].to_csv('../out/rules_assoc_view.csv', index=False)

In [113]:
# what proportion of itemsets involve citizenship?
rules_results[rules_results['itemsets'].apply(lambda x: any(['Citizenship' in _ for _ in x]))].shape[0] / rules_results.shape[0]

0.4624413145539906

In [114]:
d['Census Region'].value_counts()

Apprehension AOR
Houston Area of Responsibility           172104
Miami Area of Responsibility             171576
New Orleans Area of Responsibility       146082
Dallas Area of Responsibility            127219
Chicago Area of Responsibility           126597
Atlanta Area of Responsibility           111844
Los Angeles Area of Responsibility       109023
San Antonio Area of Responsibility        97819
Phoenix Area of Responsibility            85011
Salt Lake City Area of Responsibility     84699
Newark Area of Responsibility             57605
Washington Area of Responsibility         55408
Boston Area of Responsibility             54588
Harlingen Area of Responsibility          43206
Philadelphia Area of Responsibility       42245
St. Paul Area of Responsibility           41943
San Francisco Area of Responsibility      36785
New York City Area of Responsibility      28139
Denver Area of Responsibility             26002
El Paso Area of Responsibility            21925
Detroit Area of Respons

In [115]:
citizenship_top10 = d['Citizenship Country'].value_counts().head(10)
cit_top10_d = pd.DataFrame({'n': citizenship_top10}, index=citizenship_top10.index)
cit_top10_d['proportion'] = cit_top10_d['n'] / d.shape[0]
cit_top10_d

Unnamed: 0_level_0,n,proportion
Citizenship Country,Unnamed: 1_level_1,Unnamed: 2_level_1
MEXICO,792812,0.340671
GUATEMALA,305849,0.131423
HONDURAS,286765,0.123223
VENEZUELA,152744,0.065634
EL SALVADOR,116093,0.049885
COLOMBIA,101834,0.043758
ECUADOR,91175,0.039178
NICARAGUA,65412,0.028108
DOMINICAN REPUBLIC,55126,0.023688
PERU,37598,0.016156


In [116]:
msc_top10 = citizenship_top10 = d['Most Serious Conviction (MSC) Charge'].value_counts().head(10)
msc_top10_d = pd.DataFrame({'n': msc_top10}, index=msc_top10.index)
msc_top10_d['proportion'] = msc_top10_d['n'] / d.shape[0]
msc_top10_d

Unnamed: 0_level_0,n,proportion
Most Serious Conviction (MSC) Charge,Unnamed: 1_level_1,Unnamed: 2_level_1
TRAFFIC,116560,0.050086
OTHER,102987,0.044254
ASSAULT,77918,0.033481
IMMIGRATION,57472,0.024696
THEFT_FRAUD,55943,0.024039
DRUG_TRAFFICK,29436,0.012649
DRUG_POSSESSION,29257,0.012572
SEX_OFFENSE,25147,0.010806
PUBLIC_ORDER,24883,0.010692
BURGLARY,22890,0.009836


In [117]:
entry_top10 = d['Entry Status'].value_counts().head(10)
entry_top10_d = pd.DataFrame({'n': entry_top10}, index=entry_top10.index)
entry_top10_d['proportion'] = entry_top10_d['n'] / d.shape[0]
# PWA = Present Without Admission
entry_top10_d

Unnamed: 0_level_0,n,proportion
Entry Status,Unnamed: 1_level_1,Unnamed: 2_level_1
Not Applicable,1562376,0.671353
PWA Mexico,481843,0.207048
Other Applicant for Admission,23828,0.010239
No Documents,23370,0.010042
PWA Canada,10676,0.004587
PWA Other,8048,0.003458
Present Without Admission,7672,0.003297
Non-Immigrant,7317,0.003144
Legal Permanent Resident,2600,0.001117
Parolee,1758,0.000755


In [83]:
# the dataset is very skewed for attributes we might want to look at rules associations for.
# we are seeing the problem where potentially high utility items are very low support?