In [None]:
import pathlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets
# https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#example-3-working-with-sparse-representations
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [None]:
# Build (or reload) one wide frame that outer-joins the detentions, arrests,
# detainers and encounters tables on 'Unique Identifier'. The merged frame is
# cached as parquet so the merges only run when the cache file is absent.
# NOTE: a cache written by an earlier version of this cell is reused as-is;
# delete all_merge.parquet to rebuild it with the coalescing below.
data_dir = pathlib.Path('../clean_data')
backup_file = data_dir / 'all_merge.parquet'
if not backup_file.exists():
    d = pd.merge(pd.read_parquet(data_dir / 'detentions.parquet'),
                 pd.read_parquet(data_dir / 'arrests.parquet'),
                 on='Unique Identifier', how='outer', suffixes=(None, '_x'))
    d = pd.merge(d, pd.read_parquet(data_dir / 'detainers.parquet'),
                 on='Unique Identifier', how='outer', suffixes=(None, '_y'))
    d = pd.merge(d, pd.read_parquet(data_dir / 'encounters.parquet'),
                 on='Unique Identifier', how='outer', suffixes=(None, '_z'))

    # assume duplicate column names represent duplicate information when possible:
    # keep the un-suffixed column and fill its missing cells from the suffixed twin.
    for suffix in ['_x', '_y', '_z']:
        duplicate_cols = [c for c in d.columns if c.endswith(suffix)]
        for c in duplicate_cols:
            base = c[:-len(suffix)]
            print(c, base)
            # Write into the base column (the suffixed one is dropped below), and
            # use notna() so NaN/NaT produced by the outer merges count as missing.
            d[base] = d[base].where(d[base].notna(), d[c])
        d = d.drop(columns=duplicate_cols)
    d.to_parquet(backup_file)
else:
    d = pd.read_parquet(backup_file)
d.head()

In [None]:
# Whole days between book-in and book-out (NaN when either timestamp is missing).
# assumes both columns are datetime64 so the difference is a timedelta — TODO confirm
d['STAY_TIME_DAYS'] = (d['Stay Book Out Date Time'] - d['Stay Book In Date Time']).dt.days
# remove erroneous outlier; .copy() detaches the filtered rows so the column
# assignment below does not write onto a slice of the old frame.
# NOTE(review): rows with a NaN STAY_TIME_DAYS also fail this comparison and are
# dropped here — confirm that excluding them is intended.
d = d[d['STAY_TIME_DAYS'] < 3000].copy()
# 1 when both a departure country and a departure date are recorded, otherwise 0.
d['IS_DEPORTED'] = (d['Departure Country'].notna() & d['Departed Date'].notna()).astype(int)

In [None]:
# List the column labels of d.
d.columns

In [None]:
%%time

# apriori_input_cols = ['IS_DEPORTED', 'Apprehension Criminality', 'Book In Criminality', 'Apprehension AOR',
#                       'Entry Status', 'Case Status', 'Marital Status', 'Departure Country',  # 'Gender', 
#                       'Citizenship Country', 'Final Program', 'MSC Charge', 'Felon']
# apriori_input_cols = ['IS_DEPORTED', 'Apprehension AOR', 'Entry Status', 'Citizenship Country', 'MSC Charge', 'Marital Status', 'Felon']
apriori_input_cols = ['Census Region', 'Entry Status', 'Citizenship Country',
                      'Most Serious Conviction (MSC) Charge', 'Apprehension Criminality', 'Deportation Ordered Yes No',
                      'Detention Facility', 'MSC Sentence Years', 'Marital Status', 'Felon',
                      'Case Final Order Yes No', 'Statements Made Yes No', ]
# try to remove missing values since the results were dominated by that.
# each 'Unique Identifier' has one collection of features
dataset_rows = [list(set([f'{k}: {v}' 
                          for _, r in sub_df.iterrows()
                          for k, v in r.items()
                          if type(v) == str and str(v) != 'nan' and str(v) != 'None' and str(v) != 'Not Applicable' and str(v) != 'Not  Applicable']))
                for i, sub_df in d[['Unique Identifier'] + apriori_input_cols].groupby('Unique Identifier')]

In [None]:
# Number of transactions built above (one list per 'Unique Identifier' group).
len(dataset_rows)

In [None]:
%%time

te = TransactionEncoder()
arrests_detentions_df = te.fit(dataset_rows).transform(dataset_rows, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(arrests_detentions_df, columns=te.columns_)
sparse_df.head()

rules_results = apriori(sparse_df, min_support=0.01, use_colnames=True, verbose=1, max_len=5)

In [None]:
# Add helper columns describing each itemset, keep only multi-item sets, and
# order the table by the alphabetically first item of each set.
rules_results['itemsets_size'] = rules_results['itemsets'].apply(len)
# remove item sets that are a single item; .copy() detaches the filtered rows so
# the column assignments below do not write onto a slice of the old frame.
rules_results = rules_results[rules_results['itemsets_size'] > 1].copy()
rules_results['itemsets_list'] = rules_results['itemsets'].apply(lambda items: sorted(items))
rules_results['first_item'] = rules_results['itemsets_list'].apply(lambda ls: ls[0])
rules_results['itemsets_list_str'] = rules_results['itemsets_list'].apply(', '.join)
rules_results = rules_results.sort_values(by='first_item', ascending=True)
rules_results.head()

In [None]:
# (rows, columns) of the itemset table.
rules_results.shape

In [None]:
# Export the itemsets in rules_results, lowest support first, for viewing outside the notebook.
rules_csv_path = pathlib.Path('../out/rules_assoc_view.csv')
# to_csv does not create missing directories, so make sure '../out' exists first.
rules_csv_path.parent.mkdir(parents=True, exist_ok=True)
rules_results[['support', 'itemsets_size', 'itemsets_list_str']].sort_values(by='support').to_csv(rules_csv_path, index=False)

In [None]:
# Fraction of the itemsets that contain at least one item mentioning 'Citizenship'.
involves_citizenship = rules_results['itemsets'].apply(
    lambda itemset: any('Citizenship' in item for item in itemset))
len(rules_results[involves_citizenship]) / len(rules_results)

In [None]:
# Row counts per 'Census Region' value in d.
d['Census Region'].value_counts()

In [None]:
# Ten most common citizenship countries, counting each identifier/country pair once,
# together with each count's share of all distinct identifiers in d.
citizenship_pairs = d[['Citizenship Country', 'Unique Identifier']].drop_duplicates()
citizenship_top10 = citizenship_pairs['Citizenship Country'].value_counts().head(10)
cit_top10_d = citizenship_top10.to_frame(name='n')
total_identifiers = d['Unique Identifier'].nunique()
cit_top10_d['proportion'] = cit_top10_d['n'] / total_identifiers
cit_top10_d

In [None]:
# First five rows of the citizenship table.
cit_top10_d.head(5)

In [None]:
# Render the first five rows of the citizenship table as an HTML widget.
cit_top5_markup = cit_top10_d.head(5).to_html(index=True)
ipywidgets.HTML(cit_top5_markup)

In [None]:
# 60% of the dataset represented by people from 3 citizenships
# NOTE(review): the expression below sums the first 5 rows, not 3 — confirm which
# count the statement above is meant to describe.
cit_top10_d.iloc[:5]['proportion'].sum().item()

In [None]:
# Ten most common 'Most Serious Conviction (MSC) Charge' values, counting each
# identifier/charge pair once, with each count's share of all distinct identifiers in d.
msc_pairs = d[['Most Serious Conviction (MSC) Charge', 'Unique Identifier']].drop_duplicates()
msc_top10 = msc_pairs['Most Serious Conviction (MSC) Charge'].value_counts().head(10)
msc_top10_d = msc_top10.to_frame(name='n')
msc_total_identifiers = d['Unique Identifier'].nunique()
msc_top10_d['proportion'] = msc_top10_d['n'] / msc_total_identifiers
msc_top10_d

In [None]:
# Ten most common 'Entry Status' values, counting each identifier/status pair once,
# with each count's share of all distinct identifiers in d.
# PWA = Present Without Admission
entry_pairs = d[['Entry Status', 'Unique Identifier']].drop_duplicates()
entry_top10 = entry_pairs['Entry Status'].value_counts().head(10)
entry_top10_d = entry_top10.to_frame(name='n')
entry_total_identifiers = d['Unique Identifier'].nunique()
entry_top10_d['proportion'] = entry_top10_d['n'] / entry_total_identifiers
entry_top10_d

In [None]:
# The dataset is heavily skewed on the attributes we might want association rules for.
# Are we hitting the problem where potentially high-utility items have very low support?

In [None]:
# Count how many itemsets in rules_results each individual item appears in.
rules_results['itemsets'].explode().value_counts()

In [None]:
# Count 'Felon' values after de-duplicating identifier/value pairs, so each
# identifier contributes at most once per value.
d[['Felon', 'Unique Identifier']].drop_duplicates()['Felon'].value_counts()

In [None]:
# Count 'Apprehension Criminality' values after de-duplicating identifier/value
# pairs, so each identifier contributes at most once per value.
d[['Apprehension Criminality', 'Unique Identifier']].drop_duplicates()['Apprehension Criminality'].value_counts()

In [None]:
# Distinct identifiers per 'Apprehension Criminality' value and their share of
# all distinct identifiers in d, rendered as an HTML widget.
app_crim_pairs = d[['Apprehension Criminality', 'Unique Identifier']].drop_duplicates()
app_crim_d = (
    app_crim_pairs
    .groupby('Apprehension Criminality')
    .nunique()
    .rename(columns={'Unique Identifier': 'n'})
)
app_crim_d['proportion'] = app_crim_d['n'] / d['Unique Identifier'].nunique()
ipywidgets.HTML(app_crim_d.to_html())

In [None]:
# Distinct identifiers per 'Felon' value, largest group first, with each group's
# share of all distinct identifiers in d, rendered as an HTML widget.
felon_pairs = d[['Felon', 'Unique Identifier']].drop_duplicates()
felon_d = (
    felon_pairs
    .groupby('Felon')
    .nunique()
    .rename(columns={'Unique Identifier': 'n'})
    .sort_values(by='n', ascending=False)
)
felon_d['proportion'] = felon_d['n'] / d['Unique Identifier'].nunique()
ipywidgets.HTML(felon_d.to_html())

In [None]:
# Show the ten itemsets with the highest support as an HTML table widget.
top10_by_support = rules_results.sort_values(by='support', ascending=False).head(10)
ipywidgets.HTML(top10_by_support[['support', 'itemsets_list_str']].to_html(index=False))

In [None]:
# Itemsets whose text contains 'Entry Status:' but not 'Entry Status: PWA Mexico'.
itemset_text = rules_results['itemsets_list_str']
has_pwa_mexico = itemset_text.str.contains('Entry Status: PWA Mexico')
has_entry_status = itemset_text.str.contains('Entry Status:')
entry_status_other = rules_results[~has_pwa_mexico & has_entry_status]
print(entry_status_other[['support', 'itemsets_list_str']].to_string())

In [None]:
# Itemsets whose text contains 'GUATEMALA' but not 'Entry Status: PWA Mexico',
# ordered by ascending support.
itemset_text = rules_results['itemsets_list_str']
has_pwa_mexico = itemset_text.str.contains('Entry Status: PWA Mexico')
has_guatemala = itemset_text.str.contains('GUATEMALA')
guatemala_other_entry = rules_results[~has_pwa_mexico & has_guatemala]
print(guatemala_other_entry[['support', 'itemsets_list_str']].sort_values(by='support').to_string())

In [None]:
# Itemsets whose text contains both 'Entry Status: PWA Mexico' and 'Citizenship',
# ordered by ascending support.
itemset_text = rules_results['itemsets_list_str']
has_pwa_mexico = itemset_text.str.contains('Entry Status: PWA Mexico')
has_citizenship = itemset_text.str.contains('Citizenship')
pwa_mexico_with_citizenship = rules_results[has_pwa_mexico & has_citizenship]
print(pwa_mexico_with_citizenship[['support', 'itemsets_list_str']].sort_values(by='support').to_string())