In [None]:
import os
# Make the project checkout the working directory: later cells use relative
# paths ('clean_all.py', 'ice_data/clean_data/...') that resolve against it.
# NOTE(review): absolute, machine-specific path — must be edited before the
# notebook can run on any other machine.
os.chdir("/home/wes/projects/ml_project_final/ml-theory-project")

In [None]:
import pathlib
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans

In [None]:
# Execute clean_all.py inside this kernel before the cleaned parquet files are read.
# NOTE(review): presumably this (re)generates ice_data/clean_data/*.parquet —
# confirm against the script.
%run clean_all.py

In [None]:
# Outer-join the four cleaned tables on 'Unique Identifier'. The accumulated
# left frame keeps its column names; clashing columns coming from each
# right-hand table receive that table's suffix ('_x', '_y' or '_z').
data_dir = pathlib.Path('ice_data/clean_data')
d = pd.read_parquet(data_dir / 'detentions.parquet')
for right_file, right_suffix in (('arrests.parquet', '_x'),
                                 ('detainers.parquet', '_y'),
                                 ('encounters.parquet', '_z')):
    d = pd.merge(d, pd.read_parquet(data_dir / right_file),
                 on='Unique Identifier', how='outer',
                 suffixes=(None, right_suffix))

# assume duplicate column names represent duplicate information when possible
#
# For every suffixed column ('<name>_x', '<name>_y', '<name>_z') created by the
# merges, fold it into its unsuffixed twin '<name>': the suffixed value is used
# where it is present, otherwise the existing base value is kept. The result is
# stored in the BASE column, because the suffixed columns are dropped at the end
# of each pass (writing into the suffixed column would discard the fill).
# Missing values are detected with pandas' null semantics, so the NaN/NaT that
# an outer merge produces count as missing, not only None.
for suffix in ['_x', '_y', '_z']:
    # Snapshot the matching names first so the loop never iterates over a
    # column index that is being assigned to.
    suffixed_cols = [c for c in d.columns if c.endswith(suffix)]
    for c in suffixed_cols:
        base = c[:-len(suffix)]
        print(c, base)
        d[base] = d[c].combine_first(d[base])
    d = d[[c for c in d.columns if not c.endswith(suffix)]].copy()
d.head()

In [None]:
# Cache the merged frame next to the cleaned inputs. The target is expressed
# through data_dir (relative to the project root set by os.chdir) instead of a
# machine-specific absolute path, matching how the input files are located.
d.to_parquet(data_dir / 'merged_data.parquet')

In [None]:
# Load the cached merged frame. The relative path resolves against the project
# root set by os.chdir in the first cell, consistent with the other parquet
# reads in this notebook, and avoids a machine-specific absolute path.
df = pd.read_parquet('ice_data/clean_data/merged_data.parquet')

In [None]:
# Work on the first 100,000 rows only. An explicit copy is taken so that
# columns assigned on df_copy later are written to an independent frame rather
# than to a slice of df (which triggers pandas' SettingWithCopyWarning).
df_copy = df.iloc[0:100000].copy()

In [None]:
# (rows, columns) of the full merged frame loaded from merged_data.parquet.
df.shape

In [None]:
# Cluster a handful of numeric and categorical attributes with DBSCAN and
# compute the deportation rate inside each resulting cluster.
features_num = ['Age', 'Case Threat Level',
                'Biometric Match Yes No', 'Statements Made Yes No',
                'Resume Custody Yes No']
features_cat = ['Final Program', 'Census Region']
sc = StandardScaler()

# Build the modelling frame WITHOUT writing into df_copy: the True/False ->
# 1/0 mapping is applied on a derived frame, so this cell neither performs a
# chained assignment on a slice (SettingWithCopyWarning) nor depends on
# whether it has already been run once. Rows with any missing feature or a
# missing/unmapped 'Deported' value are dropped.
db_df = (
    df_copy[features_num + features_cat + ['Deported']]
    .assign(Deported=lambda frame: frame['Deported'].map({True: 1, False: 0}))
    .dropna()
    .copy()
)
X_db = db_df[features_num + features_cat]

# Standardise the numeric columns and one-hot encode the categorical ones.
db_pre = ColumnTransformer([
    ('numerical_features', sc, features_num),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), features_cat),
])

X_db_proc = db_pre.fit_transform(X_db)

# NOTE(review): eps and min_samples are hand-picked values — no tuning is
# visible in this notebook.
db = DBSCAN(eps=0.5, min_samples=80, n_jobs=-1)

db_labels = db.fit_predict(X_db_proc)
db_df['cluster_dbscan'] = db_labels

# Mean of the 0/1 'Deported' flag per label = share of rows deported in that
# cluster (DBSCAN reports noise points under the label -1).
cluster_risks = db_df.groupby('cluster_dbscan')['Deported'].mean()


In [None]:
# Tabulate, per DBSCAN label, how many rows landed in the cluster and what
# share of them were deported, then show clusters from highest to lowest rate.
cluster_sizes = db_df['cluster_dbscan'].value_counts().sort_index()
summary = pd.DataFrame(dict(
    cluster_size=cluster_sizes,
    deportation_rate=cluster_risks.astype(float),
))
summary.sort_values(by='deportation_rate', ascending=False)

In [None]:
# Rebuild the per-cluster summary and draw one bar per cluster, ordered by
# deportation rate; each bar is labelled on the x axis with its cluster's size.
cluster_sizes = db_df['cluster_dbscan'].value_counts().sort_index()
summary = pd.DataFrame(dict(
    cluster_size=cluster_sizes,
    deportation_rate=cluster_risks.astype(float),
))
(
    summary
    .sort_values(by='deportation_rate', ascending=False)
    .reset_index()
    .plot.bar(x='cluster_size', y='deportation_rate')
)

In [None]:

def _most_common(values):
    """Return the most frequent value of a column within one cluster."""
    return values.value_counts().index[0]


# Describe each cluster by the mean of its numeric features, the most common
# value of each categorical feature, and its deportation rate.
by_cluster = db_df.groupby('cluster_dbscan')
num_profile = by_cluster[features_num].mean()
cat_profile = by_cluster[features_cat].agg(_most_common)
cluster_profile = pd.concat(
    [num_profile, cat_profile, cluster_risks.rename('deportation_rate')],
    axis=1,
)


In [None]:
# Load the cleaned detentions table on its own for inspection. The relative
# path resolves against the project root set by os.chdir in the first cell,
# consistent with the other parquet reads, instead of a machine-specific
# absolute path.
test_df = pd.read_parquet('ice_data/clean_data/detentions.parquet')

In [None]:
# Alphabetical list of the column names in the detentions table.
sorted(test_df.columns)

In [None]:
# db_df only contains the clustering features, 'Deported' and
# 'cluster_dbscan', so indexing it with 'Apprehension Criminality' raises
# KeyError. Look the column up in df_copy (the frame db_df was derived from),
# restricted to the rows that survived into db_df.
# NOTE(review): assumes the merged frame carries an 'Apprehension Criminality'
# column — confirm against the cleaned tables.
df_copy.loc[db_df.index, 'Apprehension Criminality'].value_counts()

In [None]:
# The 25 cluster profiles with the highest deportation rate.
cluster_profile.sort_values(by='deportation_rate', ascending=False).iloc[:25]

In [None]:
# The last 25 cluster profiles when ordered by descending deportation rate.
cluster_profile.sort_values(by='deportation_rate', ascending=False).iloc[-25:]

In [None]:
"""
RUN #1
- features: 'Responsible AOR', 'Event Type', 'Final Program', 'Encounter Criminality', 'Birth Year', 'Citizenship Country', 'Gender', 'Days After Start'
- train_test_split(stratify=y)
- Training accuracy: 0.9928
- Test accuracy: 0.8413

todo:
- try this after combining rows with the same id, and use # encounters as a feature
- add encounter month (would be last encounter month)
"""

from utils import log

log("importing ML libraries")
import pandas as pd
import clean_utils
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

log("loading clean data")
# NOTE: this rebinds `df` (previously the merged frame) to the encounters table.
df = pd.read_parquet('ice_data/clean_data/encounters.parquet')

features = ['Responsible AOR', 'Event Type', 'Final Program', 'Encounter Criminality', 'Birth Year', 'Citizenship Country', 'Gender', 'Days After Start']
target = 'Deported'

log("one-hot encoding")
# drop_first removes one of the dummy variables to avoid multi-collinearity
X = pd.get_dummies(df[features], drop_first=True)
y = df[target].map({True: 1, False: 0})
log("%d features in new set" % len(X.columns))

log("test-train split")
# Both the split and the forest are seeded so the logged accuracies are
# reproducible across Restart & Run All; 42 matches the seed used by the
# decision-tree cell. The accuracies recorded in the notes at the top of this
# cell came from an unseeded run, so the exact figures may differ slightly.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

log("fitting random forest")
model = RandomForestClassifier(n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

log(f"Training accuracy: {model.score(X_train, y_train):.4f}")
log(f"Test accuracy: {model.score(X_test, y_test):.4f}")


In [None]:
# Feature columns fed to the random forest above.
features

In [None]:
# Inspect the encounters frame loaded by the random-forest cell.
df

In [None]:
# Predictions for every row of X — this includes the rows the model was fitted on.
model.predict(X)

In [None]:
# Row position that marks 80% of df; the decision-tree cell below cuts its
# sequential train/test split at this same expression.
int(df.shape[0]*.8)

In [None]:
# %% ARGS

output_image_path = 'out/encounters_decision_tree.png'
input_parquet = 'ice_data/clean_data/encounters.parquet'

# %%
from utils import log

log("importing ML libraries")
import pandas as pd
import clean_utils
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

df = pd.read_parquet(input_parquet)

# features = ['Gender', 'Days After Start', 'Encounter Criminality']
features = ['Responsible AOR', 'Event Type', 'Final Program', 'Encounter Criminality', 'Birth Year', 'Citizenship Country', 'Gender', 'Days After Start']
target = 'Deported'

log("creating dummy variables")
X = pd.get_dummies(df[features], drop_first=True)

# Sequential (unshuffled) 80/20 split: the first 80% of rows train the tree and
# the remaining rows are held out. The cut position is computed once so the
# four slices cannot drift apart.
# NOTE(review): presumably the row order is meaningful (e.g. chronological) —
# otherwise a shuffled/stratified split would be the safer choice; confirm.
split_at = int(df.shape[0]*.8)
X_train = X.iloc[:split_at]
y_train = df[target].iloc[:split_at]
X_test = X.iloc[split_at:]
y_test = df[target].iloc[split_at:]

log("fitting decision tree")
model = DecisionTreeClassifier(random_state=42, max_depth=5)
model.fit(X_train, y_train)

log(f"Decision tree trained with {len(features)} features")
log(f"Training accuracy: {model.score(X_train, y_train):.4f}")
# Held-out accuracy, scored on the matching X_test / y_test pair.
log(f"Testing accuracy: {model.score(X_test, y_test):.4f}")
log("%d dummy variables" % len(X.columns))
model.predict(X_test)
# %%
# import os
# os.makedirs('out', exist_ok=True)

# plt.figure(figsize=(30, 20))
# plot_tree(model,
#           feature_names=X.columns,
#           class_names=['Not Deported', 'Deported'],
#           filled=True,
#           rounded=True,
#           label='none',
#           fontsize=7,
#           impurity=False,  # Hide gini/entropy values
#           proportion=True)  # Show proportions instead of sample counts
# plt.title("Decision Tree Classifier Visualization")
# # plt.show()
# plt.savefig(output_image_path, dpi=200, bbox_inches='tight')
# log("Tree visualization saved to %s" % output_image_path)


In [None]:
import sklearn

In [None]:
# Accuracy of the current `model` on the held-out split. accuracy_score is
# imported explicitly: reaching it as `sklearn.metrics.accuracy_score` after a
# bare `import sklearn` depends on the metrics submodule already having been
# loaded (by another sklearn import or by the installed version's lazy
# loading), which is not guaranteed on every scikit-learn release.
from sklearn.metrics import accuracy_score

accuracy_score(y_test, model.predict(X_test))