In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import datetime, re, pickle
from sklearn.linear_model import LogisticRegression

In [2]:
PROCESSED_DATASET_CSV_FILE = './processed_datasets/basic_model.csv'
PROCESSED_DATASET_PKL_FILE = './processed_datasets/basic_model.pkl'
RAW_COFFENDING_CSV_FILE = './raw_datasets/Cooffending.csv'
PROCESSED_CRIME_LABEL_CSV_FILE = './processed_datasets/crime_type_mapping.csv'
PROCESSED_CRIME_LABEL_PKL_FILE = './processed_datasets/crime_type_mapping.pkl'


## Load Preprocessed Data


In [3]:
training_data = None
with open(PROCESSED_DATASET_PKL_FILE, 'rb') as file_obj:
    training_data = pickle.load(file_obj)

FileNotFoundError: [Errno 2] No such file or directory: './processed_datasets/basic_model.pkl'

## Preprocessing (*skip this if you just want to use the post-processed data*)

In [4]:
co_offending_table = pd.read_csv(RAW_COFFENDING_CSV_FILE)

# Remove duplicate rows
co_offending_table.drop_duplicates(inplace=True)

# Format the date column as a python datetime object
co_offending_table['Date'] = co_offending_table.Date.apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))

# Add a column for the number of arrests of each offender
co_offending_table['ArrestCount'] = co_offending_table.groupby('NoUnique')['SeqE'].transform('count')

In [5]:
crime_labels = None
with open(PROCESSED_CRIME_LABEL_PKL_FILE, 'rb') as file_obj:
    crime_labels = pickle.load(file_obj)

In [6]:
def process_second_arrests(df):
    if len(df) == 1: 
        return None
    else:
        return df.iloc[1]
    
def process_seq_numbers(x):
    if np.isnan(x):
        return 0
    else:
        return 1

In [None]:
def build_table_of_first_two_arrests(co_offending):
    # first sort by offender and date
    co_offending = co_offending.sort_values(by=['NoUnique', 'Date'])
    # this gets the first arrest of each offender (which is gauranteed to exist)
    print('sorted')
    first_arrest = co_offending.groupby('NoUnique').apply(lambda x: x.iloc[0])
    first_arrest.columns = ['first_arrest_'+str(x) for x in first_arrest.columns]
    print(first_arrest)
    # this gets the second arrest of each offender or NaN if the offender does not exist
    second_arrests = co_offending.groupby('NoUnique').apply(process_second_arrests)
    second_arrests.columns = ['second_arrest_'+str(x) for x in second_arrests.columns]
    print(second_arrests)
    # combine these two dataframe 
    first_and_second_arrest_data = pd.merge(first_arrest, second_arrests, how='outer', left_index=True, right_index=True)
    # add a column with a binary variable 0/1 for whether arrested again
    first_and_second_arrest_data['arrested_again'] = first_and_second_arrest_data.second_arrest_NoUnique.apply(process_seq_numbers)
    
    
    return first_and_second_arrest_data

In [None]:
# build data frame
training_data = build_table_of_first_two_arrests(co_offending_table)

# format data
training_data.first_arrest_SeqE = training_data.first_arrest_SeqE.astype('category')
training_data.first_arrest_SEXE = training_data.first_arrest_SEXE.astype('category')
training_data.first_arrest_NCD1 = training_data.first_arrest_NCD1.astype('category')
training_data.first_arrest_NCD2 = training_data.first_arrest_NCD2.astype('category')
training_data.first_arrest_NCD3 = training_data.first_arrest_NCD3.astype('category')
training_data.first_arrest_NCD4 = training_data.first_arrest_NCD4.astype('category')
training_data.first_arrest_MUN = training_data.first_arrest_MUN.astype('category')
training_data.first_arrest_ED1 = training_data.first_arrest_ED1.astype('category')
training_data.second_arrest_SeqE = training_data.second_arrest_SeqE.astype('category')
training_data.second_arrest_SEXE = training_data.second_arrest_SEXE.astype('category')
training_data.second_arrest_NCD1 = training_data.second_arrest_NCD1.astype('category')
training_data.second_arrest_NCD2 = training_data.second_arrest_NCD2.astype('category')
training_data.second_arrest_NCD3 = training_data.second_arrest_NCD3.astype('category')
training_data.second_arrest_NCD4 = training_data.second_arrest_NCD4.astype('category')
training_data.second_arrest_MUN = training_data.second_arrest_MUN.astype('category')
training_data.second_arrest_ED1 = training_data.second_arrest_ED1.astype('category')


sorted


In [None]:
def get_arrest_label(x, crime_label_attribute):
    if x != ' ' and not (x is None):
        return crime_labels.get_value(index=int(x), col=crime_label_attribute)
    else:
        return 0

training_data['first_arrest_criminiology_label'] = training_data['first_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'criminology_category'))
training_data['first_arrest_group_crime_label'] = training_data['first_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'group_crime'))
training_data['first_arrest_drug_crime_label'] = training_data['first_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_crime'))
training_data['first_arrest_drug_usage_label'] = training_data['first_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_usage'))
training_data['first_arrest_drug_trafficking_label'] = training_data['first_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_trafficking'))


training_data['second_arrest_criminiology_label'] = training_data['second_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'criminology_category'))
training_data['second_arrest_group_crime_label'] = training_data['second_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'group_crime'))
training_data['second_arrest_drug_crime_label'] = training_data['second_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_crime'))
training_data['second_arrest_drug_usage_label'] = training_data['second_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_usage'))
training_data['second_arrest_drug_trafficking_label'] = training_data['second_arrest_NCD1'].apply(lambda x: get_arrest_label(x, 'drug_trafficking'))




training_data.first_arrest_criminiology_label = training_data.first_arrest_criminiology_label.astype('category')
training_data.first_arrest_group_crime_label = training_data.first_arrest_group_crime_label.astype('int')
training_data.first_arrest_drug_crime_label = training_data.first_arrest_drug_crime_label.astype('int')
training_data.first_arrest_drug_usage_label = training_data.first_arrest_drug_usage_label.astype('int')
training_data.first_arrest_drug_trafficking_label = training_data.first_arrest_drug_trafficking_label.astype('int')

training_data.second_arrest_criminiology_label = training_data.second_arrest_criminiology_label.astype('category')
training_data.second_arrest_group_crime_label = training_data.second_arrest_group_crime_label.astype('int')
training_data.second_arrest_drug_crime_label = training_data.second_arrest_drug_crime_label.astype('int')
training_data.second_arrest_drug_usage_label = training_data.second_arrest_drug_usage_label.astype('int')
training_data.second_arrest_drug_trafficking_label = training_data.second_arrest_drug_trafficking_label.astype('int')

In [None]:
## save the whole thing
training_data.to_csv(PROCESSED_DATASET_CSV_FILE)

with open(PROCESSED_DATASET_PKL_FILE, 'wb') as file_obj:
    pickle.dump(training_data, file=file_obj)


## Overview of Data

In [6]:
training_data.dtypes

first_arrest_NoUnique                            int64
first_arrest_Naissance                           int64
first_arrest_SEXE                             category
first_arrest_SeqE                             category
first_arrest_dateInf                             int64
first_arrest_NCD1                             category
first_arrest_NCD2                             category
first_arrest_NCD3                             category
first_arrest_NCD4                             category
first_arrest_MUN                              category
first_arrest_ED1                              category
first_arrest_Jeunes                              int64
first_arrest_Adultes                             int64
first_arrest_Date                       datetime64[ns]
first_arrest_annee                               int64
first_arrest_ArrestCount                         int64
second_arrest_NoUnique                         float64
second_arrest_Naissance                        float64
second_arr

In [7]:
training_data.shape

(539593, 43)

## Basic Model

In [8]:
print('raw recidivism rate: %f' % (sum(training_data.arrested_again)/len(training_data.arrested_again)))

raw recidivism rate: 0.341884


In [10]:
X_df = training_data[['first_arrest_SEXE','first_arrest_NCD1', 'first_arrest_MUN', 'first_arrest_ED1']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
X = X_df.as_matrix()

Y_df = training_data[['arrested_again']]
Y = Y_df.as_matrix()
Y = Y.ravel()

In [11]:
print(X.shape, Y.shape)

(539593, 1720) (539593,)


In [12]:
baseline_model = LogisticRegression(penalty='l1', verbose=True)

In [13]:
baseline_model.fit(X, Y)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False)

In [14]:
# coefficient analysis (for interpretation)
res = np.argsort(abs(baseline_model.coef_))[0]
res = res[::-1]
print('bias: %f' % baseline_model.intercept_)
for coeff_index in res[0:50]:
    value = baseline_model.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))


bias: -0.166892
coefficient: first_arrest_NCD1_1360  | value: -2.521032
coefficient: first_arrest_MUN_35906  | value: -2.205536
coefficient: first_arrest_NCD1_38401  | value: 1.944629
coefficient: first_arrest_NCD1_6450  | value: 1.716849
coefficient: first_arrest_NCD1_75003  | value: -1.689019
coefficient: first_arrest_MUN_79020  | value: 1.679443
coefficient: first_arrest_NCD1_16103  | value: 1.659428
coefficient: first_arrest_NCD1_4430  | value: -1.616540
coefficient: first_arrest_NCD1_21354  | value: 1.591024
coefficient: first_arrest_MUN_97806  | value: 1.527043
coefficient: first_arrest_MUN_89005  | value: 1.484777
coefficient: first_arrest_NCD1_21357  | value: 1.444198
coefficient: first_arrest_NCD1_16701  | value: -1.419969
coefficient: first_arrest_NCD1_71009  | value: -1.400631
coefficient: first_arrest_MUN_83802  | value: -1.387462
coefficient: first_arrest_NCD1_71002  | value: -1.384726
coefficient: first_arrest_MUN_88045  | value: -1.377395
coefficient: first_arrest_NCD1_3

In [15]:
baseline_model.score(X, Y)

0.67742724609103533

## Basic Model with Crime Class Labels

In [16]:
X_df = training_data[['first_arrest_criminiology_label']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
X = X_df.as_matrix()

Y_df = training_data[['arrested_again']]
Y = Y_df.as_matrix()
Y = Y.ravel()

In [17]:
baseline_model_crime_types = LogisticRegression(penalty='l1', verbose=True)

In [18]:
baseline_model_crime_types.fit(X, Y)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False)

In [19]:
# coefficient analysis (for interpretation)
res = np.argsort(abs(baseline_model_crime_types.coef_))[0]
res = res[::-1]
print('bias: %f' % baseline_model_crime_types.intercept_)
for coeff_index in res:
    value = baseline_model_crime_types.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))


bias: -0.635026
coefficient: first_arrest_criminiology_label_0  | value: -1.101876
coefficient: first_arrest_criminiology_label_x  | value: -0.293765
coefficient: first_arrest_criminiology_label_3  | value: 0.185365
coefficient: first_arrest_criminiology_label_2  | value: 0.175330
coefficient: first_arrest_criminiology_label_1  | value: -0.000696


In [20]:
baseline_model_crime_types.score(X, Y)

0.65811639513485165

## Incorporating Social Ties

In [22]:
X_df = training_data[['first_arrest_SEXE','first_arrest_NCD1', 'first_arrest_MUN', 
                      'first_arrest_ED1', 'first_arrest_Adultes', 'first_arrest_Jeunes']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
X = X_df.as_matrix()

Y_df = training_data[['arrested_again']]
Y = Y_df.as_matrix()
Y = Y.ravel()

In [23]:
model_with_social_data = LogisticRegression(penalty='l1', verbose=True)

In [24]:
model_with_social_data.fit(X, Y)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False)

In [25]:
model_with_social_data.score(X, Y)

0.6774216863450786

In [26]:
# coefficient analysis (for interpretation)
res = np.argsort(abs(model_with_social_data.coef_))[0]
res = res[::-1]
print('bias: %f' % model_with_social_data.intercept_)
for coeff_index in res[0:50]:
    value = baseline_model.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))

bias: -0.364120
coefficient: first_arrest_NCD1_1360  | value: -0.469234
coefficient: first_arrest_NCD1_38401  | value: 1.195065
coefficient: first_arrest_MUN_35906  | value: 1.120118
coefficient: first_arrest_NCD1_38403  | value: -0.157494
coefficient: first_arrest_NCD1_75003  | value: 0.000000
coefficient: first_arrest_MUN_79020  | value: 0.028510
coefficient: first_arrest_NCD1_6450  | value: -0.590019
coefficient: first_arrest_NCD1_4430  | value: -0.346509
coefficient: first_arrest_NCD1_16103  | value: 0.000000
coefficient: first_arrest_NCD1_21354  | value: 1.101845
coefficient: first_arrest_MUN_97806  | value: 0.000000
coefficient: first_arrest_MUN_89005  | value: 0.000000
coefficient: first_arrest_NCD1_71009  | value: -0.753185
coefficient: first_arrest_NCD1_16701  | value: 0.057124
coefficient: first_arrest_NCD1_21357  | value: 0.892312
coefficient: first_arrest_MUN_83802  | value: 0.011393
coefficient: first_arrest_MUN_88045  | value: 0.096740
coefficient: first_arrest_NCD1_71002