In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import datetime

## Preprocessing 

In [None]:
# Load the raw co-offending records and drop exact duplicate rows.
# (Chained rather than inplace=True so that re-running the cell always starts from the csv.)
co_offending_table = pd.read_csv('./Cooffending.csv').drop_duplicates()

# Parse the date column (month/day/year strings) into datetimes.
# pd.to_datetime is vectorised and maps missing values to NaT instead of raising
# the way a row-by-row datetime.strptime call does.
co_offending_table['Date'] = pd.to_datetime(co_offending_table['Date'], format='%m/%d/%Y')

# Add a column for the number of arrests (rows) of each offender
co_offending_table['ArrestCount'] = co_offending_table.groupby('NoUnique')['NoUnique'].transform('count')

# Get the right datatype: these columns are treated as categorical codes, not as numbers
categorical_columns = ['SeqE', 'SEXE', 'NCD1', 'NCD2', 'NCD3', 'NCD4', 'MUN', 'ED1']
co_offending_table = co_offending_table.astype({column: 'category' for column in categorical_columns})

In [None]:
def process_second_arrests(df):
    """Return the second row of ``df``, or ``None`` when there is no second row.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one group; the caller is expected to have sorted
        them so that position 1 is the second arrest.

    Returns
    -------
    pandas.Series or None
        ``df.iloc[1]`` when ``df`` has at least two rows, otherwise ``None``
        (this also covers an empty frame, which previously raised IndexError).
    """
    if len(df) < 2:
        return None
    return df.iloc[1]
    
def process_seq_numbers(x):
    """Map a scalar to a 0/1 flag: 0 when the value is missing, 1 otherwise.

    ``pd.isna`` is used instead of ``np.isnan`` so that ``None`` and other
    non-float scalars are handled instead of raising ``TypeError``; for
    numeric input the result is unchanged.

    Parameters
    ----------
    x : scalar
        Value to test (e.g. an identifier that is NaN when absent).

    Returns
    -------
    int
        0 if ``x`` is missing (NaN / None / NaT), else 1.
    """
    return 0 if pd.isna(x) else 1

In [None]:
def build_table_of_first_two_arrests(co_offending):
    """Build one row per offender holding their first and (if any) second arrest.

    Parameters
    ----------
    co_offending : pandas.DataFrame
        Arrest records; must contain the columns ``NoUnique`` (offender id)
        and ``Date`` (used to order each offender's arrests).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``NoUnique``. Every input column appears twice, prefixed
        with ``first_arrest_`` and ``second_arrest_``; the second-arrest
        columns are NaN for offenders with a single arrest. The extra column
        ``arrested_again`` is 1 when a second arrest exists and 0 otherwise.
    """
    # first sort by offender and date so that position within each group is chronological
    ordered = co_offending.sort_values(by=['NoUnique', 'Date'])

    # 0-based position of each arrest within its offender's history.
    # Selecting by position is vectorised, unlike groupby.apply, which runs a
    # Python callable once per offender.
    arrest_rank = ordered.groupby('NoUnique').cumcount()

    # the first arrest of each offender (guaranteed to exist)
    first_arrest = (
        ordered[arrest_rank == 0]
        .set_index('NoUnique', drop=False)  # keep NoUnique as a column too
        .add_prefix('first_arrest_')
    )

    # the second arrest of each offender; offenders without one are simply absent here
    second_arrests = (
        ordered[arrest_rank == 1]
        .set_index('NoUnique', drop=False)
        .add_prefix('second_arrest_')
    )

    # combine the two tables on the offender id; missing second arrests become NaN
    first_and_second_arrest_data = pd.merge(
        first_arrest, second_arrests, how='outer', left_index=True, right_index=True
    )

    # binary 0/1 variable for whether the offender was arrested again
    first_and_second_arrest_data['arrested_again'] = (
        first_and_second_arrest_data['second_arrest_NoUnique'].notna().astype(int)
    )

    return first_and_second_arrest_data

In [90]:
# The table can be rebuilt from the raw records with
#   training_data = build_table_of_first_two_arrests(co_offending_table)
# here it is read from csv instead
training_data = pd.read_csv('./basic_model_data.csv')

# format data: the same eight coded fields exist for both the first and the
# second arrest, and each one is cast to a categorical column
categorical_fields = ('SeqE', 'SEXE', 'NCD1', 'NCD2', 'NCD3', 'NCD4', 'MUN', 'ED1')
for arrest_prefix in ('first_arrest_', 'second_arrest_'):
    for field in categorical_fields:
        column = arrest_prefix + field
        training_data[column] = training_data[column].astype('category')

In [95]:
# Share of offenders whose arrested_again flag is 1
arrested_flags = training_data.arrested_again
recidivism_rate = sum(arrested_flags) / len(arrested_flags)
print('raw recidivism rate: %f' % recidivism_rate)

raw recidivism rate: 0.341884


In [96]:
# List the available columns (first_arrest_*, second_arrest_* and the arrested_again label)
training_data.columns

Index(['NoUnique', 'first_arrest_NoUnique', 'first_arrest_Naissance',
       'first_arrest_SEXE', 'first_arrest_SeqE', 'first_arrest_dateInf',
       'first_arrest_NCD1', 'first_arrest_NCD2', 'first_arrest_NCD3',
       'first_arrest_NCD4', 'first_arrest_MUN', 'first_arrest_ED1',
       'first_arrest_Jeunes', 'first_arrest_Adultes', 'first_arrest_Date',
       'first_arrest_annee', 'first_arrest_ArrestCount',
       'second_arrest_NoUnique', 'second_arrest_Naissance',
       'second_arrest_SEXE', 'second_arrest_SeqE', 'second_arrest_dateInf',
       'second_arrest_NCD1', 'second_arrest_NCD2', 'second_arrest_NCD3',
       'second_arrest_NCD4', 'second_arrest_MUN', 'second_arrest_ED1',
       'second_arrest_Jeunes', 'second_arrest_Adultes', 'second_arrest_Date',
       'second_arrest_annee', 'second_arrest_ArrestCount', 'arrested_again'],
      dtype='object')

## Basic Model

In [97]:
# Predictors: categorical attributes of the first arrest only
feature_columns = ['first_arrest_SEXE', 'first_arrest_NCD1', 'first_arrest_NCD2', 'first_arrest_NCD3',
                   'first_arrest_NCD4', 'first_arrest_MUN', 'first_arrest_ED1']
X_df = training_data[feature_columns]

# gives us dummy variables (one indicator column per category level)
X_df = pd.get_dummies(X_df)
# .values replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; .values works on both old and new versions.
X = X_df.values

# Target: 0/1 flag for whether the offender was arrested again, as a 1-D array
Y_df = training_data[['arrested_again']]
Y = Y_df.values.ravel()

In [98]:
# Sanity check: X and Y should have the same number of rows
print(X.shape, Y.shape)

(539593, 2179) (539593,)


In [99]:
from sklearn.linear_model import LogisticRegression

In [100]:
# L1-penalised logistic regression. The solver is named explicitly because the
# default solver in newer scikit-learn releases (lbfgs) does not support an L1
# penalty; liblinear is the solver shown in the recorded fit output.
baseline_model = LogisticRegression(penalty='l1', solver='liblinear')

In [101]:
# Fit the baseline model on the full feature matrix and labels
baseline_model.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [114]:
# coefficient analysis (for interpretation)
res = np.argsort(abs(baseline_model.coef_))[0]
res = res[::-1]
print('bias: %f' % baseline_model.intercept_)
for coeff_index in res[0:20]:
    value = baseline_model.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))


bias: -0.198022
coefficient: first_arrest_NCD3_32301  | value: 2.563165
coefficient: first_arrest_NCD1_1360  | value: -2.480866
coefficient: first_arrest_MUN_35906  | value: -2.183184
coefficient: first_arrest_NCD4_3410  | value: 2.024439
coefficient: first_arrest_NCD1_38401  | value: 1.986479
coefficient: first_arrest_NCD1_6450  | value: 1.892725
coefficient: first_arrest_NCD4_21701  | value: -1.882249
coefficient: first_arrest_NCD2_3365  | value: -1.866170
coefficient: first_arrest_NCD2_14551  | value: -1.706969
coefficient: first_arrest_NCD1_16103  | value: 1.611319
coefficient: first_arrest_NCD1_75003  | value: -1.598319
coefficient: first_arrest_MUN_97806  | value: 1.580838
coefficient: first_arrest_NCD1_3520  | value: 1.556186
coefficient: first_arrest_NCD1_3410  | value: 1.553928
coefficient: first_arrest_NCD1_21357  | value: 1.520572
coefficient: first_arrest_NCD1_21354  | value: 1.504655
coefficient: first_arrest_NCD1_14552  | value: -1.442569
coefficient: first_arrest_MUN_790

In [110]:
# Mean accuracy on the same X, Y the model was fitted on (no held-out set is used here)
baseline_model.score(X, Y)

0.6876534721540124

In [111]:
# Accuracy of always predicting "not arrested again": one minus the raw
# recidivism rate, computed from the data instead of a number pasted from an
# earlier output so it cannot go stale.
1 - training_data.arrested_again.mean()

0.6581163951348517