In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import datetime, re, pickle
from sklearn.linear_model import LogisticRegression

In [6]:
# File locations used throughout the notebook (all relative to the notebook's working directory).
# CSV export of the processed first/second-arrest table (written by the "save" cell).
PROCESSED_DATASET_CSV_FILE = './processed_datasets/basic_model.csv'
# Pickled copy of the same table (written by the "save" cell, read by the "Load Preprocessed Data" cell).
PROCESSED_DATASET_PKL_FILE = './processed_datasets/basic_model.pkl'
# Raw co-offending records read by the preprocessing section.
RAW_COFFENDING_CSV_FILE = './raw_datasets/Cooffending.csv'
# Crime-code label mapping; only the pickle variant is loaded in the cells of this notebook.
PROCESSED_CRIME_LABEL_CSV_FILE = './processed_datasets/crime_type_mapping.csv'
PROCESSED_CRIME_LABEL_PKL_FILE = './processed_datasets/crime_type_mapping.pkl'


## Load Preprocessed Data


In [None]:
# Bind the name first so it exists (as None) even if opening the cache file fails.
training_data = None
# NOTE: pickle.load can execute arbitrary code; only load a cache file produced by this notebook.
with open(PROCESSED_DATASET_PKL_FILE, 'rb') as pkl_handle:
    training_data = pickle.load(pkl_handle)

## Preprocessing (skip this section if you only want to load the already-processed data above)

In [4]:
# Read the raw records and remove duplicate rows. drop_duplicates() returns a new
# frame, so nothing is mutated in place.
co_offending_table = pd.read_csv(RAW_COFFENDING_CSV_FILE).drop_duplicates()

# Parse the date column (month/day/year) in one vectorised call instead of calling
# strptime once per row; missing dates become NaT instead of raising.
co_offending_table['Date'] = pd.to_datetime(co_offending_table['Date'], format='%m/%d/%Y')

# Add a column for the number of arrest rows of each offender (rows sharing a NoUnique).
co_offending_table['ArrestCount'] = co_offending_table.groupby('NoUnique')['SeqE'].transform('count')

In [8]:
# Bind the name first so it exists (as None) even if opening the mapping file fails.
crime_labels = None
# NOTE: pickle.load can execute arbitrary code; only load a mapping file you produced yourself.
with open(PROCESSED_CRIME_LABEL_PKL_FILE, 'rb') as labels_handle:
    crime_labels = pickle.load(labels_handle)

In [10]:
def process_second_arrests(df):
    """Return the second row of ``df``, or ``None`` when there is no second row.

    Intended to be applied per offender group; the caller is expected to have
    sorted the rows chronologically so that row 1 is the second arrest.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single offender.

    Returns
    -------
    pandas.Series or None
        ``df.iloc[1]`` if at least two rows exist, otherwise ``None``.
    """
    # "< 2" rather than "== 1" so an empty frame yields None instead of an IndexError.
    if len(df) < 2:
        return None
    return df.iloc[1]
    
def process_seq_numbers(x):
    """Map a numeric value to a 0/1 flag: 0 when it is NaN, 1 otherwise."""
    return 0 if np.isnan(x) else 1

In [11]:
def build_table_of_first_two_arrests(co_offending):
    """Build one row per offender holding their first and (if any) second arrest.

    Parameters
    ----------
    co_offending : pandas.DataFrame
        Arrest records with at least the columns ``NoUnique`` (offender id) and
        ``Date``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``NoUnique``. Every input column appears twice, prefixed with
        ``first_arrest_`` and ``second_arrest_``; the ``second_arrest_*`` columns
        are NaN for offenders with a single record. ``arrested_again`` is 1 when
        a second record exists and 0 otherwise.
    """
    # Sort by offender and date so rows are chronological within each offender.
    co_offending = co_offending.sort_values(by=['NoUnique', 'Date'])
    # Position of each row within its offender's history: 0 = first arrest, 1 = second, ...
    # This replaces one Python-level groupby.apply call per offender.
    arrest_rank = co_offending.groupby('NoUnique').cumcount()

    def _nth_arrest(rank, prefix):
        # Rows at the given position, indexed by offender, with prefixed column names.
        # drop=False keeps NoUnique as a column so '<prefix>NoUnique' exists in the result.
        rows = co_offending[(arrest_rank == rank).values]
        return rows.set_index('NoUnique', drop=False).add_prefix(prefix)

    # The first arrest is guaranteed to exist for every offender.
    first_arrest = _nth_arrest(0, 'first_arrest_')
    # The second arrest only exists for offenders with at least two records.
    second_arrests = _nth_arrest(1, 'second_arrest_')

    # Combine the two frames; offenders without a second arrest get NaN in second_arrest_*.
    first_and_second_arrest_data = pd.merge(first_arrest, second_arrests, how='outer', left_index=True, right_index=True)
    # Binary 0/1 target: was the offender arrested again?
    first_and_second_arrest_data['arrested_again'] = (
        first_and_second_arrest_data['second_arrest_NoUnique'].notnull().astype(int)
    )
    return first_and_second_arrest_data

In [None]:
# build data frame
training_data = build_table_of_first_two_arrests(co_offending_table)

# format data: these fields are codes/identifiers, so store them as categoricals
categorical_fields = ['SeqE', 'SEXE', 'NCD1', 'NCD2', 'NCD3', 'NCD4', 'MUN', 'ED1']
for arrest_prefix in ('first_arrest_', 'second_arrest_'):
    for field in categorical_fields:
        column = arrest_prefix + field
        training_data[column] = training_data[column].astype('category')


sorted


In [None]:
def get_arrest_label(x, crime_label_attribute):
    """Look up one attribute of a crime code in the ``crime_labels`` table.

    Parameters
    ----------
    x : crime code (anything ``int()`` accepts), or a missing marker
        ``None``, NaN and blank strings are all treated as "no code".
    crime_label_attribute : str
        Column of ``crime_labels`` to read.

    Returns
    -------
    The value stored in ``crime_labels`` for that code and column, or 0 when
    the code is missing.

    Raises
    ------
    KeyError
        If the code or the column is not present in ``crime_labels``.
    """
    # Missing codes: None, NaN (e.g. offenders without a second arrest) or blank text.
    if x is None or (isinstance(x, str) and not x.strip()) or pd.isna(x):
        return 0
    # DataFrame.get_value was removed from pandas; .at is the scalar accessor.
    return crime_labels.at[int(x), crime_label_attribute]

# (new column suffix, attribute looked up via get_arrest_label) pairs.
# 'criminiology' is misspelt, but the column name is referenced by later cells, so it is kept.
label_columns = [
    ('criminiology_label', 'criminology_category'),
    ('group_crime_label', 'group_crime'),
    ('drug_crime_label', 'drug_crime'),
    ('drug_usage_label', 'drug_usage'),
    ('drug_trafficking_label', 'drug_trafficking'),
]
arrest_prefixes = ('first_arrest', 'second_arrest')

# Derive every label from the arrest's NCD1 crime code.
for arrest in arrest_prefixes:
    crime_codes = arrest + '_NCD1'
    for column_suffix, attribute in label_columns:
        training_data[arrest + '_' + column_suffix] = training_data[crime_codes].apply(
            lambda x, attribute=attribute: get_arrest_label(x, attribute))

# The criminology label is categorical; the remaining labels are cast to int.
for arrest in arrest_prefixes:
    for column_suffix, _ in label_columns:
        column = arrest + '_' + column_suffix
        target_dtype = 'category' if column_suffix == 'criminiology_label' else 'int'
        training_data[column] = training_data[column].astype(target_dtype)

In [None]:
# Preview only the first rows instead of rendering the whole table.
training_data.head()

In [None]:
## save the whole thing
# CSV copy for inspection outside Python.
training_data.to_csv(PROCESSED_DATASET_CSV_FILE)

# Pickle copy, which is what the "Load Preprocessed Data" cell reads back.
with open(PROCESSED_DATASET_PKL_FILE, 'wb') as file_obj:
    pickle.dump(training_data, file=file_obj)

## Basic Model

In [None]:
# Share of offenders with a second arrest: arrested_again is 0/1, so its sum counts re-arrests.
rearrested_count = sum(training_data.arrested_again)
offender_count = len(training_data.arrested_again)
print('raw recidivism rate: %f' % (rearrested_count / offender_count))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Features: attributes of the first arrest only.
X_df = training_data[['first_arrest_SEXE','first_arrest_NCD1', 'first_arrest_MUN', 'first_arrest_ED1']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
# DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray.
X = X_df.values

# Target: 0/1 flag for a second arrest, flattened to a 1-D vector.
Y_df = training_data[['arrested_again']]
Y = Y_df.values.ravel()

In [None]:
# Sanity check: X and Y should have the same number of rows.
print(X.shape, Y.shape)

In [None]:
# L1-penalised logistic regression. The solver is pinned to liblinear because
# scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
baseline_model = LogisticRegression(penalty='l1', solver='liblinear', verbose=True)

In [None]:
# Fit on every row of X/Y; no hold-out split is made before this call.
baseline_model.fit(X, Y)

In [None]:
# coefficient analysis (for interpretation)
# Feature indices ordered by absolute coefficient size, largest first.
res = np.argsort(abs(baseline_model.coef_))[0]
res = res[::-1]
# intercept_ is a 1-element array; index it rather than relying on implicit
# array-to-scalar conversion inside '%f'.
print('bias: %f' % baseline_model.intercept_[0])
# Show the 50 most influential dummy columns.
for coeff_index in res[0:50]:
    value = baseline_model.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))


In [None]:
# In-sample score: evaluated on the same X, Y the model was fit on, so it is an optimistic estimate.
baseline_model.score(X, Y)

## Basic Model with Crime Class Labels

In [None]:
# Single feature: the category label of the first arrest.
X_df = training_data[['first_arrest_criminiology_label']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
# DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray.
X = X_df.values

# Target: 0/1 flag for a second arrest, flattened to a 1-D vector.
Y_df = training_data[['arrested_again']]
Y = Y_df.values.ravel()

In [None]:
# L1-penalised logistic regression. The solver is pinned to liblinear because
# scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
baseline_model_crime_types = LogisticRegression(penalty='l1', solver='liblinear', verbose=True)

In [None]:
# Fit on every row of X/Y; no hold-out split is made before this call.
baseline_model_crime_types.fit(X, Y)

In [None]:
# coefficient analysis (for interpretation)
# Feature indices ordered by absolute coefficient size, largest first.
res = np.argsort(abs(baseline_model_crime_types.coef_))[0]
res = res[::-1]
# intercept_ is a 1-element array; index it rather than relying on implicit
# array-to-scalar conversion inside '%f'.
print('bias: %f' % baseline_model_crime_types.intercept_[0])
# Every coefficient is listed here (no top-N cut-off).
for coeff_index in res:
    value = baseline_model_crime_types.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))


In [None]:
# In-sample score: evaluated on the same X, Y the model was fit on, so it is an optimistic estimate.
baseline_model_crime_types.score(X, Y)

## Incorporating Social Ties

In [None]:
# List the available columns before picking features for the next model.
training_data.columns

In [None]:
# Baseline features plus the first arrest's Adultes and Jeunes columns.
X_df = training_data[['first_arrest_SEXE','first_arrest_NCD1', 'first_arrest_MUN', 
                      'first_arrest_ED1', 'first_arrest_Adultes', 'first_arrest_Jeunes']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
# DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray.
X = X_df.values

# Target: 0/1 flag for a second arrest, flattened to a 1-D vector.
Y_df = training_data[['arrested_again']]
Y = Y_df.values.ravel()

In [None]:
# L1-penalised logistic regression. The solver is pinned to liblinear because
# scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
model_with_social_data = LogisticRegression(penalty='l1', solver='liblinear', verbose=True)

In [None]:
# Fit on every row of X/Y; no hold-out split is made before this call.
model_with_social_data.fit(X, Y)

In [None]:
# In-sample score: evaluated on the same X, Y the model was fit on, so it is an optimistic estimate.
model_with_social_data.score(X, Y)

In [None]:
# Names of the first two columns of the design matrix.
X_df.columns[0:2]

In [None]:
# coef_ is indexed [0][feature] elsewhere in this notebook, so coef_[0:2] would slice
# its single row and return every coefficient. Index the row first to get the
# coefficients of the first two columns.
model_with_social_data.coef_[0][0:2]

In [None]:
# coefficient analysis (for interpretation)
# Feature indices ordered by absolute coefficient size, largest first.
res = np.argsort(abs(model_with_social_data.coef_))[0]
res = res[::-1]
# intercept_ is a 1-element array; index it rather than relying on implicit
# array-to-scalar conversion inside '%f'.
print('bias: %f' % model_with_social_data.intercept_[0])
# Show the 50 most influential columns.
for coeff_index in res[0:50]:
    # Read the value from the same model that produced the ranking, not from baseline_model.
    value = model_with_social_data.coef_[0][coeff_index]
    name = X_df.columns[coeff_index]
    print('coefficient: %s  | value: %f' % (name, value))

### Looking at specific types of crimes

In [None]:
# Single feature: the NCD1 crime code of the first arrest.
X_df = training_data[['first_arrest_NCD1']]

# gives us dummy variables
X_df = pd.get_dummies(X_df)
# DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray.
X = X_df.values

# Target: 0/1 flag for a second arrest, flattened to a 1-D vector.
Y_df = training_data[['arrested_again']]
Y = Y_df.values.ravel()

In [None]:
# L1-penalised logistic regression. The solver is pinned to liblinear because
# scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
model_by_crime = LogisticRegression(penalty='l1', solver='liblinear', verbose=True)

In [None]:
# Fit on every row of X/Y; no hold-out split is made before this call.
model_by_crime.fit(X, Y)

In [None]:
# In-sample score: evaluated on the same X, Y the model was fit on, so it is an optimistic estimate.
model_by_crime.score(X,Y)

In [None]:
# coefficient analysis (for interpretation)
# Prefix that pd.get_dummies puts in front of each NCD1 code in the column names.
dummy_prefix = 'first_arrest_NCD1_'
# Ranked by signed coefficient, largest first (the other listings rank by absolute
# value). NOTE(review): confirm the signed ordering is intentional.
res = np.argsort(model_by_crime.coef_)[0]
res = res[::-1]
# intercept_ is a 1-element array; index it rather than relying on implicit
# array-to-scalar conversion inside '%f'.
print('bias: %f' % model_by_crime.intercept_[0])
for coeff_index in res[0:100]:
    value = model_by_crime.coef_[0][coeff_index]
    column_name = X_df.columns[coeff_index]
    try:
        # Accept codes rendered as '1234' as well as '1234.0'.
        crime_code = int(float(column_name[len(dummy_prefix):]))
        # DataFrame.get_value was removed from pandas; .at is the scalar accessor.
        crime_name = crime_labels.at[crime_code, 'text_description']
    except (KeyError, ValueError):
        # Unknown or non-numeric code: fall back to the raw column name, so
        # crime_name is never left unset or stale from the previous iteration.
        crime_name = column_name
    print('crime: %s  | coefficient: %f' % (crime_name, value))