In [2]:
import pandas as pd
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_row", 999)
import geopandas as gpd
import numpy as np
import requests
import math
import sys
import graphviz 
import csv
import sqlite3
import re
import codecs
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import tree as tr
import os; os.getcwd()

'/Users/belenmichel/Desktop/MSCAPP/7_Machine Learning/4_FinalProject_Crimes/ml-nc-recidivism/source'

In [3]:
# Constants
data_dir = "../ncdoc_data/data"

In [4]:
# # Pre-ProcessData

def to_date(df, attribute_lst, years_range=[1800, 2100]):
    '''
    Converts the data type of a string in the format YYYY-MM-DD to a datetime  
    '''
    df = df.apply(out_of_range_to_none, axis=1, args=(years_range, attribute_lst))
    for var in attribute_lst:
        df[var] = df[var].astype('datetime64[s]')#, errors = 'ignore')
    return df


def out_of_range_to_none(row, year_range, attributes_lst): 
    '''
    Converts a str representing a date out of the intended range to None
    '''
    for col in attributes_lst:
        year = int(row[col].split("-")[0])
        if year < year_range[0] or year > year_range[1]:
            row[col] = None 
    return row


def to_int(df, attribute_lst):
    '''
    Converts the data type of a string to an integer if possible or other type of number.  
    '''
    for var in attribute_lst:
        df[var] = pd.to_numeric(df[var], errors='coerce', downcast='integer') 
    return df

def remove_outliers(df, attribute_lst, sd_threshold=3):
    '''
    Takes a dataframe and number of standard deviations to be considered 
    as outlier and returns a df without the observation that have one or
    or more outliers in the attributes selected.
    input:
        df: pandas data frame
        attributes_lst: list of attributes names
        sd_threshold: standard deviations
    output:
        the new datafrane without outliers
    '''   
    
    return(df[(np.abs(stats.zscore(df[attribute_lst])) < sd_threshold).all(axis=1)])


def fill_nan(df, attributes_lst):
    '''
    Fills the nan with the mean
    input:
        df: pandas data frame
        attributes_lst: list of attributes names
    output:
        dataframe with the replaced nan
    '''   
    for attribute in attributes_lst: 
        df[attribute].fillna(df[attribute].mean(), inplace=True)

def clean_str(col_names):
    '''
    Removes special characters from a string. 
    input:
        col_names: string with column names
    output:
        sting where special characters where removed
    ''' 
    #TODO: replace this ad-hoc function with a re function. 
    #col_names = re.sub(pattern, repl, string, count=0, flags=0) 
    col_names = col_names.replace('.','')
    col_names = col_names.replace('#','')
    col_names = col_names.replace('-','')
    col_names = col_names.replace('(','')
    col_names = col_names.replace(')','')
    col_names = col_names.replace('&','')
    col_names = col_names.replace('/','')
    col_names = col_names.replace(':','')
    col_names = col_names.replace(' ','')
    col_names = col_names.replace('__','')
    return col_names


# # Generate Features/ Predictors


def discretize_variable(df, attribute_lst):
    '''
    Converts continuous variables into discrete variables
    input:
        df: pandas data frame
        attributes_lst: list of attributes names
    output:
        dataframe with the new variables
    ''' 

    for var in attribute_lst:
        new_var = var + 'cat'
        df[new_var] = pd.qcut(df[var], 10, duplicates="drop", labels=False)
    return df

def categorical_to_dummy(df, attribute_lst):
    '''
    Converts categorical variables into one variabel dummies for each category. 
    input:
        df: pandas data frame
        attributes_lst: list of attributes names
    output:
        dataframe with the new variables
    ''' 

    for var in attribute_lst:
        df = pd.get_dummies(df, columns=[var])
    return df

def remove_attribute(df, attribute_lst):
    '''
    Removes attributes in the list from the data frame
    '''     
    return df.drop(attribute_lst, axis=1)

def keep_attribute(df, attribute_lst):
    '''
    Keeps attributes in the list in the data frame
    '''
    df = df.loc[:, attribute_lst]
    return df

def flag_to_dummy(df, attribute_lst, rename=True):
    '''
    Converts a flag variable to a dummy with 1 for Yes and 0 for No
    '''
    for var in attribute_lst:
        df[var] = df[var].map({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0, 'T': 1, 'F': 0,\
                               'True': 1, 'False': 0, 'OPEN': 1, 'CLOSED': 0})
        if rename:
            new_var_name = var[:-5]
            df.rename(index=str, columns={var: new_var_name}, inplace=True)
    return df

def gender_to_dummy(df, gender_var):  
    '''
    Converts a gender indicative variable to a dummy with 1 for female and 0 for male
    '''
    df[gender_var] = df[gender_var].map({'FEMALE': 1, 'MALE': 0, 'F': 1, 'M': 0})
    df.rename(index=str, columns={gender_var: "FEMALE"}, inplace=True)
    return df




In [23]:
file_name = os.path.join(data_dir, "preprocessed/INMT4AA1.csv")
Inmate_Profile = pd.read_csv(file_name, nrows=400)
Inmate_Profile.groupby('PRIOR_INCARCERATIONS_FLAG').size()

PRIOR_INCARCERATIONS_FLAG
N      4
Y    396
dtype: int64

In [5]:
#Cleaning Inmate Profile

file_name = os.path.join(data_dir, "preprocessed/INMT4AA1.csv")
Inmate_Profile = pd.read_csv(file_name) #, nrows=400
Inmate_Profile = Inmate_Profile[Inmate_Profile["INMATE_IS_FELON/MISDEMEANANT"]=="FELON"]
Inmate_Profile = flag_to_dummy(Inmate_Profile, ['ESCAPE_HISTORY_FLAG', 'PRIOR_INCARCERATIONS_FLAG'])
Inmate_Profile = gender_to_dummy(Inmate_Profile, 'INMATE_GENDER_CODE')
Inmate_Profile = categorical_to_dummy(Inmate_Profile, ['INMATE_RACE_CODE'])
#Remove outliers for TOTAL_SENTENCE_LENGTHIN_DAYS, LENGTH_OF_CURRENT_INCARCERATN, LENGTH_OF_RULING_SENTENCES
#Inmate_Profile = remove_outliers(Inmate_Profile, sd_threshold=3)
Inmate_Profile = to_date(Inmate_Profile, ["INMATE_BIRTH_DATE","INMATE_ADMISSION_DATE","FINAL_RULING_PED",\
                                          "FINAL_RULING_TRD", "FINAL_RULING_PRD", "FINAL_RULING_MAX_RELEASE_DATE",\
                                          "DATE_TRD_&_PRD_LAST_COMPUTED", "DATE_OF_LAST_ARREST_ON_PAROLE"])
Inmate_Profile["AGE_IN_DAYS"] = Inmate_Profile["INMATE_ADMISSION_DATE"] - Inmate_Profile["INMATE_BIRTH_DATE"]

Inmate_Profile = keep_attribute(Inmate_Profile,['INMATE_DOC_NUMBER', 'INMATE_LAST_NAME', 'INMATE_FIRST_NAME',\
                                                'FEMALE', 'INMATE_BIRTH_DATE',  'INMATE_RACE_CODE_ASIAN/ORTL',\
                                                'INMATE_ADMISSION_DATE', 'FINAL_RULING_PED', 'FINAL_RULING_TRD',\
                                                'FINAL_RULING_PRD', 'FINAL_RULING_MAX_RELEASE_DATE',\
                                                'DATE_TRD_&_PRD_LAST_COMPUTED', 'MOST_SERIOUS_OFFNSE_CURR_INCAR',\
                                                'DAYS_SERVED_IN_DOC_CUSTODY', 'DATE_OF_LAST_ARREST_ON_PAROLE',\
                                                'TOTAL_SENTENCE_LENGTH(IN_DAYS)', 'LENGTH_OF_CURRENT_INCARCERATN.',\
                                                'ESCAPE_HISTORY', 'PRIOR_INCARCERATIONS', 'LENGTH_OF_RULING_SENTENCES',\
                                                'CURRENT_COMMITMENT_PREFIX', 'CURRENT_SENTENCE_COMPONENT',\
                                                'INMATE_RACE_CODE_BLACK', 'INMATE_RACE_CODE_INDIAN',\
                                                'INMATE_RACE_CODE_OTHER', 'INMATE_RACE_CODE_UNKNOWN',\
                                                'INMATE_RACE_CODE_WHITE', 'AGE_IN_DAYS'])

new_file_name = file_name[:-4] + '_cleaned.csv'
Inmate_Profile.to_csv(new_file_name)


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
print(10829+282966)
Inmate_Profile.groupby('PRIOR_INCARCERATIONS').size()

293795


PRIOR_INCARCERATIONS
0.0     10829
1.0    282966
dtype: int64

In [5]:
#Cleaning SentenceComputation

file_name = os.path.join(data_dir, "preprocessed/INMT4BB1.csv")
Sentence_Computation = pd.read_csv(file_name, low_memory=False)

Sentence_Computation = categorical_to_dummy(Sentence_Computation, ['INMATE_COMPUTATION_STATUS_FLAG'])

Sentence_Computation = to_date(Sentence_Computation, ["SENTENCE_BEGIN_DATE_(FOR_MAX)","ACTUAL_SENTENCE_END_DATE",\
                                          "PROJECTED_RELEASE_DATE_(PRD)","PAROLE_DISCHARGE_DATE",\
                                          "PAROLE_SUPERVISION_BEGIN_DATE"])
Sentence_Computation['RELEASE_DATE'] = Sentence_Computation[["ACTUAL_SENTENCE_END_DATE", "PROJECTED_RELEASE_DATE_(PRD)"]].max(axis=1)
Sentence_Computation['RELEASE_365DAYS_DATE'] = Sentence_Computation['ACTUAL_SENTENCE_END_DATE'].apply(lambda x: x + pd.DateOffset(years=1))
Sentence_Computation['RELEASE_YEAR'] = Sentence_Computation['ACTUAL_SENTENCE_END_DATE'].dt.year
Sentence_Computation['BEGIN_YEAR'] = Sentence_Computation['SENTENCE_BEGIN_DATE_(FOR_MAX)'].dt.year
Sentence_Computation = keep_attribute(Sentence_Computation, ["INMATE_DOC_NUMBER","INMATE_COMMITMENT_PREFIX","INMATE_SENTENCE_COMPONENT",\
                                  "SENTENCE_BEGIN_DATE_(FOR_MAX)","ACTUAL_SENTENCE_END_DATE",\
                                  "PROJECTED_RELEASE_DATE_(PRD)","PAROLE_DISCHARGE_DATE",\
                                  "PAROLE_SUPERVISION_BEGIN_DATE","INMATE_COMPUTATION_STATUS_FLAG_ACTIVE",\
                                  "INMATE_COMPUTATION_STATUS_FLAG_EAR.TERM","INMATE_COMPUTATION_STATUS_FLAG_EXPIRED",\
                                  "INMATE_COMPUTATION_STATUS_FLAG_FUTURE","INMATE_COMPUTATION_STATUS_FLAG_PAROLED",\
                                  "INMATE_COMPUTATION_STATUS_FLAG_POST REL", "RELEASE_DATE", "RELEASE_YEAR",\
                                  "RELEASE_365DAYS_DATE",'BEGIN_YEAR'])

new_file_name = file_name[:-4] + '_cleaned.csv'
Sentence_Computation.to_csv(new_file_name)

In [6]:
#Cleaning Disciplinary Infraction Charge
file_name = os.path.join(data_dir, "preprocessed/INMT9CF1.csv")
Infraction_Charge = pd.read_csv(file_name, low_memory=False)
Infraction_Charge = categorical_to_dummy(Infraction_Charge, ['DISCIPLINARY_CHARGE_LEVEL'])
Infraction_Charge = flag_to_dummy(Infraction_Charge, ["ACTIVATE_PRIOR_SUSPENSION"], rename=False)
Infraction_Charge =keep_attribute(Infraction_Charge, ["INMATE_DOC_NUMBER", "DISCIPLINARY_CHARGE_LEVEL_APPEAL",\
                                   "DISCIPLINARY_CHARGE_LEVEL_DISCP HEAR OFFC",\
                                   "DISCIPLINARY_CHARGE_LEVEL_UNIT",'ACTIVATE_PRIOR_SUSPENSION'])
#Shall we include?:
#'DISCIPLINARY_APPEAL_DECISION'

new_file_name = file_name[:-4] + '_cleaned.csv'
Infraction_Charge.to_csv(new_file_name)

In [7]:
#Cleaning Financial_Obligation

file_name = os.path.join(data_dir, "preprocessed/OFNT1BA1.csv")
Financial_Obligation = pd.read_csv(file_name, low_memory=False)
Financial_Obligation = categorical_to_dummy(Financial_Obligation, ["COURT_ORDERED_PAYMENT_TYPE"])
Financial_Obligation = flag_to_dummy(Financial_Obligation, ["PAYEE_ACCOUNT_STATUS_CODE"], rename=False)
#Financial_Obligation = discretize_variable(Financial_Obligation, ["COP_BALANCE"])
Financial_Obligation = keep_attribute(Financial_Obligation, ["OFFENDER_NC_DOC_ID_NUMBER", "COP_COMMITMENT_PREFIX",\
                                      "COP_ACCOUNT_SEQUENCE_NUMBER","COURT_ORDERED_PAYMENT_TYPE_FINE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_COMM. SERVICE FEE","PAYEE_ACCOUNT_STATUS_CODE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_COURT COSTS","COURT_ORDERED_PAYMENT_TYPE_JAIL FEE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_JUDGEMENT","COURT_ORDERED_PAYMENT_TYPE_RESTITUTION",\
                                      "COURT_ORDERED_PAYMENT_TYPE_SUPERVISION FEE", "COP_BALANCEcat"])

new_file_name = file_name[:-4] + '_cleaned.csv'
Financial_Obligation.to_csv(new_file_name)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [8]:
#Cleaning Court Commitment

file_name = os.path.join(data_dir, "preprocessed/OFNT3BB1.csv")
Court_Commitment = pd.read_csv(file_name, low_memory=False)#, nrows=40000)
#Court_Commitment = categorical_to_dummy(Court_Commitment, ["MOST_SERIOUS_OFFENSE_CODE"]) #TODO:convert to types of offenses
Court_Commitment = flag_to_dummy(Court_Commitment, ["NEW_PERIOD_OF_INCARCERATION_FL"], rename=False)
#Court_Commitment = discretize_variable(Court_Commitment, ["TOTAL_SENTENCE_LENGTH"])
Court_Commitment =  keep_attribute(Court_Commitment, ["OFFENDER_NC_DOC_ID_NUMBER",\
                                                      "COMMITMENT_PREFIX",\
                                                      "NEW_PERIOD_OF_INCARCERATION_FL",\
                                                      "MOST_SERIOUS_OFFENSE_CODE",\
                                                      "TOTAL_SENTENCE_LENGTH"])

new_file_name = file_name[:-4] + '_cleaned.csv'
Court_Commitment.to_csv(new_file_name)



In [9]:
#Cleaning Sentence_Component

file_name =  os.path.join(data_dir, "preprocessed/OFNT3CE1.csv")
Sentence_Component = pd.read_csv(file_name, low_memory=False)#, nrows=40000)

#I don't think we want to filter by FELON:
#Sentence_Component = Sentence_Component[Sentence_Component["PRIMARY_FELONY/MISDEMEANOR_CD."]=="FELON"]
Sentence_Component = to_int(Sentence_Component, ["MAXIMUM_SENTENCE_LENGTH"])
Sentence_Component = categorical_to_dummy(Sentence_Component, ["PRIMARY_FELONY/MISDEMEANOR_CD.",\
                                                               "PUNISHMENT_TYPE_CODE",\
                                                               "COURT_TYPE_CODE",\
                                                               "COMPONENT_DISPOSITION_CODE",\
                                                               "OFFENSE_QUALIFIER_CODE",\
                                                               "INMATE_SENTENCE_STATUS_CODE",\
                                                               "SERVING_MIN_OR_MAX_TERM_CODE"])
Sentence_Component = to_date(Sentence_Component, ["DATE_OFFENSE_COMMITTED_-_BEGIN",\
                                                  "SENTENCE_EFFECTIVE(BEGIN)_DATE",\
                                                  "INMATE_COMPONENT_STATUS_DATE"])
Sentence_Component = keep_attribute(Sentence_Component,['OFFENDER_NC_DOC_ID_NUMBER', 'COMMITMENT_PREFIX',\
                                   'SENTENCE_COMPONENT_NUMBER', 'COUNTY_OF_CONVICTION_CODE',\
                                   'PRIMARY_OFFENSE_CODE', 'DATE_OFFENSE_COMMITTED_-_BEGIN',\
                                   'MINIMUM_SENTENCE_LENGTH', 'MAXIMUM_SENTENCE_LENGTH',\
                                   'LENGTH_OF_SUPERVISION', 'SENTENCE_EFFECTIVE(BEGIN)_DATE',\
                                   'INMATE_COMPONENT_STATUS_DATE', 'PRIMARY_FELONY/MISDEMEANOR_CD._FELON',\
                                   'PRIMARY_FELONY/MISDEMEANOR_CD._MISD.',\
                                   'PRIMARY_FELONY/MISDEMEANOR_CD._UNKN.',\
                                   'PUNISHMENT_TYPE_CODE_ACTIVE  SS',\
                                   'PUNISHMENT_TYPE_CODE_COMMUNITY SS (DCC)', 'PUNISHMENT_TYPE_CODE_DWI',\
                                   'PUNISHMENT_TYPE_CODE_FAIR FELONS',\
                                   'PUNISHMENT_TYPE_CODE_FAIR MISDEMEAN',\
                                   'PUNISHMENT_TYPE_CODE_INTERMEDIATE SS',\
                                   'PUNISHMENT_TYPE_CODE_NON JUDGMENT CASES',\
                                   'PUNISHMENT_TYPE_CODE_NON-N.C. OFF.',\
                                   'PUNISHMENT_TYPE_CODE_POST RELEASE', 'PUNISHMENT_TYPE_CODE_PRE-FAIR',\
                                   'PUNISHMENT_TYPE_CODE_PRE-SS (FAIR) DCC', 'PUNISHMENT_TYPE_CODE_PSI',\
                                   'COURT_TYPE_CODE_CO RECORDR', 'COURT_TYPE_CODE_DISTRICT',\
                                   'COURT_TYPE_CODE_DOMES RELA', 'COURT_TYPE_CODE_ICC/OS/FED',\
                                   'COURT_TYPE_CODE_J.P.', 'COURT_TYPE_CODE_MAGISTRAT',\
                                   'COURT_TYPE_CODE_MAYORS', 'COURT_TYPE_CODE_SUPERIOR',\
                                   'COMPONENT_DISPOSITION_CODE_APPEAL',\
                                   'COMPONENT_DISPOSITION_CODE_BENCH TRIAL',\
                                   'COMPONENT_DISPOSITION_CODE_CRV/3M',\
                                   'COMPONENT_DISPOSITION_CODE_GUILTY',\
                                   'COMPONENT_DISPOSITION_CODE_JUDGMENT',\
                                   'COMPONENT_DISPOSITION_CODE_JURY TRIAL',\
                                   'COMPONENT_DISPOSITION_CODE_NEGOTIATED PLEA',\
                                   'COMPONENT_DISPOSITION_CODE_NOLO CONTENDRE',\
                                   'COMPONENT_DISPOSITION_CODE_NON-JUDGMENT',\
                                   'COMPONENT_DISPOSITION_CODE_NOT GUILTY',\
                                   'COMPONENT_DISPOSITION_CODE_PARTIAL REVOKE',\
                                   'COMPONENT_DISPOSITION_CODE_POST REL. REVOKED',\
                                   'COMPONENT_DISPOSITION_CODE_UNKNOWN',\
                                   'OFFENSE_QUALIFIER_CODE_ACCES A/F', 'OFFENSE_QUALIFIER_CODE_ACCES ATT',\
                                   'OFFENSE_QUALIFIER_CODE_ACCES B/F', 'OFFENSE_QUALIFIER_CODE_AID&ABET',\
                                   'OFFENSE_QUALIFIER_CODE_ATTEMPTED', 'OFFENSE_QUALIFIER_CODE_CONSPI ATT',\
                                   'OFFENSE_QUALIFIER_CODE_CONSPIRACY', 'OFFENSE_QUALIFIER_CODE_PRINCIPAL',\
                                   'OFFENSE_QUALIFIER_CODE_SOLIC ATT', 'OFFENSE_QUALIFIER_CODE_SOLICIT',\
                                   'OFFENSE_QUALIFIER_CODE_UNKNOWN', 'INMATE_SENTENCE_STATUS_CODE_ACTIVE',\
                                   'INMATE_SENTENCE_STATUS_CODE_AMENDED',\
                                   'INMATE_SENTENCE_STATUS_CODE_ARRESTED',\
                                   'INMATE_SENTENCE_STATUS_CODE_CANCEL',\
                                   'INMATE_SENTENCE_STATUS_CODE_COMM T/S',\
                                   'INMATE_SENTENCE_STATUS_CODE_COMMUTAT',\
                                   'INMATE_SENTENCE_STATUS_CODE_CORRECT',\
                                   'INMATE_SENTENCE_STATUS_CODE_COURT OR',\
                                   'INMATE_SENTENCE_STATUS_CODE_DISMISS',\
                                   'INMATE_SENTENCE_STATUS_CODE_MODIFIED',\
                                   'INMATE_SENTENCE_STATUS_CODE_P & T',\
                                   'INMATE_SENTENCE_STATUS_CODE_PAR CONS',\
                                   'INMATE_SENTENCE_STATUS_CODE_PARDON',\
                                   'INMATE_SENTENCE_STATUS_CODE_PC TERM',\
                                   'INMATE_SENTENCE_STATUS_CODE_QUASHED',\
                                   'INMATE_SENTENCE_STATUS_CODE_REINSTATE',\
                                   'INMATE_SENTENCE_STATUS_CODE_RESENTEN',\
                                   'INMATE_SENTENCE_STATUS_CODE_VACATED',\
                                   'SERVING_MIN_OR_MAX_TERM_CODE_MAX.TERM:',\
                                   'SERVING_MIN_OR_MAX_TERM_CODE_MIN.TERM:'])

new_file_name = file_name[:-4] + '_cleaned.csv'
Sentence_Component.to_csv(new_file_name)


In [10]:
#Cleaning Special_Cond_Sanctions

file_name = os.path.join(data_dir, "preprocessed/OFNT3DE1.csv")
Special_Cond = pd.read_csv(file_name, low_memory=False)#, nrows=40000)
Special_Cond = keep_attribute(Special_Cond,["OFFENDER_NC_DOC_ID_NUMBER",\
                                            "COMMITMENT_PREFIX",\
                                            "SENTENCE_COMPONENT_NUMBER",\
                                            "SPECIAL_PROVISION/SANCTION_CD",\
                                            "COMPLETION_STATUS_OF_SANCTION"])

new_file_name = file_name[:-4] + '_cleaned.csv'
Special_Cond.to_csv(new_file_name)

In [11]:
#Cleaning Parole_Analyst

file_name = os.path.join(data_dir, "preprocessed/INMT4CA1.csv")
Parole_Analyst = pd.read_csv(file_name, low_memory=False)#, nrows=40000)
Parole_Analyst = categorical_to_dummy(Parole_Analyst, ['NEXT_PAROLE_REVIEW_TYPE_CODE'])
Parole_Analyst = to_date(Parole_Analyst, ["RELEASE_DATE_(PAROLE_REVIEW)"])
Parole_Analyst = keep_attribute(Parole_Analyst,   ['INMATE_DOC_NUMBER', 'RELEASE_DATE_(PAROLE_REVIEW)',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_AFTERCARE TRT.',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_COMMISSION REVW',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_INTERIM REVIEW',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_MAPP REVIEW',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_PAROLE REVIEW',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_RT DRUG TEST',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_VOTE REVIEW',\
                                                   'NEXT_PAROLE_REVIEW_TYPE_CODE_WORK RELEASE'])

new_file_name = file_name[:-4] + '_cleaned.csv'
Parole_Analyst.to_csv(new_file_name)


In [12]:
con = sqlite3.connect(os.path.join(data_dir, "crimes.db"))
cur = con.cursor()

tables = ['INMT4AA1_cleaned', 'INMT4BB1_cleaned', 'INMT9CF1_cleaned',\
          'OFNT1BA1_cleaned', 'OFNT3BB1_cleaned', 'OFNT3CE1_cleaned',\
          'OFNT3DE1_cleaned', 'INMT4CA1_cleaned']

for table in tables:
    #print(table)
    file_name = os.path.join(data_dir, "preprocessed/{}.csv".format(table)) 
    col_names = pd.read_csv(file_name, nrows=0).columns
    n_columns = len(col_names)
    col_names = clean_str(', '.join(col_names))
    cur.execute('DROP TABLE IF EXISTS {}'.format(table))
    cur.execute("CREATE TABLE {} ({});".format(table, col_names))
    
    #File contains NULL bytes. That's why I replaced '\0' with ''
    reader = csv.reader(x.replace('\0','') for x in open(file_name))
    for row in reader:
        row = [None if x == '' else x for x in row]
        cur.execute("INSERT INTO {} VALUES ({});".format(table,",".join(['?']*n_columns)), row)

con.commit()
con.close()

In [13]:
#OTHER:
'''
Warrant_Issued = pd.read_csv("data/preprocessed/OFNT9BE1.csv")
Offender_profile = pd.read_csv("data/preprocessed/OFNT3AA1.csv")

Infraction_Charge.groupby("SUSPENSION_STATUS").size()

#Print type of data on the df:

for at in Inmate_Profile.columns:
    print(at)
    print(type(Inmate_Profile[at][0]))

#Cleaning Court Commitment

file_name = os.path.join(data_dir, "preprocessed/OFNT3BB1.csv")
Court_Commitment = pd.read_csv(file_name, low_memory=False, nrows=40000)
Court_Commitment = categorical_to_dummy(Court_Commitment, ["COURT_ORDERED_PAYMENT_TYPE"])
Court_Commitment = flag_to_dummy(Court_Commitment, ["PAYEE_ACCOUNT_STATUS_CODE"], rename=False)
#Court_Commitment = discretize_variable(Court_Commitment, ["COP_BALANCE"])
Court_Commitment =  keep_attribute(Court_Commitment, ["OFFENDER_NC_DOC_ID_NUMBER", "COP_COMMITMENT_PREFIX",\
                                      "COP_ACCOUNT_SEQUENCE_NUMBER","COURT_ORDERED_PAYMENT_TYPE_FINE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_COMM. SERVICE FEE","PAYEE_ACCOUNT_STATUS_CODE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_COURT COSTS","COURT_ORDERED_PAYMENT_TYPE_JAIL FEE",\
                                      "COURT_ORDERED_PAYMENT_TYPE_JUDGEMENT","COURT_ORDERED_PAYMENT_TYPE_RESTITUTION",\
                                      "COURT_ORDERED_PAYMENT_TYPE_SUPERVISION FEE", "COP_BALANCEcat"])

new_file_name = file_name[:-4] + '_cleaned.csv'
Court_Commitment.to_csv(new_file_name)
'''

'\nWarrant_Issued = pd.read_csv("data/preprocessed/OFNT9BE1.csv")\nOffender_profile = pd.read_csv("data/preprocessed/OFNT3AA1.csv")\n\nInfraction_Charge.groupby("SUSPENSION_STATUS").size()\n\n#Print type of data on the df:\n\nfor at in Inmate_Profile.columns:\n    print(at)\n    print(type(Inmate_Profile[at][0]))\n\n#Cleaning Court Commitment\n\nfile_name = os.path.join(data_dir, "preprocessed/OFNT3BB1.csv")\nCourt_Commitment = pd.read_csv(file_name, low_memory=False, nrows=40000)\nCourt_Commitment = categorical_to_dummy(Court_Commitment, ["COURT_ORDERED_PAYMENT_TYPE"])\nCourt_Commitment = flag_to_dummy(Court_Commitment, ["PAYEE_ACCOUNT_STATUS_CODE"], rename=False)\n#Court_Commitment = discretize_variable(Court_Commitment, ["COP_BALANCE"])\nCourt_Commitment =  keep_attribute(Court_Commitment, ["OFFENDER_NC_DOC_ID_NUMBER", "COP_COMMITMENT_PREFIX",                                      "COP_ACCOUNT_SEQUENCE_NUMBER","COURT_ORDERED_PAYMENT_TYPE_FINE",                                      "C

In [25]:
# JOIN  SentenceComputation and Cleaning Inmate Profile
Inmate_Profile["INMATE_DOC_NUMBER"] = Inmate_Profile["INMATE_DOC_NUMBER"].astype(str)
Sentence_Computation["INMATE_DOC_NUMBER"] = Sentence_Computation["INMATE_DOC_NUMBER"].astype(str)
Inmate_Sentence = Sentence_Computation.join(Inmate_Profile, on="INMATE_DOC_NUMBER", how='left',  rsuffix='2')
Inmate_Sentence.dropna(axis=0, inplace=True, subset=["INMATE_DOC_NUMBER2"])
Inmate_Sentence['dup'] = ~Inmate_Sentence.duplicated(subset="INMATE_DOC_NUMBER", keep='first')


In [28]:
"The table of Inmate_Profile&Sentence has {} total felonies commited - Those correspond to {} unique persons.\
Out of them there are {} females and {} males, this means that {}% are males. There are {} people that had \
prior incarcelations and there are {} inmates that have a history of scaping.\
In terms of race, there are {} black people, {} indian, {} white and {} of other or unknown race".\
format(Inmate_Sentence['INMATE_DOC_NUMBER2'].count(), Inmate_Sentence['dup'].sum(),\
(Inmate_Sentence['dup']*Inmate_Sentence['FEMALE']).sum(), (Inmate_Sentence['dup']*(1-Inmate_Sentence['FEMALE'])).sum(),\
((Inmate_Sentence['dup']*(1-Inmate_Sentence['FEMALE'])/ Inmate_Sentence['dup'].sum()).sum()),\
(Inmate_Sentence['PRIOR_INCARCERATIONS']*Inmate_Sentence['dup']).count(), Inmate_Sentence['ESCAPE_HISTORY'].sum(),\
(Inmate_Sentence['dup']*Inmate_Sentence['INMATE_RACE_CODE_BLACK']).sum(),(Inmate_Sentence['dup']*Inmate_Sentence['INMATE_RACE_CODE_INDIAN']).sum(),\
(Inmate_Sentence['dup']*Inmate_Sentence['INMATE_RACE_CODE_WHITE']).sum(),\
((Inmate_Sentence['dup']*Inmate_Sentence['INMATE_RACE_CODE_OTHER']).sum()+(Inmate_Sentence['dup']*Inmate_Sentence['INMATE_RACE_CODE_UNKNOWN']).sum()))

'The table of Inmate_Profile&Sentence has 644860 total felonies commited - Those correspond to 154099 unique persons.Out of them there are 19166.0 females and 134933.0 males, this means that 0.8756254096392576% are males. There are 639047 people that had prior incarcelations and there are 23252.0 inmates that have a history of scaping.In terms of race, there are 80751.0 black people, 3078.0 indian, 63144.0 white and 6738.0 of other or unknown race'

In [34]:
print(147347/(5396+147347))
Inmate_Sentence.groupby(['dup', 'PRIOR_INCARCERATIONS']).size()

0.9646726854913155


dup    PRIOR_INCARCERATIONS
False  0.0                      17526
       1.0                     468778
True   0.0                       5396
       1.0                     147347
dtype: int64

In [16]:
"The data base that we will work with has {} total sentences that correspond to {} unique persons".format(\
        Sentence_Computation['INMATE_DOC_NUMBER'].count(),Sentence_Computation['INMATE_DOC_NUMBER'].nunique())

'The data base that we will work with has 1704951 total sentences that correspond to 461681 unique persons'

In [17]:
"The table of Inmate Profile has {} total inmates that commited felonies - {} correspond to unique persons. \
There are {} persons that had prior incarcelations and there are {} inmates that have a history \
of scaping".format(Inmate_Profile['INMATE_DOC_NUMBER'].count(),Inmate_Profile['INMATE_DOC_NUMBER'].nunique(),\
                   Inmate_Profile['PRIOR_INCARCERATIONS'].sum(), Inmate_Profile['ESCAPE_HISTORY'].sum())


'The table of Inmate Profile has 296560 total inmates that commited felonies - 296560 correspond to unique persons. There are 282966.0 persons that had prior incarcelations and there are 10706 inmates that have a history of scaping'