## Load packages

In [2]:
import pandas as pd
import numpy as np
import random
import re
import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from scipy.stats import zscore



## Load and inspect data 

In [3]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_cleaned.csv", low_memory = False)

sentencing_data_cleaned.head()
sentencing_data_cleaned.shape
sentencing_data_cleaned.info

Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,simplified_offense_derived,is_black_derived,is_hisp_derived,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived
0,149765331439,175691153649,8/15/1984 12:00:00 AM,PROMIS Conversion,True,50510062193,112898098217,FIRST DEGREE MURDER,1,12/17/2014 12:00:00 AM,...,Homicide,True,False,False,False,True,27.0,2014-10-16,2014-10-01,judge_40
1,150065796098,162105612284,8/23/1984 12:00:00 AM,PROMIS Conversion,True,50792360681,113332130159,FIRST DEGREE MURDER,1,8/6/2014 12:00:00 AM,...,Homicide,True,False,False,False,True,30.0,2014-08-06,2014-08-01,judge_310
2,154954734978,225758446387,6/8/2001 12:00:00 AM,PROMIS Conversion,True,54885211141,174293345821,VIO BAIL BOND/CLASS 1,1,12/2/2013 12:00:00 AM,...,PROMIS Conversion,False,True,False,False,True,38.0,2013-12-02,2013-12-01,judge_162
3,155222744754,217349881776,1/31/2001 12:00:00 AM,PROMIS Conversion,True,53899906462,280120721775,POSS AMT CON SUB EXCEPT (A)/(D),1,9/10/2012 12:00:00 AM,...,Narcotics,True,False,False,False,False,33.0,2012-09-10,2012-09-01,judge_331
4,155327892699,217212381455,2/6/2001 12:00:00 AM,PROMIS Conversion,True,53938518259,164877860811,DUI LIC SUSPENDED OR REVOKED (EFFECTIVE 4-13-2...,1,9/19/2014 12:00:00 AM,...,PROMIS Conversion,False,False,True,False,True,49.0,2014-09-19,2014-09-01,judge_314


(135165, 52)

<bound method DataFrame.info of              CASE_ID  CASE_PARTICIPANT_ID          RECEIVED_DATE  \
0       149765331439         175691153649  8/15/1984 12:00:00 AM   
1       150065796098         162105612284  8/23/1984 12:00:00 AM   
2       154954734978         225758446387   6/8/2001 12:00:00 AM   
3       155222744754         217349881776  1/31/2001 12:00:00 AM   
4       155327892699         217212381455   2/6/2001 12:00:00 AM   
...              ...                  ...                    ...   
135160  445516248775         905289187184   2/3/2021 12:00:00 AM   
135161  445527409730         905322500577   2/4/2021 12:00:00 AM   
135162  445587767000         905518794790   2/9/2021 12:00:00 AM   
135163  445592613204         905533705601   2/9/2021 12:00:00 AM   
135164  445593200623         905535215556   2/9/2021 12:00:00 AM   

                                 OFFENSE_CATEGORY  PRIMARY_CHARGE_FLAG  \
0                               PROMIS Conversion                 True   
1  

## Create Term Length and Sentence Length Variables

In [4]:
## Creating Term Length Variable

## subsetting to just units of days, years, months, and life sentences (Note: we are equating one life sentence to be 100 years)

sentencing_df = sentencing_data_cleaned[sentencing_data_cleaned["COMMITMENT_UNIT"].isin(["Year(s)", "Natural Life", "Days", "Months"]) & sentencing_data_cleaned["COMMITMENT_TYPE"].isin(["Illinois Department of Corrections", "Cook County Department of Corrections"])]


sentencing_df["COMMITMENT_TYPE"].unique()
sentencing_df[["COMMITMENT_UNIT", "COMMITMENT_TERM"]][sentencing_df["COMMITMENT_TYPE"] == "Cook County Boot Camp"]




## Creating sentence length variable (in years)

sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)


sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else  
                                                       100 if row["COMMITMENT_UNIT"] == "Natural Life" else
                                                       row["COMMITMENT_TERM"] / 365 if row["COMMITMENT_UNIT"] == "Days" else 
                                                       row["COMMITMENT_TERM"] / 12 if row["COMMITMENT_UNIT"] == "Months" else None, axis = 1)



array(['Illinois Department of Corrections',
       'Cook County Department of Corrections'], dtype=object)

Unnamed: 0,COMMITMENT_UNIT,COMMITMENT_TERM


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else


## Make sentencing unit standardizable by creating Z score 

In [14]:
## Need to convert sentencing unit to something standardizable across crimes 
## To do this will use a z score (how many standard deviations something is from the mean) grouped by the exact same charge type


sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)

## looking at z score stats just quickly

np.mean(sentencing_df["sentence_length_zscore"])

## 0 z score would mean perfectly average or neutral sentence length for the crime


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)


4.088176049413104e-15

## Add mean sentence length for offense type column to df

In [15]:
sentencing_df["sentence_length_mean"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(np.mean)

sentencing_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length_mean"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(np.mean)


Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_mean,sentence_length_zscore
0,149765331439,175691153649,8/15/1984 12:00:00 AM,PROMIS Conversion,True,50510062193,112898098217,FIRST DEGREE MURDER,1,12/17/2014 12:00:00 AM,...,False,False,True,27.0,2014-10-16,2014-10-01,judge_40,62.0,40.575758,0.960749
1,150065796098,162105612284,8/23/1984 12:00:00 AM,PROMIS Conversion,True,50792360681,113332130159,FIRST DEGREE MURDER,1,8/6/2014 12:00:00 AM,...,False,False,True,30.0,2014-08-06,2014-08-01,judge_310,100.0,40.575758,2.664821
2,154954734978,225758446387,6/8/2001 12:00:00 AM,PROMIS Conversion,True,54885211141,174293345821,VIO BAIL BOND/CLASS 1,1,12/2/2013 12:00:00 AM,...,False,False,True,38.0,2013-12-02,2013-12-01,judge_162,3.0,4.1,-0.734968
5,155685924389,221749325828,6/15/2001 12:00:00 AM,PROMIS Conversion,True,55017536987,687589180737,AGGRAVATED CRIMINAL SEXUAL ASSAULT,1,5/3/2013 12:00:00 AM,...,False,False,True,,2013-05-03,2013-05-01,judge_287,10.0,15.089835,-0.354557
7,156558681704,231553844285,7/17/2001 12:00:00 AM,PROMIS Conversion,True,58235136444,720561595284,2ND DEGREE MURDER,1,12/4/2014 12:00:00 AM,...,False,False,True,17.0,2014-12-04,2014-12-01,judge_329,20.0,14.797468,0.942273


## Export data as csv file to Data folder 

In [16]:
sentencing_df.to_csv(r'../Data/sentencing_data_withZ.csv')