## Load packages

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from scipy.stats import zscore



## Load and inspect data 

In [2]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_cleaned.csv", low_memory = False)

print(sentencing_data_cleaned.head())
print(sentencing_data_cleaned.shape)
print(sentencing_data_cleaned.info)

sentencing_data_cleaned.columns


        CASE_ID  CASE_PARTICIPANT_ID          RECEIVED_DATE  \
0  149765331439         175691153649  8/15/1984 12:00:00 AM   
1  150065796098         162105612284  8/23/1984 12:00:00 AM   
2  154954734978         225758446387   6/8/2001 12:00:00 AM   
3  155222744754         217349881776  1/31/2001 12:00:00 AM   
4  155327892699         217212381455   2/6/2001 12:00:00 AM   

    OFFENSE_CATEGORY  PRIMARY_CHARGE_FLAG    CHARGE_ID  CHARGE_VERSION_ID  \
0  PROMIS Conversion                 True  50510062193       112898098217   
1  PROMIS Conversion                 True  50792360681       113332130159   
2  PROMIS Conversion                 True  54885211141       174293345821   
3  PROMIS Conversion                 True  53899906462       280120721775   
4  PROMIS Conversion                 True  53938518259       164877860811   

                   DISPOSITION_CHARGED_OFFENSE_TITLE  CHARGE_COUNT  \
0                                FIRST DEGREE MURDER             1   
1                 

Index(['CASE_ID', 'CASE_PARTICIPANT_ID', 'RECEIVED_DATE', 'OFFENSE_CATEGORY',
       'PRIMARY_CHARGE_FLAG', 'CHARGE_ID', 'CHARGE_VERSION_ID',
       'DISPOSITION_CHARGED_OFFENSE_TITLE', 'CHARGE_COUNT', 'DISPOSITION_DATE',
       'DISPOSITION_CHARGED_CHAPTER', 'DISPOSITION_CHARGED_ACT',
       'DISPOSITION_CHARGED_SECTION', 'DISPOSITION_CHARGED_CLASS',
       'DISPOSITION_CHARGED_AOIC', 'CHARGE_DISPOSITION',
       'CHARGE_DISPOSITION_REASON', 'SENTENCE_JUDGE', 'SENTENCE_COURT_NAME',
       'SENTENCE_COURT_FACILITY', 'SENTENCE_PHASE', 'SENTENCE_DATE',
       'SENTENCE_TYPE', 'CURRENT_SENTENCE_FLAG', 'COMMITMENT_TYPE',
       'COMMITMENT_TERM', 'COMMITMENT_UNIT', 'LENGTH_OF_CASE_in_Days',
       'AGE_AT_INCIDENT', 'RACE', 'GENDER', 'INCIDENT_CITY',
       'INCIDENT_BEGIN_DATE', 'INCIDENT_END_DATE', 'LAW_ENFORCEMENT_AGENCY',
       'LAW_ENFORCEMENT_UNIT', 'ARREST_DATE', 'FELONY_REVIEW_DATE',
       'FELONY_REVIEW_RESULT', 'ARRAIGNMENT_DATE', 'UPDATED_OFFENSE_CATEGORY',
       'is_changed_

## Create Term Length and Sentence Length Variables

In [3]:
## Creating Term Length Variable

## subsetting to just units of days, years, months, and life sentences (Note: we are equating one life sentence to be 100 years)

sentencing_df = sentencing_data_cleaned[sentencing_data_cleaned["COMMITMENT_UNIT"].isin(["Year(s)", "Natural Life", "Days", "Months"]) & sentencing_data_cleaned["COMMITMENT_TYPE"].isin(["Illinois Department of Corrections", "Cook County Department of Corrections"]) & sentencing_data_cleaned["CHARGE_DISPOSITION"].isin(["Plea Of Guilty", "Verdict Guilty"]) & sentencing_data_cleaned["PRIMARY_CHARGE_FLAG"] == True]

print(sentencing_df.shape)

## Creating sentence length variable (in years)

sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)


sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else  
                                                       100 if row["COMMITMENT_UNIT"] == "Natural Life" else
                                                       row["COMMITMENT_TERM"] / 365 if row["COMMITMENT_UNIT"] == "Days" else 
                                                       row["COMMITMENT_TERM"] / 12 if row["COMMITMENT_UNIT"] == "Months" else None, axis = 1)


print(sentencing_df["sentence_length"].head)




(69346, 52)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)


<bound method NDFrame.head of 0          62.000000
1         100.000000
2           3.000000
7          20.000000
8           6.000000
             ...    
135158      1.000000
135159      0.328767
135162      0.123288
135163      0.123288
135164      0.123288
Name: sentence_length, Length: 69346, dtype: float64>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else


## Make sentencing unit standardizable by creating Z score 

In [4]:
## Need to convert sentencing unit to something standardizable across crimes 
## To do this will use a z score (how many standard deviations something is from the mean) grouped by the exact same charge type


sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)


print(sentencing_df["sentence_length_zscore"].head())



0    1.071864
1    2.846758
2   -0.734968
7    0.836916
8    0.064078
Name: sentence_length_zscore, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)


# Eliminate Outliers That Would Skew Z Scores Before Conducting Analysis

In [5]:
## Eliminating outliers by subsetting to judges that have seen a certain number of cases and eliminating crazy z scores

## Examples of why we need to eliminate outliers: 

sentencing_df[sentencing_df["sentence_length_zscore"] == max(sentencing_df["sentence_length_zscore"])]
sentencing_df[sentencing_df["sentence_length_zscore"] == min(sentencing_df["sentence_length_zscore"])]


## using IQR +/- 1.5 to remove outliers 

q1 = sentencing_df["sentence_length_zscore"].quantile(.25)
q3 = sentencing_df["sentence_length_zscore"].quantile(.75)

IQR = q3 - q1

sentencing_df_no_outliers = sentencing_df.loc[(sentencing_df["sentence_length_zscore"] > ((q1 - 1.5) * IQR)) & (sentencing_df["sentence_length_zscore"] < ((q3 + 1.5) * IQR))]

## Checking to see that this method worked and brought max and min magnitudes closer: 

sentencing_df_no_outliers[sentencing_df_no_outliers["sentence_length_zscore"] == max(sentencing_df_no_outliers["sentence_length_zscore"])]
sentencing_df_no_outliers[sentencing_df_no_outliers["sentence_length_zscore"] == min(sentencing_df_no_outliers["sentence_length_zscore"])]

## How many rows did we lose 

print(sentencing_df.shape, sentencing_df_no_outliers.shape)


Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,is_hisp_derived,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_zscore
116222,429410550059,853817163984,3/1/2018 12:00:00 AM,Narcotics,True,421881792765,797531172032,POSSESSION OF A CONTROLLED SUBSTANCE,1,8/30/2018 12:00:00 AM,...,False,False,False,True,31.0,2018-08-30,2018-08-01,judge_4,364.0,98.691965


Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,is_hisp_derived,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_zscore
133901,440611743299,888607576468,1/9/2020 12:00:00 AM,Driving With Suspended Or Revoked License,True,440285743720,822116942245,"FELONY DRIVING WHILE DRIVER?S LICENSE, PERMIT,...",1,10/20/2020 12:00:00 AM,...,False,False,False,True,56.0,2020-10-20,2020-10-01,judge_314,0.082192,-3.482217


Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,is_hisp_derived,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_zscore
8505,387465037559,724679353540,8/3/2011 12:00:00 AM,Aggravated Assault Police Officer Firearm,True,348806151893,648725011450,AGGRAVATED ASSAULT,1,1/4/2013 12:00:00 AM,...,False,False,False,True,20.0,2013-01-04,2013-01-01,judge_310,4.0,1.542805
19024,391597821734,736187668018,4/6/2012 12:00:00 AM,Aggravated Fleeing and Eluding,True,356411519518,663054231311,AGGRAVATED ASSAULT,1,8/3/2012 12:00:00 AM,...,False,False,False,True,42.0,2012-08-03,2012-08-01,judge_188,4.0,1.542805
28289,394963583958,745754179742,9/21/2012 12:00:00 AM,Aggravated Assault Police Officer,True,361516019164,672662910049,AGGRAVATED ASSAULT,1,5/1/2013 12:00:00 AM,...,False,False,False,True,38.0,2013-04-24,2013-04-01,judge_31,4.0,1.542805
38138,398617181339,756493455017,3/28/2013 12:00:00 AM,Aggravated Assault Police Officer,True,367681387982,684266304679,AGGRAVATED ASSAULT,1,9/9/2015 12:00:00 AM,...,True,False,False,True,31.0,2015-09-09,2015-09-01,judge_222,4.0,1.542805
39817,399292125410,758508962474,4/15/2013 12:00:00 AM,Aggravated Battery Police Officer,True,369518314049,687722958985,AGGRAVATED ASSAULT,1,4/2/2014 12:00:00 AM,...,False,False,False,False,36.0,2017-03-20,2017-03-01,judge_61,4.0,1.542805
43780,400806197077,763210963856,5/6/2013 12:00:00 AM,Aggravated Assault Police Officer,True,371283144652,691027804874,AGGRAVATED ASSAULT,1,10/22/2013 12:00:00 AM,...,False,False,False,True,27.0,2013-10-22,2013-10-01,judge_331,4.0,1.542805
51373,403730954721,772292779869,12/7/2013 12:00:00 AM,Aggravated Assault Police Officer,True,376043908810,699994497492,AGGRAVATED ASSAULT,1,9/1/2016 12:00:00 AM,...,False,False,False,True,36.0,2016-09-01,2016-09-01,judge_109,4.0,1.542805
83295,415709156030,809808190144,10/1/2015 12:00:00 AM,Aggravated Assault Police Officer,True,395903494339,737604869383,AGGRAVATED ASSAULT,1,2/26/2016 12:00:00 AM,...,False,False,False,True,27.0,2016-02-25,2016-02-01,judge_115,4.0,1.542805
108684,425841687824,842380666763,7/11/2017 12:00:00 AM,Aggravated Assault Police Officer,True,415173495830,774179674317,AGGRAVATED ASSAULT,1,3/27/2018 12:00:00 AM,...,False,False,False,True,20.0,2018-03-27,2018-03-01,judge_334,4.0,1.542805
113699,428262146525,850151841415,12/13/2017 12:00:00 AM,Aggravated Assault Police Officer,True,419733911017,782920943114,AGGRAVATED ASSAULT,1,8/30/2018 12:00:00 AM,...,False,False,False,True,22.0,2018-08-30,2018-08-01,judge_310,4.0,1.542805


Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,DISPOSITION_DATE,...,is_hisp_derived,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_zscore
103853,423700546708,835575486134,2/28/2017 12:00:00 AM,Aggravated DUI,True,411015085757,766301704956,AGGRAVATED FLEEING OR ATTEMPT TO ELUDE A PEACE...,1,11/20/2018 12:00:00 AM,...,False,False,False,True,20.0,2018-11-20,2018-11-01,judge_92,0.054795,-1.74384


(69346, 54) (65857, 54)


## Export data as csv file to Data folder 

In [6]:
## Exporting CSV to be used in the next file (Creating dependent variables)


sentencing_df_no_outliers.to_csv(r'../Data/sentencing_data_withZ.csv')