## Load packages

In [20]:
import pandas as pd
import numpy as np
import random
import re
import os
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load and inspect Data

In [8]:
sentencing_df = pd.read_csv("../Data/sentencing_data_withZ.csv", low_memory = False)

In [9]:
sentencing_df.head()
sentencing_df.info()
sentencing_df.shape

Unnamed: 0.1,Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,PRIMARY_CHARGE_FLAG,CHARGE_ID,CHARGE_VERSION_ID,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_COUNT,...,is_white_derived,is_other_derived,is_male_derived,age_derived,sentenceymd_derived,sentenceym_derived,judgeid_derived,sentence_length,sentence_length_zscore,judges_total_cases
0,0,149765331439,175691153649,8/15/1984 12:00:00 AM,PROMIS Conversion,True,50510062193,112898098217,FIRST DEGREE MURDER,1,...,False,False,True,27.0,2014-10-16,2014-10-01,judge_40,62.0,-0.014996,769
1,1,163734000970,119572148070,5/27/2002 12:00:00 AM,PROMIS Conversion,True,60663908958,740041557966,POSSESSION OF A CONTROLLED SUBSTANCE,1,...,False,False,True,26.0,2015-11-09,2015-11-01,judge_40,1.0,-0.258566,769
2,2,182050890480,179762088011,8/23/2004 12:00:00 AM,PROMIS Conversion,True,69728430461,658529032588,AGGRAVATED BATTERY,1,...,False,False,True,35.0,2012-02-08,2012-02-01,judge_40,10.0,2.181622,769
3,3,188763617530,185554465998,8/9/2005 12:00:00 AM,PROMIS Conversion,True,77575875953,740046322415,VIOLATION OF BAIL BOND,1,...,False,False,True,,2015-11-09,2015-11-01,judge_40,2.0,-0.439882,769
4,4,195256796846,190097450194,2/21/2005 12:00:00 AM,PROMIS Conversion,True,84905279838,212776826308,ATTEMPT FIRST DEGREE MURDER,1,...,False,False,True,54.0,2012-04-05,2012-04-01,judge_40,40.0,1.460349,769


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73222 entries, 0 to 73221
Data columns (total 56 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         73222 non-null  int64  
 1   CASE_ID                            73222 non-null  int64  
 2   CASE_PARTICIPANT_ID                73222 non-null  int64  
 3   RECEIVED_DATE                      73222 non-null  object 
 4   OFFENSE_CATEGORY                   73222 non-null  object 
 5   PRIMARY_CHARGE_FLAG                73222 non-null  bool   
 6   CHARGE_ID                          73222 non-null  int64  
 7   CHARGE_VERSION_ID                  73222 non-null  int64  
 8   DISPOSITION_CHARGED_OFFENSE_TITLE  73222 non-null  object 
 9   CHARGE_COUNT                       73222 non-null  int64  
 10  DISPOSITION_DATE                   73222 non-null  object 
 11  DISPOSITION_CHARGED_CHAPTER        73222 non-null  obj

(73222, 56)

## Investigate Judges Z scores 

In [10]:
## Investigating which judges have the highest and lowest mean z scores 

## About 200 judges

judges = sentencing_df.groupby("SENTENCE_JUDGE")["sentence_length_zscore"].apply(np.mean).sort_values(ascending = False)

## list of top 50% and bottom 50% judges in terms of sentencing severity

max_judges = judges[:123].index

min_judges = judges[124:].index

max_subset = sentencing_df[sentencing_df["SENTENCE_JUDGE"].isin(max_judges)]

min_subset = sentencing_df[sentencing_df["SENTENCE_JUDGE"].isin(min_judges)]


min_subset.columns

Index(['Unnamed: 0', 'CASE_ID', 'CASE_PARTICIPANT_ID', 'RECEIVED_DATE',
       'OFFENSE_CATEGORY', 'PRIMARY_CHARGE_FLAG', 'CHARGE_ID',
       'CHARGE_VERSION_ID', 'DISPOSITION_CHARGED_OFFENSE_TITLE',
       'CHARGE_COUNT', 'DISPOSITION_DATE', 'DISPOSITION_CHARGED_CHAPTER',
       'DISPOSITION_CHARGED_ACT', 'DISPOSITION_CHARGED_SECTION',
       'DISPOSITION_CHARGED_CLASS', 'DISPOSITION_CHARGED_AOIC',
       'CHARGE_DISPOSITION', 'CHARGE_DISPOSITION_REASON', 'SENTENCE_JUDGE',
       'SENTENCE_COURT_NAME', 'SENTENCE_COURT_FACILITY', 'SENTENCE_PHASE',
       'SENTENCE_DATE', 'SENTENCE_TYPE', 'CURRENT_SENTENCE_FLAG',
       'COMMITMENT_TYPE', 'COMMITMENT_TERM', 'COMMITMENT_UNIT',
       'LENGTH_OF_CASE_in_Days', 'AGE_AT_INCIDENT', 'RACE', 'GENDER',
       'INCIDENT_CITY', 'INCIDENT_BEGIN_DATE', 'INCIDENT_END_DATE',
       'LAW_ENFORCEMENT_AGENCY', 'LAW_ENFORCEMENT_UNIT', 'ARREST_DATE',
       'FELONY_REVIEW_DATE', 'FELONY_REVIEW_RESULT', 'ARRAIGNMENT_DATE',
       'UPDATED_OFFENSE_CATEGORY'

## Investigate where judges biases lie 

In [11]:
max_subset["is_white_derived"].value_counts()
min_subset["is_white_derived"].value_counts()

## number of cases 
max_subset["CASE_ID"].nunique()
min_subset["CASE_ID"].nunique()

False    51334
True      6430
Name: is_white_derived, dtype: int64

False    11902
True      2231
Name: is_white_derived, dtype: int64

57764

14133

## Investigate impact number of cases has on judges mean severity scores 

In [30]:
#determine the number of cases for each judge 
judges_total_cases_df=sentencing_df.groupby('judgeid_derived').CASE_ID.count().reset_index().rename(columns={'CASE_ID': 'judges_total_cases'})
judges_total_cases_df.judgeid_derived.sort_values()
judges_total_cases_df

#merge with sentencing_df to add column judges_total_cases
sentencing_df = pd.merge(sentencing_df, judges_total_cases_df, on='judgeid_derived')

0        judge_1
1       judge_10
2      judge_100
3      judge_101
4      judge_102
         ...    
223     judge_93
224     judge_94
225     judge_95
226     judge_98
227     judge_99
Name: judgeid_derived, Length: 228, dtype: object

Unnamed: 0,judgeid_derived,judges_total_cases
0,judge_1,69
1,judge_10,408
2,judge_100,686
3,judge_101,2
4,judge_102,4
...,...,...
223,judge_93,1
224,judge_94,1
225,judge_95,1
226,judge_98,7
