## Creating Additional Columns (Independent Variables) Needed For Analysis

## Load packages

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os
import matplotlib.pyplot as plt

## Note the code below is not a function, it just resets the output so it shows all lines

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load and Inspect Cleaned Data (with Z Scores)

In [2]:
sentencing_df = pd.read_csv("../Data/sentencing_data_withZ.csv", low_memory = False)

In [3]:
print(sentencing_df.head(), sentencing_df.info(), sentencing_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65857 entries, 0 to 65856
Data columns (total 55 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         65857 non-null  int64  
 1   CASE_ID                            65857 non-null  int64  
 2   CASE_PARTICIPANT_ID                65857 non-null  int64  
 3   RECEIVED_DATE                      65857 non-null  object 
 4   OFFENSE_CATEGORY                   65857 non-null  object 
 5   PRIMARY_CHARGE_FLAG                65857 non-null  bool   
 6   CHARGE_ID                          65857 non-null  int64  
 7   CHARGE_VERSION_ID                  65857 non-null  int64  
 8   DISPOSITION_CHARGED_OFFENSE_TITLE  65857 non-null  object 
 9   CHARGE_COUNT                       65857 non-null  int64  
 10  DISPOSITION_DATE                   65857 non-null  object 
 11  DISPOSITION_CHARGED_CHAPTER        65857 non-null  obj

## Changing existing boolean race and gender data to binary

In [10]:
## for regression, is_(race)_derived and is_male_derived variables need to be binary

sentencing_df[["is_black_derived", "is_white_derived", "is_hisp_derived", "is_other_derived", "is_male_derived"]] = sentencing_df[["is_black_derived", "is_white_derived", "is_hisp_derived", "is_other_derived", "is_male_derived"]].astype(int)


## Checking to make sure binary variables created properly

print(sentencing_df[["is_black_derived", "is_other_derived", "is_hisp_derived", "is_other_derived", "is_male_derived"]])

       is_black_derived  is_other_derived  is_hisp_derived  is_other_derived  \
0                     1                 0                0                 0   
1                     0                 0                1                 0   
2                     1                 0                0                 0   
3                     0                 0                1                 0   
4                     1                 0                0                 0   
...                 ...               ...              ...               ...   
65852                 0                 0                0                 0   
65853                 1                 0                0                 0   
65854                 0                 0                0                 0   
65855                 0                 0                0                 0   
65856                 0                 0                0                 0   

       is_male_derived  
0             

## Creating is_guilty_plea and is_innocent_plea

In [11]:
## Need to create a binary variable for innocent and guilty pleas out of CHARGE_DISPOSITION

## Binary Guilty: 

sentencing_df["is_guilty_plea"] = np.where(sentencing_df["CHARGE_DISPOSITION"] == "Plea Of Guilty", 1, 0)

## Binary Not Guilty:

sentencing_df["is_innocent_plea"] = np.where(sentencing_df["CHARGE_DISPOSITION"] == "Plea Of Guilty", 0, 1)


## Checking to make sure the variable worked: 

print(sentencing_df[["is_guilty_plea", "is_innocent_plea", "CHARGE_DISPOSITION"]])

       is_guilty_plea  is_innocent_plea CHARGE_DISPOSITION
0                   1                 0     Plea Of Guilty
1                   1                 0     Plea Of Guilty
2                   1                 0     Plea Of Guilty
3                   1                 0     Plea Of Guilty
4                   0                 1     Verdict Guilty
...               ...               ...                ...
65852               1                 0     Plea Of Guilty
65853               1                 0     Plea Of Guilty
65854               1                 0     Plea Of Guilty
65855               1                 0     Plea Of Guilty
65856               1                 0     Plea Of Guilty

[65857 rows x 3 columns]


## Creating is_female_derived (is_male_derived already exists)

In [12]:
## Need to create a binary variable for females to complement is_male

sentencing_df["is_female_derived"] = np.where(sentencing_df["GENDER"] == "Female", 1, 0)

## Checking to make sure it worked:

print(sentencing_df[["is_female_derived", "GENDER"]])


       is_female_derived GENDER
0                      0   Male
1                      0   Male
2                      0   Male
3                      0   Male
4                      0   Male
...                  ...    ...
65852                  0   Male
65853                  0   Male
65854                  0   Male
65855                  0   Male
65856                  0   Male

[65857 rows x 2 columns]


## Converting Sentenceymd_derived from String to Datetime object

In [13]:
## Converting to Datetime

sentencing_df["sentenceymd_derived_dt"] = pd.to_datetime(sentencing_df["sentenceymd_derived"])

## Checking to make sure it worked 

print(type(sentencing_df["sentenceymd_derived_dt"][1]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


## Creating Nth Case (To measure how judicial experience corresponds to severity) 

In [8]:
## Creating a variable for each case that indicates how many cases the assigned judge has seen (measure of judicial experience)
## Did not make this into a function because the task is too specific

## Sorting Values by Sentence Date and Judge: 

sentencing_df_sorted = sentencing_df.sort_values(["SENTENCE_JUDGE", "sentenceymd_derived_dt"])

## checking to make suresorting worked 

print(sentencing_df_sorted.head)

## Creating column for Nth case (nth_case)

sentencing_df_sorted["nth_case"] = sentencing_df_sorted.groupby("SENTENCE_JUDGE").cumcount() + 1

## checking to make sure nth case works:

print(sentencing_df_sorted[["nth_case", "SENTENCE_JUDGE", "sentenceymd_derived_dt"]])


<bound method NDFrame.head of        Unnamed: 0       CASE_ID  CASE_PARTICIPANT_ID           RECEIVED_DATE  \
57587      116398  429485886505         854062814867    3/7/2018 12:00:00 AM   
58879      119085  430780557292         858166118899   5/25/2018 12:00:00 AM   
62770      127700  435531599636         872618846575   3/12/2019 12:00:00 AM   
60794      123275  432903928993         864718665101   9/26/2018 12:00:00 AM   
60718      123109  432818606428         864445174414   9/20/2018 12:00:00 AM   
...           ...           ...                  ...                     ...   
43250       87511  417323529438         814831906416    1/9/2016 12:00:00 AM   
42545       85980  416715404242         812927097571  11/28/2015 12:00:00 AM   
52248      105567  424442309656         837900156976   4/14/2017 12:00:00 AM   
52514      106082  424655395784         838575767684   4/27/2017 12:00:00 AM   
42211       85308  416479702494         812173252295  11/12/2015 12:00:00 AM   

         

## Exporting CSV to Data folder

In [9]:
## Data to be used in regression and visualizations

sentencing_df_sorted.to_csv(r'../Data/sentencing_data_for_analysis.csv')