## Cleaning Data and Creating Sentence Severity Score

## Load packages

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os

## Note the code below is not a function, it just resets the output so it shows all lines

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Functions

In [2]:
## Z Score

from scipy.stats import zscore


"""
Scipy Source Docstring: 

    Compute the z score.
    Compute the z score of each value in the sample, relative to the
    sample mean and standard deviation.
    Parameters
    ----------
    a : array_like
        An array like object containing the sample data.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    ddof : int, optional
        Degrees of freedom correction in the calculation of the
        standard deviation. Default is 0.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.  Note that when the value is 'omit',
        nans in the input also propagate to the output, but they do not affect
        the z-scores computed for the non-nan values.
    Returns
    -------
    zscore : array_like
        The z-scores, standardized by mean and standard deviation of
        input array `a`.

"""

"\nScipy Source Docstring: \n\n    Compute the z score.\n    Compute the z score of each value in the sample, relative to the\n    sample mean and standard deviation.\n    Parameters\n    ----------\n    a : array_like\n        An array like object containing the sample data.\n    axis : int or None, optional\n        Axis along which to operate. Default is 0. If None, compute over\n        the whole array `a`.\n    ddof : int, optional\n        Degrees of freedom correction in the calculation of the\n        standard deviation. Default is 0.\n    nan_policy : {'propagate', 'raise', 'omit'}, optional\n        Defines how to handle when input contains nan. 'propagate' returns nan,\n        'raise' throws an error, 'omit' performs the calculations ignoring nan\n        values. Default is 'propagate'.  Note that when the value is 'omit',\n        nans in the input also propagate to the output, but they do not affect\n        the z-scores computed for the non-nan values.\n    Returns\n    

In [3]:
## Remove Outliers

def remove_outliers(df, col):
    
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    
    iqr = q3 - q1
    
    lower = (q1 - 1.5) * iqr
    upper = (q3 + 1.5) * iqr
    
    return df.loc[(df[col] > lower) & (df[col] < upper)]

""""
    Remove outliers from a df using IQR * +/- 1.5 Method 

    Parameters:
    
    df: pandas.DataFrame
        The DataFrame from which to remove outliers
        
    col: str
        The Column from which to remove outliers

    Outputs:
    
    pandas.DataFrame
        A new DataFrame subsetted to remove rows with outliers in the column in question
"""
        
    

'"\n    Remove outliers from a df using IQR * +/- 1.5 Method \n\n    Parameters:\n    \n    df: pandas.DataFrame\n        The DataFrame from which to remove outliers\n        \n    col: str\n        The Column from which to remove outliers\n\n    Outputs:\n    \n    pandas.DataFrame\n        A new DataFrame subsetted to remove rows with outliers in the column in question\n'

## Load and inspect data 

In [4]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_cleaned.csv", low_memory = False)

print(sentencing_data_cleaned.head())
print(sentencing_data_cleaned.shape)
print(sentencing_data_cleaned.info)


        CASE_ID  CASE_PARTICIPANT_ID          RECEIVED_DATE  \
0  149765331439         175691153649  8/15/1984 12:00:00 AM   
1  150065796098         162105612284  8/23/1984 12:00:00 AM   
2  154954734978         225758446387   6/8/2001 12:00:00 AM   
3  155222744754         217349881776  1/31/2001 12:00:00 AM   
4  155327892699         217212381455   2/6/2001 12:00:00 AM   

    OFFENSE_CATEGORY  PRIMARY_CHARGE_FLAG    CHARGE_ID  CHARGE_VERSION_ID  \
0  PROMIS Conversion                 True  50510062193       112898098217   
1  PROMIS Conversion                 True  50792360681       113332130159   
2  PROMIS Conversion                 True  54885211141       174293345821   
3  PROMIS Conversion                 True  53899906462       280120721775   
4  PROMIS Conversion                 True  53938518259       164877860811   

                   DISPOSITION_CHARGED_OFFENSE_TITLE  CHARGE_COUNT  \
0                                FIRST DEGREE MURDER             1   
1                 

## Create Standard Sentence Length Variable

In [5]:
## Creating standardized sentence length (sentence_length) variable

## Subsetting to just units of days, years, months, and life sentences (Note: we are equating one life sentence to be 100 years)

sentencing_df = sentencing_data_cleaned[sentencing_data_cleaned["COMMITMENT_UNIT"].isin(["Year(s)", "Natural Life", "Days", "Months"]) & sentencing_data_cleaned["COMMITMENT_TYPE"].isin(["Illinois Department of Corrections", "Cook County Department of Corrections"]) & sentencing_data_cleaned["CHARGE_DISPOSITION"].isin(["Plea Of Guilty", "Verdict Guilty"]) & sentencing_data_cleaned["PRIMARY_CHARGE_FLAG"] == True]

## Checking to make sure subset is correct

print(sentencing_df.shape)

## Need to change Commitment Term from String to Float

sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)


## Creating sentence length variable (in years) (Didn't make this a function because we used this methodology in class, and also because it's too specific an operation to generalize)

sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else  
                                                       100 if row["COMMITMENT_UNIT"] == "Natural Life" else
                                                       row["COMMITMENT_TERM"] / 365 if row["COMMITMENT_UNIT"] == "Days" else 
                                                       row["COMMITMENT_TERM"] / 12 if row["COMMITMENT_UNIT"] == "Months" else None, axis = 1)

## Check to make sure variable created currectly 

print(sentencing_df["sentence_length"].head)




(69346, 52)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["COMMITMENT_TERM"] = sentencing_df["COMMITMENT_TERM"].astype(float)


<bound method NDFrame.head of 0          62.000000
1         100.000000
2           3.000000
7          20.000000
8           6.000000
             ...    
135158      1.000000
135159      0.328767
135162      0.123288
135163      0.123288
135164      0.123288
Name: sentence_length, Length: 69346, dtype: float64>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length"] = sentencing_df.apply(lambda row: row["COMMITMENT_TERM"] if row["COMMITMENT_UNIT"] == "Year(s)" else


## Make Sentence Severity Score using Z score 

In [6]:
## Need to convert sentencing unit to something standardizable across crimes 

## To do this will use a z score (how many standard deviations something is from the mean) grouped by the exact same charge type (DISPOSITION_CHARGED_OFFENSE_TITLE)


sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)

## Make sure variable created correctly

print(sentencing_df["sentence_length_zscore"].head())



0    1.071864
1    2.846758
2   -0.734968
7    0.836916
8    0.064078
Name: sentence_length_zscore, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentencing_df["sentence_length_zscore"] = sentencing_df.groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["sentence_length"].transform(zscore)


# Eliminate Outliers (from sentence_length_zscore) That Would Skew Analysis

In [7]:
## Eliminating outliers w basic IQR * 1.5 Method 

## Examples of why we need to eliminate outliers (max and min magnitudes are way different): 

print(sentencing_df["sentence_length_zscore"][sentencing_df["sentence_length_zscore"] == max(sentencing_df["sentence_length_zscore"])])
print(sentencing_df["sentence_length_zscore"][sentencing_df["sentence_length_zscore"] == min(sentencing_df["sentence_length_zscore"])])

## using remove_outliers() (defined above) to remove outliers using IQR method: 

sentencing_df_no_outliers = remove_outliers(sentencing_df, "sentence_length_zscore")

## Checking to see that this method worked and brought max and min magnitudes closer: 

print(sentencing_df_no_outliers["sentence_length_zscore"][sentencing_df_no_outliers["sentence_length_zscore"] == max(sentencing_df_no_outliers["sentence_length_zscore"])])
print(sentencing_df_no_outliers["sentence_length_zscore"][sentencing_df_no_outliers["sentence_length_zscore"] == min(sentencing_df_no_outliers["sentence_length_zscore"])])

## Checking how many rows did we lose: 

print(sentencing_df.shape, sentencing_df_no_outliers.shape)


116222    98.691965
Name: sentence_length_zscore, dtype: float64
133901   -3.482217
Name: sentence_length_zscore, dtype: float64
8505      1.542805
19024     1.542805
28289     1.542805
38138     1.542805
39817     1.542805
43780     1.542805
51373     1.542805
83295     1.542805
108684    1.542805
113699    1.542805
114108    1.542805
Name: sentence_length_zscore, dtype: float64
103853   -1.74384
Name: sentence_length_zscore, dtype: float64
(69346, 54) (65857, 54)


## Export data as csv file to Data folder 

In [8]:
## Exporting CSV to be used in the next file (Creating dependent variables)

sentencing_df_no_outliers.to_csv(r'../Data/sentencing_data_withZ.csv')