## Modeling Analysis Data To Look for Causality Between Our Descriptive Variables and Sentence Severity (Decision Tree Model)

## Load Packages: 

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os


## Note the code below is not a function, it just resets the output so it shows all lines

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



## Functions

In [2]:
from sklearn.tree import DecisionTreeRegressor

"""
Sklearn DecisionTreeRegressor Source Docstring:

    A decision tree regressor.
    Read more in the :ref:`User Guide <tree>`.
    Parameters
    ----------
    criterion : {"squared_error", "friedman_mse", "absolute_error", \
            "poisson"}, default="squared_error"
        The function to measure the quality of a split. Supported criteria
        are "squared_error" for the mean squared error, which is equal to
        variance reduction as feature selection criterion and minimizes the L2
        loss using the mean of each terminal node, "friedman_mse", which uses
        mean squared error with Friedman's improvement score for potential
        splits, "absolute_error" for the mean absolute error, which minimizes
        the L1 loss using the median of each terminal node, and "poisson" which
        uses reduction in Poisson deviance to find splits.
        .. versionadded:: 0.18
           Mean Absolute Error (MAE) criterion.
        .. versionadded:: 0.24
            Poisson deviance criterion.
    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.
    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.
    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:
        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
        .. versionchanged:: 0.18
           Added float values for fractions.
    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches.  This may have the effect of smoothing the model,
        especially in regression.
        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
        .. versionchanged:: 0.18
           Added float values for fractions.
    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.
    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:
        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.
        .. deprecated:: 1.1
            The `"auto"` option was deprecated in 1.1 and will be removed
            in 1.3.
        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the estimator. The features are always
        randomly permuted at each split, even if ``splitter`` is set to
        ``"best"``. When ``max_features < n_features``, the algorithm will
        select ``max_features`` at random at each split before finding the best
        split among them. But the best found split may vary across different
        runs, even if ``max_features=n_features``. That is the case, if the
        improvement of the criterion is identical for several splits and one
        split has to be selected at random. To obtain a deterministic behaviour
        during fitting, ``random_state`` has to be fixed to an integer.
        See :term:`Glossary <random_state>` for details.
    max_leaf_nodes : int, default=None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.
    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
        The weighted impurity decrease equation is the following::
            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)
        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.
        .. versionadded:: 0.19
    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning. The
        subtree with the largest cost complexity that is smaller than
        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
        :ref:`minimal_cost_complexity_pruning` for details.
        .. versionadded:: 0.22
    Attributes
    ----------
    feature_importances_ : ndarray of shape (n_features,)
        The feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the
        (normalized) total reduction of the criterion brought
        by that feature. It is also known as the Gini importance [4]_.
        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.
    max_features_ : int
        The inferred value of max_features.
    n_features_in_ : int
        Number of features seen during :term:`fit`.
        .. versionadded:: 0.24
    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.
        .. versionadded:: 1.0
    n_outputs_ : int
        The number of outputs when ``fit`` is performed.
    tree_ : Tree instance
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
        :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
        for basic usage of these attributes.
    See Also
    --------
    DecisionTreeClassifier : A decision tree classifier.
    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.
    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
    .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.
    .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
           Learning", Springer, 2009.
    .. [4] L. Breiman, and A. Cutler, "Random Forests",
           https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

"""



In [3]:
from sklearn.model_selection import train_test_split
"""
Sklearn train_test_split Source Docstring:


    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, default=None
        Controls the shuffling applied to the data before applying the split.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    shuffle : bool, default=True
        Whether or not to shuffle the data before splitting. If shuffle=False
        then stratify must be None.
    stratify : array-like, default=None
        If not None, data is split in a stratified fashion, using this as
        the class labels.
        Read more in the :ref:`User Guide <stratification>`.
    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.
        .. versionadded:: 0.16
            If the input is sparse, the output will be a
            ``scipy.sparse.csr_matrix``. Else, output type is the same as the
            input type.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.33, random_state=42)
    ...
    >>> X_train
    array([[4, 5],
           [0, 1],
           [6, 7]])
    >>> y_train
    [2, 0, 3]
    >>> X_test
    array([[2, 3],
           [8, 9]])
    >>> y_test
    [1, 4]
    >>> train_test_split(y, shuffle=False)
    [[0, 1, 2], [3, 4]]
"""



'\nSklearn train_test_split Source Docstring:\n\n\n    Parameters\n    ----------\n    *arrays : sequence of indexables with same length / shape[0]\n        Allowed inputs are lists, numpy arrays, scipy-sparse\n        matrices or pandas dataframes.\n    test_size : float or int, default=None\n        If float, should be between 0.0 and 1.0 and represent the proportion\n        of the dataset to include in the test split. If int, represents the\n        absolute number of test samples. If None, the value is set to the\n        complement of the train size. If ``train_size`` is also None, it will\n        be set to 0.25.\n    train_size : float or int, default=None\n        If float, should be between 0.0 and 1.0 and represent the\n        proportion of the dataset to include in the train split. If\n        int, represents the absolute number of train samples. If None,\n        the value is automatically set to the complement of the test size.\n    random_state : int, RandomState instan

In [4]:
from sklearn.metrics import mean_absolute_error
""""
Sklearn mean_absolute_error Source Docstring:



Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    multioutput : {'raw_values', 'uniform_average'}  or array-like of shape \
            (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
    Returns
    -------
    loss : float or ndarray of floats
        If multioutput is 'raw_values', then mean absolute error is returned
        for each output separately.
        If multioutput is 'uniform_average' or an ndarray of weights, then the
        weighted average of all output errors is returned.
        MAE output is non-negative floating point. The best value is 0.0.



"""

'"\nSklearn mean_absolute_error Source Docstring:\n\n\n\nParameters\n    ----------\n    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Ground truth (correct) target values.\n    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Estimated target values.\n    sample_weight : array-like of shape (n_samples,), default=None\n        Sample weights.\n    multioutput : {\'raw_values\', \'uniform_average\'}  or array-like of shape             (n_outputs,), default=\'uniform_average\'\n        Defines aggregating of multiple output values.\n        Array-like value defines weights used to average errors.\n        \'raw_values\' :\n            Returns a full set of errors in case of multioutput input.\n        \'uniform_average\' :\n            Errors of all outputs are averaged with uniform weight.\n    Returns\n    -------\n    loss : float or ndarray of floats\n        If multioutput is \'raw_values\', then mean absolute error is retur

In [5]:
from sklearn.metrics import mean_squared_error
"""
Sklearn mean_squared_error Source Docstring:

Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    multioutput : {'raw_values', 'uniform_average'} or array-like of shape \
            (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
    squared : bool, default=True
        If True returns MSE value, if False returns RMSE value.
    Returns
    -------
    loss : float or ndarray of floats
        A non-negative floating point value (the best value is 0.0), or an
        array of floating point values, one for each individual target



"""

"\nSklearn mean_squared_error Source Docstring:\n\nParameters\n    ----------\n    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Ground truth (correct) target values.\n    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Estimated target values.\n    sample_weight : array-like of shape (n_samples,), default=None\n        Sample weights.\n    multioutput : {'raw_values', 'uniform_average'} or array-like of shape             (n_outputs,), default='uniform_average'\n        Defines aggregating of multiple output values.\n        Array-like value defines weights used to average errors.\n        'raw_values' :\n            Returns a full set of errors in case of multioutput input.\n        'uniform_average' :\n            Errors of all outputs are averaged with uniform weight.\n    squared : bool, default=True\n        If True returns MSE value, if False returns RMSE value.\n    Returns\n    -------\n    loss : float or ndarray of f

In [6]:
from sklearn.metrics import r2_score
"""
Sklearn r2_score Source Docstring:

Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, \
            array-like of shape (n_outputs,) or None, default='uniform_average'
        Defines aggregating of multiple output scores.
        Array-like value defines weights used to average scores.
        Default is "uniform_average".
        'raw_values' :
            Returns a full set of scores in case of multioutput input.
        'uniform_average' :
            Scores of all outputs are averaged with uniform weight.
        'variance_weighted' :
            Scores of all outputs are averaged, weighted by the variances
            of each individual output.
        .. versionchanged:: 0.19
            Default value of multioutput is 'uniform_average'.
    force_finite : bool, default=True
        Flag indicating if ``NaN`` and ``-Inf`` scores resulting from constant
        data should be replaced with real numbers (``1.0`` if prediction is
        perfect, ``0.0`` otherwise). Default is ``True``, a convenient setting
        for hyperparameters' search procedures (e.g. grid search
        cross-validation).
        .. versionadded:: 1.1
    Returns
    -------
    z : float or ndarray of floats
        The :math:`R^2` score or ndarray of scores if 'multioutput' is
        'raw_values'.
    Notes
    -----
    This is not a symmetric function.
    Unlike most other scores, :math:`R^2` score may be negative (it need not
    actually be the square of a quantity R).
    This metric is not well-defined for single samples and will return a NaN
    value if n_samples is less than two.



"""

'\nSklearn r2_score Source Docstring:\n\nParameters\n    ----------\n    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Ground truth (correct) target values.\n    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n        Estimated target values.\n    sample_weight : array-like of shape (n_samples,), default=None\n        Sample weights.\n    multioutput : {\'raw_values\', \'uniform_average\', \'variance_weighted\'},             array-like of shape (n_outputs,) or None, default=\'uniform_average\'\n        Defines aggregating of multiple output scores.\n        Array-like value defines weights used to average scores.\n        Default is "uniform_average".\n        \'raw_values\' :\n            Returns a full set of scores in case of multioutput input.\n        \'uniform_average\' :\n            Scores of all outputs are averaged with uniform weight.\n        \'variance_weighted\' :\n            Scores of all outputs are averaged, weighted

## Load in and Inspect Analysis Data:

In [7]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_data_for_analysis.csv", low_memory = False)

print(sentencing_data_cleaned.head, sentencing_data_cleaned.shape, sentencing_data_cleaned.info)



<bound method NDFrame.head of        Unnamed: 0.1  Unnamed: 0       CASE_ID  CASE_PARTICIPANT_ID  \
0             57587      116398  429485886505         854062814867   
1             58879      119085  430780557292         858166118899   
2             62770      127700  435531599636         872618846575   
3             60794      123275  432903928993         864718665101   
4             60718      123109  432818606428         864445174414   
...             ...         ...           ...                  ...   
65852         43250       87511  417323529438         814831906416   
65853         42545       85980  416715404242         812927097571   
65854         52248      105567  424442309656         837900156976   
65855         52514      106082  424655395784         838575767684   
65856         42211       85308  416479702494         812173252295   

                RECEIVED_DATE                           OFFENSE_CATEGORY  \
0        3/7/2018 12:00:00 AM                        

## Assigning X and Y variables for the Decision Tree Model:

In [8]:
## Starting By Dropping Nas: 

no_nas = sentencing_data_cleaned.dropna(subset = ["age_derived", "is_guilty_plea", "nth_case", "is_female_derived", "is_innocent_plea", "is_male_derived", "is_black_derived", "is_white_derived", 
                                                  "is_hisp_derived", "is_other_derived"])

## X and Y columns (variables) selected from Dataframe:

x_cols = no_nas[["is_guilty_plea", "nth_case", "is_female_derived", "is_innocent_plea", "is_male_derived", "is_black_derived", "is_white_derived", 
                                                  "is_hisp_derived", "is_other_derived", "age_derived"]]
y_col = no_nas["sentence_length_zscore"]



## Checking columns are correct:

print(x_cols.head, y_col.head)

<bound method NDFrame.head of        is_guilty_plea  nth_case  is_female_derived  is_innocent_plea  \
0                   1         1                  0                 0   
1                   1         2                  0                 0   
2                   1         3                  0                 0   
4                   1         5                  0                 0   
5                   1         6                  0                 0   
...               ...       ...                ...               ...   
65852               1        18                  0                 0   
65853               1        19                  0                 0   
65854               1        20                  0                 0   
65855               1        21                  0                 0   
65856               1        22                  1                 0   

       is_male_derived  is_black_derived  is_white_derived  is_hisp_derived  \
0                    1    

## Splitting Data into Train Test Split

In [9]:
## Splitting Data into Train Test Split for Model 
## Note: Using basic split- 80% train, 20% test (pareto principle)
## Random state set to same number as class (it doesn't really matter as long as its consistent across the model)

X_train, X_test, y_train, y_test = train_test_split(x_cols, y_col, test_size = 0.2, random_state = 10)

## checking shape to make sure this split worked 

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(50612, 10) (12654, 10)
(50612,) (12654,)


## Creating Decision Tree model using DecisionTreeClassifier()

In [10]:
## initializing dt_classifier
## Keeping random state constant with train test data, and max_depth same as class (10) 

dt_regressor = DecisionTreeRegressor(random_state = 10, max_depth = 10)

## Fitting dt_regressor on training data

In [11]:
## Fitting the Decision Tree Model to the training data

model = dt_regressor.fit(X_train, y_train)

## Testing Model on Test Set

In [12]:
## Testing Model: 

predictions = model.predict(X_test)

## Checking Model Results

In [16]:
## Mean Squared Error: 
mse = mean_squared_error(y_test, predictions)

## Mean Absolute Error:
mae = mean_absolute_error(y_test, predictions)

## R Squared: 

r2 = r2_score(y_test, predictions)

## Checking Performance Scores:
print("Decision Tree Model:")
print("MSE: ", mse)
print("MAE: ", mae)
print("R-Squared: ", r2)

## Checking Feature Importances: 

importances = model.feature_importances_

importance_table = pd.DataFrame({"Feature": x_cols.columns, "Importance": importances})

print(importance_table)


Decision Tree Model:
MSE:  0.3112457637972652
MAE:  0.42646236420670114
R-Squared:  0.0034546387339225992
             Feature  Importance
0     is_guilty_plea    0.093501
1           nth_case    0.386827
2  is_female_derived    0.009376
3   is_innocent_plea    0.000000
4    is_male_derived    0.049550
5   is_black_derived    0.034641
6   is_white_derived    0.024113
7    is_hisp_derived    0.007185
8   is_other_derived    0.009068
9        age_derived    0.385740


## Turning Error Results into Exportable Table:

In [14]:
## Making table to be exported: 
## Must be lists for the index, otherwise doesn't convert to df

table_df = pd.DataFrame({"Mean Squared Error": [mse], "Mean Absolute Error": [mae], "R-Squared": [r2]})

## Printing table to make sure it worked: 

print(table_df)

   Mean Squared Error  Mean Absolute Error  R-Squared
0            0.311246             0.426462   0.003455


## Exporting DecisionTreeRegression tables 

In [15]:
## Exporting Error Table

table_df.to_csv(r'../Output/Tables/DT_standard_error_table.csv')

## Exporting Feature Importance Table


importance_table.to_csv(r'../Output/Tables/DT_feature_importance_table.csv')


