## Modeling Analysis Data To Look for Causality Between Our Descriptive Variables and Sentence Severity (OLS Model)

## Load Packages: 

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os

## For OLS Model: 

import statsmodels.api as sm

## Note the code below is not a function, it just resets the output so it shows all lines

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



## Functions

In [2]:
## summary_col() creates summary table(s) from multiple regressions 

from statsmodels.iolib.summary2 import summary_col

"""
Statsmodels.iolib.summary2 source docstring for summary_col: 


    Summarize multiple results instances horizontally (side-by-side).

    Parameters
    ----------
    results : list[Result]
        List of results instances to summarize.
    float_format : str, optional
        String formatting code to use for float results.
    model_names : list[str], optional
        List of model names. If not None, then the number of model names
        should equal the number of results instances in the results list.
    stars : bool, optional
        Whether to include significance stars.
    info_dict : dict or None, optional
        Dictionary of additional information to include in the table. The
        dictionary should be structured so that the keys are column names
        and the values are lists or arrays with the same number of elements
        as there are models being summarized.
    regressor_order : list[str] or None, optional
        List of regressor names in the order they should appear in the
        output table. If not provided, then regressors will be displayed
        in the order they appear in the model results summary tables.
    drop_omitted : bool, optional
        Whether to drop columns for omitted variables.
    title : str or None, optional
        Title to include at the top of the table.
    return_txt : bool, optional
        Whether to return the summary table as a plain-text string.

    Returns
    -------
    table : Summary
        A summary table instance.
    """



'\nStatsmodels.iolib.summary2 source docstring for summary_col: \n\n\n    Summarize multiple results instances horizontally (side-by-side).\n\n    Parameters\n    ----------\n    results : list[Result]\n        List of results instances to summarize.\n    float_format : str, optional\n        String formatting code to use for float results.\n    model_names : list[str], optional\n        List of model names. If not None, then the number of model names\n        should equal the number of results instances in the results list.\n    stars : bool, optional\n        Whether to include significance stars.\n    info_dict : dict or None, optional\n        Dictionary of additional information to include in the table. The\n        dictionary should be structured so that the keys are column names\n        and the values are lists or arrays with the same number of elements\n        as there are models being summarized.\n    regressor_order : list[str] or None, optional\n        List of regressor n

In [3]:
## ols() creates a basic Linear Model (OLS) for the x and y columns (variables) inputted 

def ols(x_col, y_col, df):
    
    if not isinstance(x_col, list) & isinstance(y_col, list):
        raise TypeError("X and Y variables must be passed as lists")
    

    ## get rid of nas that would interrupt the model 
    df_no_nas = df.dropna(subset = x_col + y_col)


    ## set x and y equal to correct columns in df 
    x = df_no_nas[x_col]
    y = df_no_nas[y_col]

    ## add a constant to model all independent variables at once
    x = sm.add_constant(x)

    ## fit and create model
    model = sm.OLS(y, x).fit()

    return model
    
    """
    Use ordinary least squares regression to estimate the linear relationship between independent (x) and dependent variables (y)
    

    Parameters:
    
    x_col: List
        List containing column names of independent variable(s)
    
    y_col: List
        List containing column name for dependent variable(s)
    
    df: pandas.DataFrame 
        DataFrame containing the dataset to be modeld 

    Outputs: statsmodels.regression.linear_model.RegressionResultsWrapper
        OLS Model 
    
   TypeError 
        If x_col and y_col are not passed as lists.
    
    """


## Load in and Inspect Analysis Data:

In [4]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_data_for_analysis.csv", low_memory = False)

print(sentencing_data_cleaned.head, sentencing_data_cleaned.shape, sentencing_data_cleaned.info)



<bound method NDFrame.head of        Unnamed: 0.1  Unnamed: 0       CASE_ID  CASE_PARTICIPANT_ID  \
0             57587      116398  429485886505         854062814867   
1             58879      119085  430780557292         858166118899   
2             62770      127700  435531599636         872618846575   
3             60794      123275  432903928993         864718665101   
4             60718      123109  432818606428         864445174414   
...             ...         ...           ...                  ...   
65852         43250       87511  417323529438         814831906416   
65853         42545       85980  416715404242         812927097571   
65854         52248      105567  424442309656         837900156976   
65855         52514      106082  424655395784         838575767684   
65856         42211       85308  416479702494         812173252295   

                RECEIVED_DATE                           OFFENSE_CATEGORY  \
0        3/7/2018 12:00:00 AM                        

## Computing Linear Regression of Continuous Variables

In [5]:
## Using Descriptive Variables We Identified vs Sentence Severity

## X and Y Variables:
## ols function must be passed lists

x_vars = ["age_derived", "is_guilty_plea", "nth_case", "is_female_derived", "is_innocent_plea", "is_male_derived", "is_black_derived", "is_white_derived", 
                                                  "is_hisp_derived", "is_other_derived"]
y_var = ["sentence_length_zscore"]

## Creating the model 

model = ols(x_vars, y_var, sentencing_data_cleaned)

## checking model results 

print(model.summary2())



                     Results: Ordinary least squares
Model:              OLS                    Adj. R-squared:     0.021      
Dependent Variable: sentence_length_zscore AIC:                106467.4132
Date:               2023-03-13 01:22       BIC:                106548.9091
No. Observations:   63266                  Log-Likelihood:     -53225.    
Df Model:           8                      F-statistic:        173.7      
Df Residuals:       63257                  Prob (F-statistic): 1.58e-291  
R-squared:          0.021                  Scale:              0.31501    
---------------------------------------------------------------------------
                       Coef.   Std.Err.     t      P>|t|    [0.025   0.975]
---------------------------------------------------------------------------
const                 -0.1326    0.0324   -4.0894  0.0000  -0.1962  -0.0691
age_derived            0.0055    0.0002   28.9912  0.0000   0.0051   0.0058
is_guilty_plea        -0.2956    0.0180  -

## Combining Model Results Into A Simple Table:

In [6]:
## Use summary_col() to create a simplified table that can be exported for paper

table = summary_col(model, stars = True, model_names = ["Sentence Severity"])

## Check table before exporting
print(table)



                  Sentence Severity
-----------------------------------
const             -0.1326***       
                  (0.0324)         
age_derived       0.0055***        
                  (0.0002)         
is_guilty_plea    -0.2956***       
                  (0.0180)         
nth_case          -0.0000***       
                  (0.0000)         
is_female_derived 0.0535           
                  (0.0552)         
is_innocent_plea  0.1629***        
                  (0.0219)         
is_male_derived   0.1567***        
                  (0.0546)         
is_black_derived  -0.0092          
                  (0.0107)         
is_white_derived  -0.0393***       
                  (0.0117)         
is_hisp_derived   -0.0467***       
                  (0.0115)         
is_other_derived  -0.0375          
                  (0.0256)         
R-squared         0.0215           
R-squared Adj.    0.0214           
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Exporting Regression table 

In [7]:
## Exporting Table
table_df = table.tables[0]

table_df.to_csv(r'../Output/Tables/OLS_table.csv')
