## Load Packages: 

In [112]:
import pandas as pd
import numpy as np
import random
import re
import os
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col


## Load in and Inspect Regression Data:

In [93]:
sentencing_data_cleaned = pd.read_csv("../Data/sentencing_data_for_regression.csv", low_memory = False)

print(sentencing_data_cleaned.head, sentencing_data_cleaned.shape, sentencing_data_cleaned.info)


## Fixing import issue with converting dates to datetime 

sentencing_data_cleaned["sentenceymd_derived_dt"] = pd.to_datetime(sentencing_data_cleaned["sentenceymd_derived_dt"])

## checking to make sure this conversion worked: 

print(type(sentencing_data_cleaned["sentenceymd_derived_dt"][1]))

<bound method NDFrame.head of        Unnamed: 0.1  Unnamed: 0       CASE_ID  CASE_PARTICIPANT_ID  \
0             57587      116398  429485886505         854062814867   
1             58879      119085  430780557292         858166118899   
2             62770      127700  435531599636         872618846575   
3             60794      123275  432903928993         864718665101   
4             60718      123109  432818606428         864445174414   
...             ...         ...           ...                  ...   
65852         43250       87511  417323529438         814831906416   
65853         42545       85980  416715404242         812927097571   
65854         52248      105567  424442309656         837900156976   
65855         52514      106082  424655395784         838575767684   
65856         42211       85308  416479702494         812173252295   

                RECEIVED_DATE                           OFFENSE_CATEGORY  \
0        3/7/2018 12:00:00 AM                        

## Computing Linear Regression of Continuous Variables

In [107]:
## Using variables created previously vs Sentence Severity

## Dropping nas in the rows of the variables we will model 

age_df = sentencing_data_cleaned.dropna(subset = ["age_derived", "is_guilty_plea", "nth_case", "is_female_derived", "is_innocent_plea",
                                                 "is_male_derived", "is_black_derived", "is_white_derived", 
                                                  "is_hisp_derived", "is_other_derived"])


## Creating the model 
x = age_df[["age_derived", "is_guilty_plea", "nth_case", "is_female_derived", "is_innocent_plea", "is_male_derived", "is_black_derived", "is_white_derived", 
                                                  "is_hisp_derived", "is_other_derived"]]

y = age_df["sentence_length_zscore"]

## creating a constant 

x = sm.add_constant(x)

model = sm.OLS(y, x).fit()

## checking model results 

print(model.summary())


                              OLS Regression Results                              
Dep. Variable:     sentence_length_zscore   R-squared:                       0.021
Model:                                OLS   Adj. R-squared:                  0.021
Method:                     Least Squares   F-statistic:                     173.7
Date:                    Sat, 11 Mar 2023   Prob (F-statistic):          1.58e-291
Time:                            20:30:18   Log-Likelihood:                -53225.
No. Observations:                   63266   AIC:                         1.065e+05
Df Residuals:                       63257   BIC:                         1.065e+05
Df Model:                               8                                         
Covariance Type:                nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

## Combining Model Results as a Table:

In [114]:
table = summary_col(model, stars = True, model_names = ["Sentence Severity"])

print(table)


                  Sentence Severity
-----------------------------------
const             -0.1326***       
                  (0.0324)         
age_derived       0.0055***        
                  (0.0002)         
is_guilty_plea    -0.2956***       
                  (0.0180)         
nth_case          -0.0000***       
                  (0.0000)         
is_female_derived 0.0535           
                  (0.0552)         
is_innocent_plea  0.1629***        
                  (0.0219)         
is_male_derived   0.1567***        
                  (0.0546)         
is_black_derived  -0.0092          
                  (0.0107)         
is_white_derived  -0.0393***       
                  (0.0117)         
is_hisp_derived   -0.0467***       
                  (0.0115)         
is_other_derived  -0.0375          
                  (0.0256)         
R-squared         0.0215           
R-squared Adj.    0.0214           
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Exporting Regression table 

In [120]:
table_df = table.tables[0]

table_df.to_csv(r'../Output/Tables/OLS_table.csv')