# <b> Multi Logit Model 1 <b>

### <b> Multinomial Logistic Regression Model using Age, Race, Sex, and Offense Types as Features <b>

Load Necessary Libraries

In [251]:
# importing libraries
import pandas as pd
import numpy as np
import scipy as scp
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix

Read Dataset

In [252]:
# reading dataset using pandas
data = pd.read_csv(r'iowa_rec_clean.csv')
# print data
data

Unnamed: 0,Fiscal _Year_Released,Recidivism_Reporting_Year,Age_At_Release,Days_To_Return,Release_Type,Race_Ethnicity,Sex,Offense_Classification,Offense_Type,Offense_Subtype,New_Offense_Classification,New_Offense_Type,New_Offense_Subtype,Return_To_Prison,Recidivism_Type
0,2010.0,2013.0,2.0,433.0,4.0,7.0,2.0,3.0,4.0,16.0,4.0,2.0,23.0,1.0,0.0
1,2010.0,2013.0,2.0,453.0,2.0,11.0,2.0,4.0,2.0,21.0,0.0,0.0,0.0,1.0,2.0
2,2010.0,2013.0,3.0,832.0,4.0,11.0,2.0,2.0,0.0,23.0,0.0,0.0,0.0,1.0,2.0
3,2010.0,2013.0,2.0,0.0,4.0,11.0,2.0,2.0,1.0,11.0,0.0,0.0,0.0,0.0,1.0
4,2010.0,2013.0,3.0,116.0,2.0,7.0,2.0,4.0,4.0,3.0,0.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26015,2015.0,2018.0,1.0,0.0,6.0,10.0,2.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0
26016,2015.0,2018.0,3.0,0.0,11.0,11.0,2.0,3.0,4.0,17.0,0.0,0.0,0.0,0.0,1.0
26017,2015.0,2018.0,2.0,0.0,5.0,11.0,1.0,1.0,3.0,22.0,0.0,0.0,0.0,0.0,1.0
26018,2015.0,2018.0,2.0,0.0,10.0,11.0,2.0,4.0,2.0,21.0,0.0,0.0,0.0,0.0,1.0


In [253]:
# remove unnecessary column
data.drop(['Recidivism_Reporting_Year'], axis=1, inplace=True)

In [254]:
# count of Age_At_Release contents
data_age = data['Age_At_Release'].value_counts()
print(data_age)

2.0    9554
3.0    6223
1.0    4590
4.0    4347
5.0    1303
0.0       3
Name: Age_At_Release, dtype: int64


In [255]:
# count of Race_Ethnicity contents
data_race = data['Race_Ethnicity'].value_counts()
print(data_race)

11.0    17584
7.0      6109
10.0     1522
2.0       502
4.0       192
6.0        37
0.0        30
1.0        20
9.0        12
3.0         5
8.0         5
5.0         2
Name: Race_Ethnicity, dtype: int64


In [256]:
# count of Sex contents
data_sex = data['Sex'].value_counts()
print(data_sex)

2.0    22678
1.0     3339
0.0        3
Name: Sex, dtype: int64


Feature Selection

In [257]:
x1 = data[['Age_At_Release', 'Race_Ethnicity', 'Sex', 'Release_Type', 'Offense_Classification',
          'Offense_Type', 'Offense_Subtype', 'New_Offense_Classification', 'New_Offense_Type', 'New_Offense_Subtype']].astype(float)
y1 = data[['Return_To_Prison']].astype(float)

In [258]:
print(list(x1.columns.values))

['Age_At_Release', 'Race_Ethnicity', 'Sex', 'Release_Type', 'Offense_Classification', 'Offense_Type', 'Offense_Subtype', 'New_Offense_Classification', 'New_Offense_Type', 'New_Offense_Subtype']


In [259]:
print(list(y1.columns.values))

['Return_To_Prison']


Split Dataset

In [260]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.20, random_state=5)

In [261]:
print(x_train1.shape)
print(x_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(20816, 10)
(5204, 10)
(20816, 1)
(5204, 1)


In [262]:
y_train1 = y_train1.values.ravel()

Predict Classification

In [263]:
model1 = LogisticRegression(random_state=5, multi_class='multinomial', penalty='none', solver='newton-cg').fit(x_train1, y_train1)

In [264]:
pred1 = model1.predict(x_test1)

In [265]:
param = model1.get_params()
print(param)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 5, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


Print Model Parameters

In [266]:
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [-1.46584938]
Coefficients: 
 [[-1.03004394e-01  2.55002012e-03  5.66714467e-02  1.29768427e-01
   2.82184956e-02 -8.09720860e-02  8.89092565e-04  8.30445569e-01
   1.15441391e+00  1.00906429e+00]]


Calculate Odds Ratio Estimates

In [267]:
np.exp(model1.coef_)

array([[0.90212301, 1.00255327, 1.05830804, 1.13856469, 1.02862041,
        0.92221943, 1.00088949, 2.2943408 , 3.17216369, 2.74303314]])

Statsmodels to Assess Variables

In [268]:
np.asarray(data)

array([[2.010e+03, 2.000e+00, 4.330e+02, ..., 2.300e+01, 1.000e+00,
        0.000e+00],
       [2.010e+03, 2.000e+00, 4.530e+02, ..., 0.000e+00, 1.000e+00,
        2.000e+00],
       [2.010e+03, 3.000e+00, 8.320e+02, ..., 0.000e+00, 1.000e+00,
        2.000e+00],
       ...,
       [2.015e+03, 2.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.015e+03, 2.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.015e+03, 3.000e+00, 3.690e+02, ..., 1.000e+01, 1.000e+00,
        2.000e+00]])

In [269]:
logit_model1 = sm.MNLogit(y_train1, sm.add_constant(x_train1))

In [270]:
result = logit_model1.fit()
stats1 = result.summary()
stats2 = result.summary2()
print(stats1)
print(stats2)

         Current function value: 0.225548
         Iterations: 35
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                20816
Model:                        MNLogit   Df Residuals:                    20805
Method:                           MLE   Df Model:                           10
Date:                Sun, 07 May 2023   Pseudo R-squ.:                  0.6459
Time:                        13:16:30   Log-Likelihood:                -4695.0
converged:                      False   LL-Null:                       -13258.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                       y=1       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const                         -2.9317      0.234    -12.531      0.000      -3.390      -2.473
Age_At_Release   



## Testing

Accuracy

In [271]:
print('Accuracy Score:', metrics.accuracy_score(y_test1, pred1))

Accuracy Score: 0.9265949269792467


In [272]:
class_report = classification_report(y_test1, pred1)
print(class_report)

              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      3473
         1.0       1.00      0.78      0.88      1731

    accuracy                           0.93      5204
   macro avg       0.95      0.89      0.91      5204
weighted avg       0.93      0.93      0.92      5204



Overfitting vs Underfitting

In [273]:
print("Training set score: {:.4f}".format(model1.score(x_train1, y_train1)))

Training set score: 0.9247


In [274]:
print("Test set score: {:.4f}".format(model1.score(x_test1, y_test1)))

Test set score: 0.9266


Metrics Analysis

In [275]:
from sklearn.metrics import confusion_matrix
y_pred1 = model1.predict(x_test1)

In [276]:
cm1 = confusion_matrix(y_test1, y_pred1)
print(cm1)

[[3470    3]
 [ 379 1352]]


# <b> Multi Logit Model 2 <b>

Feature Selection

In [277]:
x2 = data[['Age_At_Release', 'Sex', 'Offense_Classification', 
          'Offense_Type', 'Offense_Subtype', 'New_Offense_Classification', 
          'New_Offense_Type', 'New_Offense_Subtype']].astype(float)
y2 = data[['Return_To_Prison']].astype(float)

In [278]:
print(list(x2.columns.values))

['Age_At_Release', 'Sex', 'Offense_Classification', 'Offense_Type', 'Offense_Subtype', 'New_Offense_Classification', 'New_Offense_Type', 'New_Offense_Subtype']


In [279]:
print(list(y2.columns.values))

['Return_To_Prison']


Splitting Dataset

In [280]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.20, random_state=5)

In [281]:
print(x_train2.shape)
print(x_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(20816, 8)
(5204, 8)
(20816, 1)
(5204, 1)


In [282]:
y_train2 = y_train2.values.ravel()

Predict Classification

In [283]:
model2 = LogisticRegression(random_state=5, multi_class='multinomial', penalty='none', solver='newton-cg').fit(x_train2, y_train2)

In [284]:
pred2 = model2.predict(x_test2)

In [285]:
param2 = model2.get_params()
print(param2)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 5, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


Print Model Parameters

In [286]:
print('Intercept: \n', model2.intercept_)
print('Coefficients: \n', model2.coef_)

Intercept: 
 [-1.0617452]
Coefficients: 
 [[-0.09757709  0.0564979   0.02898776 -0.04929458  0.0074806   0.80934889
   1.13547687  1.01481757]]


Calculate Odds Ratio Estimates

In [287]:
np.exp(model2.coef_)

array([[0.90703241, 1.05812439, 1.029412  , 0.95190068, 1.00750865,
        2.24644483, 3.11265751, 2.75886005]])

Statsmodels to Assess Variables

In [288]:
np.asarray(data)

array([[2.010e+03, 2.000e+00, 4.330e+02, ..., 2.300e+01, 1.000e+00,
        0.000e+00],
       [2.010e+03, 2.000e+00, 4.530e+02, ..., 0.000e+00, 1.000e+00,
        2.000e+00],
       [2.010e+03, 3.000e+00, 8.320e+02, ..., 0.000e+00, 1.000e+00,
        2.000e+00],
       ...,
       [2.015e+03, 2.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.015e+03, 2.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.015e+03, 3.000e+00, 3.690e+02, ..., 1.000e+01, 1.000e+00,
        2.000e+00]])

In [289]:
logit_model2 = sm.MNLogit(y_train2, sm.add_constant(x_train2))

In [290]:
result2 = logit_model2.fit()
stats1 = result2.summary()
stats2 = result2.summary2()
print(stats1)
print(stats2)

         Current function value: 0.240504
         Iterations: 35
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                20816
Model:                        MNLogit   Df Residuals:                    20807
Method:                           MLE   Df Model:                            8
Date:                Sun, 07 May 2023   Pseudo R-squ.:                  0.6224
Time:                        13:16:34   Log-Likelihood:                -5006.3
converged:                      False   LL-Null:                       -13258.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                       y=1       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const                         -2.1235      0.190    -11.190      0.000      -2.495      -1.752
Age_At_Release   



## Testing

Overfitting vs Underfitting

In [291]:
print("Training set score: {:.4f}".format(model2.score(x_train2, y_train2)))

Training set score: 0.9244


In [292]:
print("Test set score: {:.4f}".format(model2.score(x_test2, y_test2)))

Test set score: 0.9252


Metrics Analysis

In [293]:
from sklearn.metrics import classification_report
y_pred2 = model2.predict(x_test2)

In [294]:
report2 = classification_report(y_test2, y_pred2)
print(report2)

              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      3473
         1.0       1.00      0.78      0.87      1731

    accuracy                           0.93      5204
   macro avg       0.95      0.89      0.91      5204
weighted avg       0.93      0.93      0.92      5204

