In [13]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
#load the cleaned dataset, using the 'Unnamed: 0' column of the CSV as the row index
data = pd.read_csv(
    'data/clean_rawdata.csv',
    index_col = ['Unnamed: 0'],
)
#preview the first rows as a quick sanity check of the load
data.head()

Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,DateOfBirth,ScaleSet_ID,ScaleSet,...,MaritalStatus_Single,MaritalStatus_Unknown,MaritalStatus_Widowed,RecSupervisionLevelText_High,RecSupervisionLevelText_Low,RecSupervisionLevelText_Medium,RecSupervisionLevelText_Medium with Override Consideration,DisplayText_Risk of Failure to Appear,DisplayText_Risk of Recidivism,DisplayText_Risk of Violence
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,...,1,0,0,0,1,0,0,0,0,1
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,...,1,0,0,0,1,0,0,0,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,...,1,0,0,0,1,0,0,1,0,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,1984-09-16,22,Risk and Prescreen,...,0,0,0,0,1,0,0,0,0,1
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,1984-09-16,22,Risk and Prescreen,...,0,0,0,0,1,0,0,0,1,0


In [4]:
#positional index of the 'AgeAtArrest' column
#NOTE(review): presumably this is where the start of the predictor range used by getXY comes from - confirm
list(data.columns).index('AgeAtArrest')

20

In [5]:
#subset data by score type
#each subset keeps the rows whose matching DisplayText_* indicator column equals 1
fta = data.loc[data['DisplayText_Risk of Failure to Appear'].eq(1)]
recid = data.loc[data['DisplayText_Risk of Recidivism'].eq(1)]
violence = data.loc[data['DisplayText_Risk of Violence'].eq(1)]

In [6]:
#train test split data
#every score-type subset gets the same 80/20 split with the same fixed seed
(trainFTA, testFTA), (trainRecid, testRecid), (trainViolence, testViolence) = [
    train_test_split(subset, test_size = 0.20, random_state = 221)
    for subset in (fta, recid, violence)
]

In [16]:
#predictor columns
#positional column range [20, 57) of the frame, selected with .iloc inside getXY
idxPred = range(20, 57)

def getXY(df):
    """Split a frame into predictors and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a 'DecileScore' column and at least 57 columns, so the
        positional range in ``idxPred`` is valid.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` holds the columns at the positions in
        ``idxPred`` and ``Y`` is the 'DecileScore' column.
    """
    return df.iloc[:, idxPred], df['DecileScore']

def getModel(df):
    """Fit a support vector classifier on one training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; predictors and target are extracted with ``getXY``.

    Returns
    -------
    sklearn.svm.SVC
        Classifier created with ``random_state = 221`` (all other settings at
        their defaults) and fitted on ``df``.
    """
    features, target = getXY(df)
    #fit() returns the fitted estimator itself, so it can be returned directly
    return SVC(random_state = 221).fit(features, target)

In [17]:
#get models
#one classifier per score type, each fitted on its own training split
mFTA, mRecid, mViolence = map(getModel, (trainFTA, trainRecid, trainViolence))

In [18]:
#get prediction accuracy scores
#NOTE: the predictions are computed in this cell because the cell that assigns
#predFTA / predRecid / predViolence sits further down the notebook; without these
#lines a fresh Restart & Run All fails here with NameError on predFTA.
predFTA = mFTA.predict(getXY(testFTA)[0])
predRecid = mRecid.predict(getXY(testRecid)[0])
predViolence = mViolence.predict(getXY(testViolence)[0])

#accuracy = share of test rows whose predicted DecileScore equals the true one
ascoreFTA = accuracy_score(getXY(testFTA)[1], predFTA)
ascoreRecid = accuracy_score(getXY(testRecid)[1], predRecid)
ascoreViolence = accuracy_score(getXY(testViolence)[1], predViolence)

print('Test Prediction Accuracy Score: ')
print('Risk of Failure to Appear: ', ascoreFTA)
print('Risk of Recidivism: ', ascoreRecid)
print('Risk of Violence: ', ascoreViolence)

Test Prediction Accuracy Score: 
Risk of Failure to Appear:  0.38723194478678824
Risk of Recidivism:  0.44529513460113607
Risk of Violence:  0.556596794081381


In [19]:
#get predictions
#each model predicts on the predictor columns of its own held-out split
predFTA, predRecid, predViolence = [
    model.predict(getXY(heldOut)[0])
    for model, heldOut in zip(
        (mFTA, mRecid, mViolence),
        (testFTA, testRecid, testViolence),
    )
]

In [20]:
#get prediction scores
#weighted F1: per-class F1 averaged with weights proportional to class support
scoreFTA, scoreRecid, scoreViolence = [
    f1_score(getXY(heldOut)[1], predicted, average = 'weighted')
    for heldOut, predicted in (
        (testFTA, predFTA),
        (testRecid, predRecid),
        (testViolence, predViolence),
    )
]

print('Test Prediction F1 score: ')
print('Risk of Failure to Appear: ', scoreFTA)
print('Risk of Recidivism: ', scoreRecid)
print('Risk of Violence: ', scoreViolence)

Test Prediction F1 score: 
Risk of Failure to Appear:  0.2740738573976841
Risk of Recidivism:  0.3677502005157709
Risk of Violence:  0.5273706870650389
