In [73]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.feature_extraction import text
from sklearn.preprocessing import OneHotEncoder
from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr  
from collections import OrderedDict

# Do not display warnings
warnings.filterwarnings("ignore")

# Display whole text in columns
pd.set_option('display.max_colwidth', None)

In [74]:
data = pd.read_csv(r"C:\Users\jfox\Desktop\My Python Projects\Chrono Analysis\ClosuresData.csv", encoding='cp1252')

# Preserve a copy of df1 with the original column values for testing later
test_data = data.copy()

In [75]:
data = data[['Last10Chronos','CloseType','Gender','Race','Risk','Age','#ArrestsPastYear','ProbationType','DaysOnProbation','EpicsDosagePastYear']].loc[((data['CloseType'] == 'Successful') | (data['CloseType'] == 'Unsuccessful')) & (pd.notnull(data['Last10Chronos']))].rename(columns={'CloseType':'Outcome','Last10Chronos':'Chronos'})

# Fill in missing values. This is taken care of in the SQL source, but including here as well just in case.
data['Gender'].loc[pd.isnull(data['Gender'])] = 'Other/Unknown'
data['Race'].loc[pd.isnull(data['Race'])] = 'Other/Unknown'
data['Risk'].loc[pd.isnull(data['Risk'])] = 'Not Assessed'
#data['TopCriminogenicNeed'].loc[pd.isnull(data['TopCriminogenicNeed'])] = 'Not Assessed'
data['#ArrestsPastYear'].loc[pd.isnull(data['#ArrestsPastYear'])] = 0
data['ProbationType'].loc[pd.isnull(data['ProbationType'])] = 'Misdemeanor Probation'
data['DaysOnProbation'].loc[pd.isnull(data['DaysOnProbation'])] = 0
data['EpicsDosagePastYear'].loc[pd.isnull(data['EpicsDosagePastYear'])] = 0
data['EpicsDosagePastYear'] = data['EpicsDosagePastYear'].astype('int64')

# Create Age groups
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')
data['Age'].fillna(value=data['Age'].mean(), inplace=True)
data['AgeGroup'] = round(data.Age//10*10)
data = data.drop('Age',axis=1)

In [76]:
def col_clean(data, col):

    #stop = stopwords.words('english')
    stop = text.ENGLISH_STOP_WORDS
    
    #lemmatizer = WordNetLemmatizer()

    data[col] = data[col].str.lower()
    
    data[col] = data[col].str.replace(r'\b[uU]\b', 'you', regex=True)
    data[col] = data[col].str.replace(r"what's", "what is ", regex=True)
    data[col] = data[col].str.replace(r"can't", "cannot ", regex=True)
    data[col] = data[col].str.replace(r"i'm", "i am ", regex=True)
    data[col] = data[col].str.replace(r"\'ll", " will ", regex=True)
    data[col] = data[col].str.replace(r"\'scuse", " excuse ", regex=True)
    data[col] = data[col].str.replace("[:|♣|'|§|♠|*|/|?|=|%|&|-|#|•|~|^|>|<|►|_]", ' ', regex=True)
    data[col] = data[col].str.replace(r" w/", " with ", regex=True)
    data[col] = data[col].str.replace(r"\'s", " ", regex=True)
    data[col] = data[col].str.replace(" i/c ", " in custody ", regex=True)
    data[col] = data[col].str.replace(" ct ", " court ", regex=True)
    data[col] = data[col].str.replace(" cort ", " court ", regex=True)
    data[col] = data[col].str.replace(" crt ", " court ", regex=True)
    data[col] = data[col].str.replace("ov ", "office visit ", regex=True)
    data[col] = data[col].str.replace("o/v ", "office visit ", regex=True)
    data[col] = data[col].str.replace("ovwd", "office visit with defendant ", regex=True)
    data[col] = data[col].str.replace("hv ", "home visit ", regex=True)
    data[col] = data[col].str.replace("h/v ", "home visit ", regex=True)
    data[col] = data[col].str.replace("h v ", " home visit ", regex=True)
    data[col] = data[col].str.replace(" jl ", " jail ", regex=True)
    data[col] = data[col].str.replace(" ltr ", " letter ", regex=True)
    data[col] = data[col].str.replace(" vm ", " voicemail ", regex=True)
    data[col] = data[col].str.replace(" v/m ", " voicemail ", regex=True)
    data[col] = data[col].str.replace(" msg ", " message ", regex=True)
    data[col] = data[col].str.replace("l/m ", " left message ", regex=True)
    data[col] = data[col].str.replace("lm ", " left message ", regex=True)
    data[col] = data[col].str.replace("l m ", " left message ", regex=True)
    data[col] = data[col].str.replace("vm ", " voicemail ", regex=True)
    data[col] = data[col].str.replace("v m ", " voicemail ", regex=True)
    data[col] = data[col].str.replace(" bw ", " bench warrant ", regex=True)
    data[col] = data[col].str.replace(" b/w ", " bench warrant ", regex=True)
    data[col] = data[col].str.replace(" vop ", " violation of probation ", regex=True)
    data[col] = data[col].str.replace(" returbnd ", " returned ", regex=True)
    data[col] = data[col].str.replace(" comp ", " completed ", regex=True)
    data[col] = data[col].str.replace(" att ", " attempted ", regex=True)
    data[col] = data[col].str.replace("att ", "attempted ", regex=True)
    data[col] = data[col].str.replace("a/v ", " attempted visit ", regex=True)
    data[col] = data[col].str.replace("recvd ", " received ", regex=True)
    data[col] = data[col].str.replace("recv'd ", " received ", regex=True)
    data[col] = data[col].str.replace("rec'd ", " received ", regex=True)
    data[col] = data[col].str.replace("recv ", " received ", regex=True)
    data[col] = data[col].str.replace("recvd ", " received ", regex=True)
    data[col] = data[col].str.replace("t/c ", " call ", regex=True)
    data[col] = data[col].str.replace("t c ", " call ", regex=True)
    data[col] = data[col].str.replace("tc ", " call ", regex=True)
    data[col] = data[col].str.replace("o/c ", " outgoing call ", regex=True)
    data[col] = data[col].str.replace(" thru ", " through ", regex=True)
    data[col] = data[col].str.replace(" rpt ", " report ", regex=True)
    data[col] = data[col].str.replace(" rptd ", " reported ", regex=True)
    data[col] = data[col].str.replace(" wdef ", " with defendant ", regex=True)
    data[col] = data[col].str.replace(" d ", " defendant ", regex=True)
    data[col] = data[col].str.replace("def ", " defendant ", regex=True)
    data[col] = data[col].str.replace("def. ", " defendant ", regex=True)
    data[col] = data[col].str.replace(" def ", " defendant ", regex=True)
    data[col] = data[col].str.replace("deft ", " defendant ", regex=True)
    data[col] = data[col].str.replace(" deft's ", " defendant ", regex=True)
    data[col] = data[col].str.replace(" d's ", " defendant ", regex=True)
    data[col] = data[col].str.replace(" presemt ", " present ", regex=True)
    data[col] = data[col].str.replace(" yr ", " year ", regex=True)


    # Remove phrases that imply obvious failure (these won't be in actual data for predictions)
    data[col] = data[col].str.replace("death", "", regex=True)
    data[col] = data[col].str.replace("passed away", "", regex=True)
    data[col] = data[col].str.replace("deceased", "", regex=True)
    data[col] = data[col].str.replace("closed", "", regex=True)
    data[col] = data[col].str.replace("file close", "", regex=True)
    data[col] = data[col].str.replace("terminated", "", regex=True)
    data[col] = data[col].str.replace("convicted", "", regex=True)
    data[col] = data[col].str.replace("conviction", "", regex=True)
    data[col] = data[col].str.replace("sentenced", "", regex=True)
    data[col] = data[col].str.replace("prison sentence", "", regex=True)
    data[col] = data[col].str.replace("jail sentence", "", regex=True)


    # Remove numbers.
    #data[col] = data[col].str.replace(r'\d', ' ', regex=True)

    # Remove more than one whitespace character
    data[col] = data[col].str.replace('\s+', ' ', regex=True)
    
    # Any string of 3 or more characters are replaced by just 2.
    data[col] = data[col].str.replace(r'(.)\1+', r'\1\1', regex=True)
    
    # Clean some punctutations
    data[col] = data[col].str.replace('\n', ' \n ', regex=True)
    data[col] = data[col].str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3', regex=True)
    # Punctuations left single s's 
    data[col] = data[col].str.replace(" s ", " ", regex=True)
    
    # Add space around repeating characters
    data[col] = data[col].str.replace(r'([*!?\']+)',r' \1 ', regex=True)
    
    # Patterns with repeating characters 
    data[col] = data[col].str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1', regex=True)
    data[col] = data[col].str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1', regex=True)
    data[col] = data[col].str.replace(r'[ ]{2,}',' ', regex=True).str.strip()   
    data[col] = data[col].str.replace(r'[ ]{2,}',' ', regex=True).str.strip()   
    
    # Lemmatize text
    #data[col] = data[col].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split(' ')]))
    
    # Remove stop words
    data[col] = data[col].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    
    


col_clean(data,'Chronos')

In [77]:
def evenly_distribute(df):
    df = df.sample(frac=1)
    df1 = df[df.apply(lambda x: x['Outcome'] == 'Successful', axis=1)]
    df2 = df[df.apply(lambda x: x['Outcome'] == 'Unsuccessful', axis=1)]
    if len(df1)>=len(df2):
        shrunk = df1[:len(df2)]
        return(df2.append(shrunk).sample(frac=1))
    else: 
        shrunk = df2[:len(df1)]
        return(df1.append(shrunk).sample(frac=1))


In [78]:
# One-hot encoder

object_cols = [c for c in data.columns if (data[c].dtype == object) and (c != 'Chronos') and (c != 'Outcome')]

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(data[object_cols]), columns=OH_encoder.get_feature_names(object_cols))

# One-hot encoding removed index; put it back
OH_cols_train.index = data.index

# Remove categorical columns (will replace with one-hot encoding)
data = data.drop(object_cols, axis=1)

# Add one-hot encoded columns to data
data = pd.concat([data, OH_cols_train], axis=1)


In [79]:
# calculate the Pearson's correlation between variable and outcome, store in dictionary.

pcorrelations = {}
  
for i in [c for c in data.columns if ((data[c].dtype == 'float64') or (data[c].dtype == 'int64')) and (c != 'Chronos') and (c != 'Outcome')]:
    corrdata1 = data[i]
    corrdata2 = data.Outcome.map({'Successful':1,'Unsuccessful':0})
    # calculate Pearson's correlation
    corr, _ = pearsonr(corrdata1, corrdata2)
    pcorrelations[i] = corr

pcorrelations = dict(sorted(pcorrelations.items(), key=lambda item: item[1]))

pcorrelations

{'#ArrestsPastYear': -0.2545355999911566,
 'ProbationType_AB109': -0.2374814008420108,
 'Risk_High Violent': -0.16495722517381944,
 'Gender_Male': -0.10573796024217477,
 'Risk_High Property/Violent': -0.07318721165888835,
 'Race_Black': -0.04461391729909708,
 'Risk_Not Assessed': -0.015984875360306336,
 'Race_Other/Unknown': -0.002702597001862766,
 'Gender_Other/Unknown': 0.00611130938105452,
 'Race_Hispanic/Latin/Mexican': 0.018846860921251825,
 'Race_White': 0.02142721016187698,
 'Risk_High Drug': 0.02903651347592988,
 'EpicsDosagePastYear': 0.048203137011135044,
 'AgeGroup': 0.06426452692678342,
 'DaysOnProbation': 0.0742383288953944,
 'Risk_Moderate': 0.07884377321817809,
 'ProbationType_Misdemeanor Probation': 0.10499681974185288,
 'Gender_Female': 0.10551970684161845,
 'ProbationType_Felony Probation': 0.13400412830814742,
 'Risk_Low': 0.15909027473214865}

In [80]:
# Split the data
train, test = train_test_split(data, test_size=0.3, random_state=0)


# Evenly distribute the training data
train = evenly_distribute(train)


train_x = train.drop('Outcome', axis=1)
train_y = pd.DataFrame(train['Outcome'])

test_x = test.drop('Outcome', axis=1)
test_y = pd.DataFrame(test['Outcome'])


In [81]:
# Define Vectorizer
vectorizer = TfidfVectorizer(min_df= 2, max_df = 0.5, analyzer = 'word', ngram_range = (1,4), max_features = 15000)

# Vectorize text
train_vectors = vectorizer.fit_transform(train_x['Chronos'].values.astype('str'))
test_vectors = vectorizer.transform(test_x['Chronos'].values.astype('str'))

# Add back to dataframes
train_x = pd.concat([train_x,pd.DataFrame(train_vectors.toarray(), columns= vectorizer.get_feature_names()).set_index(train_x.index)], axis=1)
test_x = pd.concat([test_x,pd.DataFrame(test_vectors.toarray(), columns= vectorizer.get_feature_names()).set_index(test_x.index)], axis=1)


In [82]:
# Define and fit model(s).

model1 = RidgeClassifier()
#model2 = svm.SVC(kernel='linear')
#model3 = LogisticRegression()
#model4 = KNeighborsClassifier()
model5 = RandomForestClassifier(n_estimators=500)

model1.fit(train_x.drop('Chronos', axis=1),train_y)
#model2.fit(train_x.drop('Chronos', axis=1),train_y)
#model3.fit(train_x.drop('Chronos', axis=1),train_y)
#model4.fit(train_x.drop('Chronos', axis=1),train_y)
model5.fit(train_x.drop('Chronos', axis=1),train_y)


RandomForestClassifier(n_estimators=500)

In [83]:
# Fit and check accuracy of each model

predictions1 = model1.predict(test_x.drop('Chronos', axis=1))
#predictions2 = model2.predict(test_x.drop('Chronos', axis=1))
#predictions3 = model3.predict(test_x.drop('Chronos', axis=1))
#predictions4 = model4.predict(test_x.drop('Chronos', axis=1))
predictions5 = model5.predict(test_x.drop('Chronos', axis=1))

print('Accuracy of model 1 is: ',accuracy_score(test['Outcome'],predictions1,'\n'))
#print('Accuracy of model 2 is: ',accuracy_score(test['Outcome'],predictions2,'\n'))
#print('Accuracy of model 3 is: ',accuracy_score(test['Outcome'],predictions3,'\n'))
#print('Accuracy of model 4 is: ',accuracy_score(test['Outcome'],predictions4,'\n'))
print('Accuracy of model 5 is: ',accuracy_score(test['Outcome'],predictions5,'\n'))

Accuracy of model 1 is:  0.8133333333333334
Accuracy of model 5 is:  0.8272463768115942


In [71]:
# Choose a model

my_model = RandomForestClassifier(n_estimators=500)
test['Predictions'] = predictions5
test.to_csv('testpredictions.csv',index=False)

In [72]:
# Train chosen model on 100% of the data (evenly distributed) and save.

model_data = evenly_distribute(data)

train_vectors = vectorizer.fit_transform(model_data['Chronos'].values.astype('str'))

model_data = pd.concat([model_data.drop('Chronos', axis=1),pd.DataFrame(train_vectors.toarray(), columns= vectorizer.get_feature_names()).set_index(model_data.index)], axis=1)

my_model.fit(model_data.drop('Outcome', axis=1),model_data['Outcome'])

with open('Success_Model.pkl', 'wb') as f:#
    pickle.dump((vectorizer, OH_encoder, my_model), f)
