In [725]:
import numpy as np
import pandas as pd
import matplotlib as plt
import time
# Notebook display limits: render up to 500 columns and never truncate rows.
pd.options.display.max_columns = 500
pd.options.display.max_rows = None

In [714]:
# Columns handed to read_csv(parse_dates=...) when the yearly files are loaded.
date_cols = [
    'DOB',
    'dateReceivedOriginal',
    'dateReceivedCurrent',
    'latestReleaseDate',
    'paroleHearingDate',
    'maxExpirationDateParole',
    'postReleaseMaxExpiration',
    'paroleBoardDischargeDate',
]

# Date-like columns that are NOT parsed on load. conditionalReleaseDate, for one,
# also carries the markers 'NONE' and 'LIFE'; presumably the others hold similar
# non-date values -- TODO confirm against the raw files.
# (Fixed: the first entry was misspelled 'earliestRelaseDate'.)
date_cols_unpure = [
    'earliestReleaseDate',
    'paroleEligibilityDate',
    'conditionalReleaseDate',
    'maxExpirationDate',
]

# Columns kept for building the transition table.
transition_table_cols = [
    'custodyStatus',
    'dateReceivedOriginal',
    'latestReleaseDate',
    'earliestReleaseDate',
    'conditionalReleaseDate',
    'minSentence',
    'maxSentence',
    'crime1', 'class1',
    'crime2', 'class2',
    'crime3', 'class3',
    'crime4', 'class4',
]

# Upper bound applied to length of stay, in days (30 years of 365 days = 10950).
THIRTY_YRS = 30 * 365

In [715]:
# Read one CSV per year, 2000 through 2020, then stack them into a single frame.
dfs = []
for year in range(2000, 2021):
    csv_path = "/Users/jpouls/recidiviz/nyrecidiviz/ny_inmate_data/inmates" + str(year) + ".csv"
    # index_col=0: the first CSV column becomes the index.
    # na_filter=False: blank cells stay as empty strings rather than becoming NaN.
    yearly = pd.read_csv(csv_path, index_col=0, parse_dates=date_cols, na_filter=False)
    dfs.append(yearly)

df_full = pd.concat(dfs)

In [716]:
# Work on a copy so df_full stays untouched, and report the unfiltered row count.
df = df_full.copy()
print(len(df))

# Ignore re-entries: keep rows whose original and current received dates agree.
is_first_entry = df['dateReceivedOriginal'] == df['dateReceivedCurrent']

# Ignore rows whose first crime is the 'NO CRIME RECORD AVAIL' placeholder.
has_crime_record = df['crime1'] != 'NO CRIME RECORD AVAIL'

df = df[is_first_entry & has_crime_record]

# Narrow to the columns needed for the transition table.
a = df.loc[:, transition_table_cols]

313291


In [717]:
# Where conditionalReleaseDate is the literal 'NONE', take that row's
# earliestReleaseDate instead.
b = a.copy()
mask = b['conditionalReleaseDate'].eq('NONE')
b.loc[mask, 'conditionalReleaseDate'] = b.loc[mask, 'earliestReleaseDate']

In [718]:
# A conditionalReleaseDate of 'LIFE' is swapped for the far-future placeholder
# '2/22/2222' so the column can be read as dates.
c = b.copy()
mask = c['conditionalReleaseDate'].eq('LIFE')
c.loc[mask, 'conditionalReleaseDate'] = '2/22/2222'

# Calendar year of the conditional release date, parsed from the string column.
release_dates = pd.DatetimeIndex(c['conditionalReleaseDate'])
c['year'] = release_dates.year
c.head(111)

Unnamed: 0_level_0,custodyStatus,dateReceivedOriginal,latestReleaseDate,earliestReleaseDate,conditionalReleaseDate,minSentence,maxSentence,crime1,class1,crime2,class2,crime3,class3,crime4,class4,year
DIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00A0002,RELEASED,2000-01-03,2001-04-11,,04/11/2001,0.0,2.0,SEXUAL ABUSE 1ST,D,,,,,,,2001.0
00A0003,RELEASED,2000-01-03,2002-01-04,,01/06/2002,2.0,4.0,ATT FORGERY 2ND,E,DWI:ALCOHOL/DRUGS-2ND OFFENSE,E,,,,,2002.0
00A0005,RELEASED,2000-01-03,2002-03-11,,09/12/2003,3.0,6.0,ATT CRIM SALE CONTR SUBSTANCE 3RD,C,,,,,,,2003.0
00A0009,RELEASED,2000-01-03,2001-09-17,,02/26/2005,2.33,7.0,PERJURY 1ST,D,,,,,,,2005.0
00A0010,RELEASED,2000-01-03,2001-07-27,,04/14/2001,0.0,2.0,SEXUAL ABUSE 1ST,D,,,,,,,2001.0
00A0012,DISCHARGED,2000-01-04,2002-05-15,,05/15/2002,1.5,3.0,CRIM CONTEMPT 1ST,E,,,,,,,2002.0
00A0017,RELEASED,2000-01-04,2001-01-05,,09/26/2001,1.5,3.0,ATT ROBBERY 3RD,E,,,,,,,2001.0
00A0018,RELEASED,2000-01-04,2000-08-10,,,1.0,3.0,CRIM SALE CONTR SUBSTANCE 3RD,B,,,,,,,
00A0022,RELEASED,2000-01-04,2000-12-14,,,4.0,100.0,CRIM POSS CONTR SUBSTANCE 2ND,A2,CRIM SALE CONTR SUBSTANCE 2ND,A2,,,,,
00A0027,RELEASED,2000-01-05,2000-09-08,,09/10/2000,0.0,2.0,ATT ROBBERY 2ND,D,,,,,,,2000.0


In [719]:



# End of stay: latestReleaseDate when present, conditionalReleaseDate otherwise.
end_of_stay = c['latestReleaseDate'].combine_first(c['conditionalReleaseDate'])

# LOS (length of stay) is the timedelta from dateReceivedOriginal to that end date.
c = c.assign(
    enddate=end_of_stay,
    LOS=end_of_stay - c['dateReceivedOriginal'],
)

# Keep only rows whose LOS is at least one whole day; this drops negative,
# zero-day and missing values alike.
c = c.loc[c['LOS'].dt.days > 0]

print(len(c))

206953


In [720]:
# get most serious crime by class
def getCrime(df):
    """Pick the crime with the lowest-sorting class from one inmate record.

    Starts from ``crime1``/``class1`` and switches to a later slot (2-4) only
    when that slot's class compares strictly lower as a string, so ties keep
    the earlier slot.

    Blank class slots are skipped instead of ending the scan, so a gap (for
    example an empty ``class2`` followed by a filled ``class3``) no longer
    hides the later crimes. A NaN class is treated as blank.

    Args:
        df: A single record, not a whole frame: any object supporting
            ``df[key]`` reads and writes, such as the row Series passed by
            ``DataFrame.apply(..., axis=1)``. It must hold ``crime1``..``crime4``
            and ``class1``..``class4``.

    Returns:
        The same object, modified in place, with ``crime`` and
        ``crime_class`` set to the selected slot's values.
    """
    df['crime'], df['crime_class'] = df['crime1'], df['class1']
    for slot in ('2', '3', '4'):
        cl = df['class' + slot]
        # `cl != cl` is true only for NaN; `not cl` covers '' and None.
        if cl != cl or not cl:
            continue
        if cl < df['crime_class']:
            df['crime'], df['crime_class'] = df['crime' + slot], cl
    return df
# Tag every row with its selected crime and class (row-wise apply of getCrime).
c = c.apply(getCrime, axis=1)

# Label each row as "<crime>|<class>".
crime_label = c['crime'] + '|' + c['crime_class']

# Reduce to the three columns used from here on, with LOS as whole days.
# The frame is built with assign() on the selection rather than by writing a
# column into it afterwards, which avoids pandas' SettingWithCopyWarning.
c = c[['dateReceivedOriginal', 'LOS']].assign(
    LOS=lambda frame: frame['LOS'].dt.days,
    crime=crime_label,
)

In [721]:
# Cap LOS at THIRTY_YRS days: any longer stay is recorded as exactly the cap.
d = c.copy()
mask = d['LOS'].gt(THIRTY_YRS)
d['LOS'] = d['LOS'].where(~mask, THIRTY_YRS)

In [734]:
# One table per crime, each giving the proportion of remaining inmates released
# at every distinct LOS value.
transition_df_by_crime = []

# Crimes are visited in value_counts() order.
for crime in d['crime'].value_counts().index:
    crime_rows = d[d['crime'] == crime]

    # Number of inmates released at each LOS (index: LOS).
    released = crime_rows.groupby('LOS')['crime'].count()

    # Inmates still present going into each LOS: the total, less everyone
    # released up to and including this LOS, plus this LOS's own releases.
    n_left = (len(crime_rows.index) - released.cumsum()) + released

    # Released share of those remaining, tagged with the crime for disaggregation.
    per_crime = pd.DataFrame({'transition': released / n_left, 'CRIME': crime})
    transition_df_by_crime.append(per_crime)

transitions = pd.concat(transition_df_by_crime)

# Write the result to a CSV whose name ends in the current Unix timestamp.
out_path = '/Users/jpouls/recidiviz/nyrecidiviz/mm_preprocessing/transitionfull/transitionfull' + str(int(time.time())) + '.csv'
transitions.to_csv(out_path)

In [723]:
# OLD
# diffdates[~diffdates.paroleHearingType.str.contains("VIOLA")]
# merg = pd.merge(diffdates, parole_viol, how='inner', on=['DIN'])
# c['crime_short'] = c.crime.str.extract(r'(.*?\d).*')[0]

# s = d.cc.unique()
# s = [str(x) for x in s]
# s.sort()

# bins = np.arange(0.0, 36500.0, 365.0/12)
# bins = np.append(bins,max(d['LOS'].dt.days))
# count, bins_count = np.histogram(d['LOS'].dt.days, bins=bins)
# pdf = count / sum(count)
# cdf = np.cumsum(pdf) 
# d['LOS'].dt.days.hist(bins=bins)
# los.hist(bins=32)




#     numLOS['cum'] = cum.crime
#     numLOS['cum_left'] = minus(cum.crime)
#     numLOS['out_of_this_many'] = numLOS.cum_left+numLOS.crime
#     numLOS['transition'] = numLOS.crime/(numLOS.cum_left+numLOS.crime)

#     print(numLOS)
#     print(numLOS.index.to_numpy(),len(numLOS.index.to_numpy()))
#     print(numLOS.to_numpy(),len(numLOS.to_numpy()))
#     print(np.cumsum(numLOS))#,len(np.cumsum(numLOS.to_numpy())))
#     print(square(np.cumsum(numLOS.to_numpy())))