### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from datetime import datetime
import re
import collections
import string

In [2]:
# Function that displays all the columns of a data frame
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [3]:
os.getcwd()

'C:\\Users\\Kalmerrick\\Documents\\JC Treasury Analysis\\Parole Stats Analysis'

In [4]:
PATH = 'Data/'

In [5]:
os.listdir(PATH)

['JC TOMS Assessment Outcomes On Release.xlsx',
 'linkedParoleAndTomsData.csv',
 'paroleStats.xlsx',
 'Stats for PC.xlsx']

In [6]:
# got an utf error, got around it by using engine='python'
df = pd.read_csv(f'{PATH}linkedParoleAndTomsData.csv', engine='python',      \
    parse_dates = ['DecisionDate', 'ReceptionDate', 'DischargeDate', 'DateSentenced', \
    'SentenceCommenceDate', 'EarliestReleaseDate', 'MaximumSentenceDate',
       'SentenceToMaxSentence'                             
                                 ])

In [7]:
df.head()

Unnamed: 0,StayID,ALKey,DecisionFY,DecisionDate,TOMS_ID,Ethnicity,MatterType,ReviewType,ReviewReason,ReleaseType,...,ReasonSequence,Reason,ANZSOC_Code,ReceptionDate,DischargeDate,DateSentenced,SentenceCommenceDate,EarliestReleaseDate,MaximumSentenceDate,SentenceToMaxSentence
0,,-0.000991572,2016/17,2017-12-06,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,1,You are not eligible to be released on parole ...,,NaT,NaT,NaT,,NaT,NaT,
1,,-0.00297619,2015/16,2016-03-18,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,1,"On 15 March 2016, you received a sentence of t...",,NaT,NaT,NaT,,NaT,NaT,
2,,-0.00297619,2015/16,2016-03-18,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,2,As there is currently no Supervised Release or...,,NaT,NaT,NaT,,NaT,NaT,
3,,-0.000248016,2015/16,2016-03-06,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,1,Your unmet treatment needs (substance abuse an...,,NaT,NaT,NaT,,NaT,NaT,
4,,-0.000248016,2015/16,2016-03-06,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,2,An extensive criminal history including violen...,,NaT,NaT,NaT,,NaT,NaT,


In [9]:
df.set_index('DecisionDate').head()

Unnamed: 0_level_0,StayID,ALKey,DecisionFY,TOMS_ID,Ethnicity,MatterType,ReviewType,ReviewReason,ReleaseType,ReviewSequence,...,ReasonSequence,Reason,ANZSOC_Code,ReceptionDate,DischargeDate,DateSentenced,SentenceCommenceDate,EarliestReleaseDate,MaximumSentenceDate,SentenceToMaxSentence
DecisionDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-06,,-0.000991572,2016/17,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,1,...,1,You are not eligible to be released on parole ...,,NaT,NaT,NaT,,NaT,NaT,
2016-03-18,,-0.00297619,2015/16,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,1,...,1,"On 15 March 2016, you received a sentence of t...",,NaT,NaT,NaT,,NaT,NaT,
2016-03-18,,-0.00297619,2015/16,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,1,...,2,As there is currently no Supervised Release or...,,NaT,NaT,NaT,,NaT,NaT,
2016-03-06,,-0.000248016,2015/16,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,2,...,1,Your unmet treatment needs (substance abuse an...,,NaT,NaT,NaT,,NaT,NaT,
2016-03-06,,-0.000248016,2015/16,,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,2,...,2,An extensive criminal history including violen...,,NaT,NaT,NaT,,NaT,NaT,


In [10]:
df.describe()

Unnamed: 0,ReviewSequence,ReasonSequence,ANZSOC_Code,SentenceToMaxSentence
count,74136.0,74136.0,74065.0,74035.0
mean,1.223117,2.891793,663.605131,653.699102
std,0.691491,1.64715,403.558874,577.707543
min,1.0,1.0,111.0,-309.0
25%,1.0,2.0,213.0,284.0
50%,1.0,3.0,612.0,454.0
75%,1.0,4.0,1021.0,823.0
max,12.0,15.0,1694.0,7264.0


### Missing data

In [11]:
# get the number of missing data points per column
missing_value_count = df.isnull().sum()

In [13]:
# look at the number of missing values
missing_value_count.sort_values(ascending = False).head()

EarliestReleaseDate      14014
DischargeDate             4018
EED                       1483
MaximumSentenceDate        101
SentenceToMaxSentence      101
dtype: int64

In [14]:
df.shape

(74136, 23)

In [15]:
# How many total missing values do we have?
total_cells = np.product(df.shape)
total_missing = missing_value_count.sum()

# percent of data that is missing
(total_missing / total_cells) * 100

1.1778587883138392

In [16]:
# remove all columns with at least one missing value
# This may be extreme, as I need the ANZSOC codes and only 71 are missing
# df.dropna(axis=1, inplace = True)
# Try axis =0, to delete rows
df.dropna(axis=0, inplace = True)

In [17]:
# Note about 20k rows got deleted
df.isnull().values.any()

False

In [18]:
# Convert ANZSOC Code to int as was appearing as float
df['ANZSOC_Code'] = df['ANZSOC_Code'].astype(int)
df['SentenceToMaxSentence'] = df['SentenceToMaxSentence'].astype(int)

In [19]:
df.head()

Unnamed: 0,StayID,ALKey,DecisionFY,DecisionDate,TOMS_ID,Ethnicity,MatterType,ReviewType,ReviewReason,ReleaseType,...,ReasonSequence,Reason,ANZSOC_Code,ReceptionDate,DischargeDate,DateSentenced,SentenceCommenceDate,EarliestReleaseDate,MaximumSentenceDate,SentenceToMaxSentence
71,0G28H7LU60WADZ0H,H0841768-18/10/2010,2010/11,2010-10-18,H0841768,NON ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,1,Extensive criminal history since the thirteen ...,1021,2000-06-10,2012-03-06,2002-07-25,4/10/2000,2012-03-06,2018-03-04,5732
72,0G28H7LU60WADZ0H,H0841768-18/10/2010,2010/11,2010-10-18,H0841768,NON ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,2,Serious drug dealing offences place the commun...,1021,2000-06-10,2012-03-06,2002-07-25,4/10/2000,2012-03-06,2018-03-04,5732
73,0G28H7LU60WADZ0H,H0841768-18/10/2010,2010/11,2010-10-18,H0841768,NON ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,3,Breached Commonwealth parole by re-offending i...,1021,2000-06-10,2012-03-06,2002-07-25,4/10/2000,2012-03-06,2018-03-04,5732
74,0G28H8NHT0WADZ0H,C0922538-17/08/2012,2012/13,2012-08-17,C0922538,NON ABORIGINAL,Parole,Parole,Re-application,Discretionary,...,1,Repeated penetrative sex offences over a lengt...,311,1999-11-30,2013-08-09,2000-06-20,10/05/2000,2013-08-09,2020-09-05,7264
75,0G28H8NHT0WADZ0H,C0922538-17/08/2012,2012/13,2012-08-17,C0922538,NON ABORIGINAL,Parole,Parole,Re-application,Discretionary,...,2,Your minimal participation in Cognitive Skills...,311,1999-11-30,2013-08-09,2000-06-20,10/05/2000,2013-08-09,2020-09-05,7264


In [20]:
# Sort by DecisionDate and TOMS_ID
df.sort_values(['DecisionDate','TOMS_ID'], ascending = [True, True]).head()

Unnamed: 0,StayID,ALKey,DecisionFY,DecisionDate,TOMS_ID,Ethnicity,MatterType,ReviewType,ReviewReason,ReleaseType,...,ReasonSequence,Reason,ANZSOC_Code,ReceptionDate,DischargeDate,DateSentenced,SentenceCommenceDate,EarliestReleaseDate,MaximumSentenceDate,SentenceToMaxSentence
3931,0KIZBBVBY0WADZ0K,B1073836-1/07/2010,2010/11,2010-01-07,B1073836,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,1,Serious and violent nature of current offences.,611,2009-01-05,2011-10-31,2009-10-10,1/05/2009,2011-10-31,2011-10-31,752
3932,0KIZBBVBY0WADZ0K,B1073836-1/07/2010,2010/11,2010-01-07,B1073836,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,2,"Extensive criminal history including violence,...",611,2009-01-05,2011-10-31,2009-10-10,1/05/2009,2011-10-31,2011-10-31,752
3933,0KIZBBVBY0WADZ0K,B1073836-1/07/2010,2010/11,2010-01-07,B1073836,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,3,Entrenched history of poly-substance abuse whi...,611,2009-01-05,2011-10-31,2009-10-10,1/05/2009,2011-10-31,2011-10-31,752
3934,0KIZBBVBY0WADZ0K,B1073836-1/07/2010,2010/11,2010-01-07,B1073836,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,4,Unaddressed offending behaviour (substance abu...,611,2009-01-05,2011-10-31,2009-10-10,1/05/2009,2011-10-31,2011-10-31,752
3935,0KIZBBVBY0WADZ0K,B1073836-1/07/2010,2010/11,2010-01-07,B1073836,ABORIGINAL,Parole,Parole,Parole Application,Discretionary,...,5,Parole in 2002 and 2003 breached by re-offending.,611,2009-01-05,2011-10-31,2009-10-10,1/05/2009,2011-10-31,2011-10-31,752


In [21]:
df.shape

(54843, 23)

In [22]:
# Filter for 'Denied Only'
mask = (df['Outcome'] == 'Denied')

In [23]:
df = df[mask]

In [24]:
# By filtering on individuals who were denied parole reduces dataset by approx 25k rows
df.shape

(29632, 23)

In [25]:
# Convert the 'Reason' column into a string
df['Reason'] = df['Reason'].astype(str, inplace = True)

In [26]:
# Create a new DataFrame that is grouped on DecisionDate and TOMS_ID, aggregate the Reasons
df2 = pd.DataFrame(df.groupby(['DecisionDate', 'TOMS_ID','ANZSOC_Code','ReceptionDate','DateSentenced','MaximumSentenceDate','SentenceToMaxSentence']).agg({'Reason':lambda x:'| '.join(x)}))

In [27]:
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Reason
DecisionDate,TOMS_ID,ANZSOC_Code,ReceptionDate,DateSentenced,MaximumSentenceDate,SentenceToMaxSentence,Unnamed: 7_level_1
2010-01-07,B1073836,611,2009-01-05,2009-10-10,2011-10-31,752,Serious and violent nature of current offences...
2010-01-07,B1354006,212,2010-04-13,2010-06-05,2010-10-13,161,Serious and violent nature of current offence ...
2010-01-07,B1367371,1411,2010-03-17,2010-03-17,2010-12-16,275,Repetitive nature of driving without a licence...
2010-01-07,D1859709,1513,2009-09-15,2009-09-17,2011-06-16,638,Serious and violent nature of current offences...
2010-01-07,E0871348,412,2010-12-01,2010-12-01,2011-11-01,365,Serious nature of current offences of driving ...


In [28]:
df2.shape

(7794, 1)

In [29]:
# Reset the index so that the DecisionDate is populated throughout the column
df2.reset_index(inplace = True)

In [30]:
df2.head()

Unnamed: 0,DecisionDate,TOMS_ID,ANZSOC_Code,ReceptionDate,DateSentenced,MaximumSentenceDate,SentenceToMaxSentence,Reason
0,2010-01-07,B1073836,611,2009-01-05,2009-10-10,2011-10-31,752,Serious and violent nature of current offences...
1,2010-01-07,B1354006,212,2010-04-13,2010-06-05,2010-10-13,161,Serious and violent nature of current offence ...
2,2010-01-07,B1367371,1411,2010-03-17,2010-03-17,2010-12-16,275,Repetitive nature of driving without a licence...
3,2010-01-07,D1859709,1513,2009-09-15,2009-09-17,2011-06-16,638,Serious and violent nature of current offences...
4,2010-01-07,E0871348,412,2010-12-01,2010-12-01,2011-11-01,365,Serious nature of current offences of driving ...


In [31]:
# The count of grouped reasons is 7794
df2['Reason'].count()

7794

In [32]:
# Logic for separating Deny Release on Parole Reasons into 'Treatment Needs All'
df2['Treatment Needs All'] = np.where((df2['Reason'].str.contains("treatment need", case=False, na=False) \
    | df2['Reason'].str.contains("extensive criminal", case=False, na=False)) , 'Y', 'N' )

df2['Compliance All'] = np.where(df2['Reason'].str.contains("poor", case=False, na=False), 'Y', 'N' )
#poor prison

df2['Parole Plan All'] = np.where(df2['Reason'].str.contains("release plan", case=False, na=False), 'Y', 'N' )
 
df2['s10 dr10'] = np.where(df2['Reason'].str.contains("Your ineligibility to be released", case=False, na=False), 'Y', 'N' )

df2['Deny Release on Parole All'] = (df2['Treatment Needs All'].apply(lambda x : x =='Y')) | (df2['Compliance All'].apply(lambda x : x =='Y')) \
| (df2['Parole Plan All'].apply(lambda x : x =='Y')) | (df2['s10 dr10'].apply(lambda x : x =='Y'))

In [33]:
df2['Reason'][df2['Treatment Needs All'] == 'Y'].count()

6464

### High level text mine

In [35]:
# Create a string for testing that contains all the reasons
reasonJoined = " ".join(df2['Reason']); reasonJoined[:500]

'Serious and violent nature of current offences.| Extensive criminal history including violence, having offended every year since 1998.| Entrenched history of poly-substance abuse which is directly linked to your offending.| Unaddressed offending behaviour (substance abuse and violence);| Parole in 2002 and 2003 breached by re-offending.| Community Based Order in 2002 and 2009 breached and cancelled by re-offending. Serious and violent nature of current offence committed in a domestic violence si'

In [36]:
# Make everything lower case
reasonJoinedLower = reasonJoined.lower()

In [37]:
# Strip out all the punctuation
reasonJoinedLowerStripped = "".join(l for l in reasonJoinedLower if l not in string.punctuation);reasonJoinedLowerStripped[:1000]

'serious and violent nature of current offences extensive criminal history including violence having offended every year since 1998 entrenched history of polysubstance abuse which is directly linked to your offending unaddressed offending behaviour substance abuse and violence parole in 2002 and 2003 breached by reoffending community based order in 2002 and 2009 breached and cancelled by reoffending serious and violent nature of current offence committed in a domestic violence situation extensive criminal history including domestic violence offending history of domestic violence places community safety at risk entrenched history of substance abuse which is directly linked to your offending and prior program participation has not assisted in addressing unaddressed offending behaviour domestic violence four out of four parole orders between 2004 and 2008 breached or cancelled by reoffending community based order in 2005 cancelled by reoffending intensive supervision order in 2004 cancell

In [38]:
# Get a word count of the most common words
collections.Counter(reasonJoinedLowerStripped.split()).most_common()[:20]

[('the', 38712),
 ('and', 28757),
 ('of', 27925),
 ('to', 24194),
 ('your', 24164),
 ('you', 14167),
 ('in', 13486),
 ('treatment', 13235),
 ('a', 10741),
 ('with', 9956),
 ('current', 8622),
 ('for', 8593),
 ('history', 8540),
 ('violence', 8378),
 ('as', 7965),
 ('abuse', 7880),
 ('substance', 7791),
 ('not', 7764),
 ('board', 7728),
 ('needs', 7680)]

In [39]:
# Get a count for the phrase 'due to the short duration'
reasonJoinedLowerStripped.count('due to the short duration')

681

In [40]:
# This does the same as above using regular expressions

count = 0

# Search for the phrase 'due to the short duration'    
# pattern = re.compile(r'due to the short duration', re.I) # Count is 681 validated above

# Search for the phrases 'treatment need' or 'extensive criminal'
#pattern = re.compile(r'.*treatment need|extensive criminal.*', re.I) # Count is 6464, validated above

# Search for the phrases 'treatment need' or 'extensive criminal' in any order (tbc)
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal)).*$', re.I) # Count is 6464, validated above

# Search for the phrase 'unmet treatment need', return the sentence
#pattern = re.compile(r'^(?=.*?unmet treatment need).*$', re.I) # Count is 5660

# Search for the phrases 'unmet treatment need' and 'treatment need' or 'extensive criminal'
#pattern = re.compile(r'^(?=.*?unmet treatment need)(?=.*?treatment need|extensive criminal).*$', re.I) # Count 5660

# Search for 'treatment need' or 'extensive criminal' and 'due to the short duration'
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?due to the short duration).*$', re.I) # 669 verified

# Search for phrase 'program' get ten characters on each side
# pattern = re.compile(r'..........program..........',re.I) 

# Pattern searches in order for 'treatment need' or 'extensive criminal' first, then searches for 'not been assessed')
#pattern = re.compile(r'(treatment need|extensive criminal).*(not been assessed).*(.......................program...............................................................)')

# Searches below searches for the words in any order, to get ten chars on either side of program need to 
# do additional step below

#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?treatment program).*$', re.I) # Count 2495 verified
pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?treatment program)(?=.*?backdating).*$', re.I) # Count 267 verified

for reason in df2['Reason']:
    match = pattern.search(reason)
    if match:
        count +=1
        #print(match.group())
print('Count is ' + str(count))

Count is 267


### Additional step to get  30 or so chars on either side of the word 'program' 

In [44]:
count = 0
# First search for a pattern that contains certain words in any order
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?not been assessed).*$', re.I) # Count 1487 verified
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?outstanding treatment).*$', re.I) # Count 387 verified
pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?unmet treatment need).*$', re.I) # Count 5660 verified

# Create a second pattern to search on with prior results
pattern2 = re.compile(r'............................program..................................................')
# Create an empty string
group3 = ''
for reason in df2['Reason']:
    # Search the 'pattern' in the reason
    match = pattern.search(reason)
    if match:
        matchResult = str(match.group())
        # Search the pattern2 with the matchResult
        match2 = pattern2.findall(matchResult)
        if match2:
            count +=1
            #print(match2)
            group3 = group3 + str(match2)
#print('Count is ' + str(count))
#print(group3)

In [45]:
# 20 most common words from above where
    # Rows are removed if they contain any null values
    # The outcome is 'Denied'
    # Reasons are grouped at an individual level
    # Via RE (regular expression) the reason contains ('treatment need' or 'extensive criminal') and 'unmet treatment need'
    # Then if the result of the line contains program get the string that contains 30 or so characters either side
    # Split the string and get the 20 most common words
    # I picked popular words like 'short' and 'duration' and created columns that check whether they contained certain
    # phrases like 'due to the short duration', e.g. below. Created in notebook 'Linked Parole and TOMS V1 Binned'
    # df3['due to the short duration'] = np.where(df3['Reason'].str.contains('due to the short duration', case=False, na=False), 'Y', 'N' )
    
collections.Counter(group3.split()).most_common()[:20]

[('the', 2566),
 ('to', 2556),
 ('treatment', 2338),
 ('in', 2199),
 ('programmes', 1967),
 ('intensive', 1950),
 ('any', 1286),
 ('due', 1215),
 ('and', 1106),
 ('whilst', 1068),
 ('custody', 1054),
 ('of', 1016),
 ('for', 797),
 ('short', 685),
 ('you', 675),
 ('duration', 642),
 ('your', 593),
 ('program', 583),
 ('programme', 468),
 ('have', 451)]

### Get the count of individuals with 'unmet needs' and reason does not contain 'short duration' or 'backdating'

In [46]:
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal)).*$', re.I) # count 6464
#pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?(unmet treatment need)).*$', re.I) # count 5660
#pattern = re.compile(r'^(?=.*?(treatment need)).*$', re.I) # count 6037
pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?(unmet treatment need))(?!.*?(due to the short duration))(?!.*?(backdating)).*$', re.I) # count 4677 validated
count = 0
for reason in df2['Reason']:
    match = pattern.search(reason)
    if match:
        count +=1
        #print(match.group())
print('Count is ' + str(count))

Count is 4677


### RE Parole reason tests

In [47]:
sentence = 'due to short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence2 = 'due to sshort duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence3 = 'due to hort duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence4 = 'due to -short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence5 = 'due to///short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence6 = 'due to 10 short duration of 9 sentence, you will not be 8 treatment assessed'
sentence7 = 'dhdhdhdhue to short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence8 = 'due to short duration of of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence9 = 'due to short.duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'

In [48]:
# Search for the phrase 'short duration' in the sentence
m = re.search(r'short duration', sentence)
m.group()

'short duration'

In [49]:
# [ ] is used to specify a character class, metacharacters are not active inside a class
# Search for a word where the first letter is matched to any characters in [rst] and is followed by hort
m = re.search(r'[rst]hort', sentence)
m.group()

'short'

In [50]:
# You can match characters NOT listed within the class by complementng the set, indicated by '^'
# Will match any character except 'x', followed by hort
m = re.search(r'[^x]hort', sentence)
m.group()

'short'

In [51]:
# '\w' mathces andy alphanumeric character equivalent to [a-zA_Z0-9_]
# Below mathes any alphanumeric character followed by 'hort'
m  = re.search(r'\whort', sentence)
m.group()

'short'

In [52]:
# The metacharter for repeating things is '*'
# *    '- 0 or More'
# Looks like RE quantifier * searches for 0 or more of the previous
match = re.search(r's*hort', sentence2, re.I)
match.group()

'sshort'

In [53]:
# Another repeating metacharacter is +, which matches one or more times. (requires one occurrence)
# Will work
match = re.search(r's+hort', sentence)
# Won't work on sentence3 as no 's'
#match = re.search(r's+hort', sentence3)
match.group()

'short'

In [54]:
# The question mark character '?' matches either once or zero times, it's optional
match = re.search(r'to -?short', sentence)
match = re.search(r'to -?short', sentence4)
match.group()

'to -short'

In [55]:
# The most complicated repeated qualifier is {m,n}, where m and n are decimal integers.
# This qualifier means there must be at least m repetitions, and at most n.
match = re.search(r'to/{1,3}short', sentence5, re.I)
match.group()

'to///short'

In [56]:
# The '.' character returns any character except New Line
match = re.search(r'treatment....', sentence5)
match.group()

'treatment ass'

In [57]:
# Other attribute findall()
# Finds all substrings where the RE matches, and returns them as a list
match = re.findall(r'short', sentence6, re.I)
match

['short']

In [58]:
# Other attribute finditer()
# Find all substrings where the RE matches, return them as an iterator
matches = re.finditer(r'short', sentence6)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(10, 15), match='short'>


In [59]:
# Other attribute match
# Determine if the RE matches at the beginning of the string
match = re.match(r'due to', sentence6, re.I)
match.group()

'due to'

In [60]:
# Examples
#'+' means one or more
match = re.match(r'[a-z]+', sentence, re.I)
match

<_sre.SRE_Match object; span=(0, 3), match='due'>

In [61]:
match = re.match(r'due', sentence, re.I)
match

<_sre.SRE_Match object; span=(0, 3), match='due'>

In [62]:
# Group returns the substring that was matched
match.group()

'due'

In [63]:
# Returns start and ending index
match.start(), match.end()

(0, 3)

In [64]:
match.span()

(0, 3)

In [65]:
# NOTE: The match() method only checks if the RE matches at the start of a string
# search() method of patterns scans through the string
# Below match() won't work as treatment is not at the beginning of the string
match = re.match(r'treatment', sentence, re.I)
if match:
    print('Match found: ', match.group())
else:
    print('No match')

No match


In [66]:
# search() will work and treatment is in string
match = re.search(r'treatment', sentence, re.I)
if match:
    print('Match found: ', match)
else:
    print('No match')

Match found:  <_sre.SRE_Match object; span=(51, 60), match='treatment'>


In [67]:
# Example to return all the matches of a pattern. Returns all the digits
# findall() creates a list
match = re.findall(r'\d+', sentence6, re.I)
match

['10', '9', '8']

In [68]:
# finditer() method returns a sequence of match object as an iterator
matches = re.finditer(r'\d+', sentence6, re.I)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(7, 9), match='10'>
<_sre.SRE_Match object; span=(28, 29), match='9'>
<_sre.SRE_Match object; span=(56, 57), match='8'>


In [69]:
# Compilation Flags, example re.I and Verbose
# Multiline, usually '^' matches only at the beginning of the string, and
match = re.search(r'^due', sentence6, re.I)
match

<_sre.SRE_Match object; span=(0, 3), match='due'>

In [70]:
# '$' matches only at the end of the string
match = re.search(r'assessed$', sentence6, re.I)
match

<_sre.SRE_Match object; span=(68, 76), match='assessed'>

In [71]:
# More metacharacters
# '\b' is word boundary, only matches when a complete word, won't match when contained inside another word
match = re.search(r'\bwill\b', sentence, re.I)
match

<_sre.SRE_Match object; span=(39, 43), match='will'>

In [72]:
# '|', match word1 'or' word2
match = re.search(r'\bzzzz|will\b', sentence, re.I)
match.group()

'will'

In [73]:
# Groups are marked by the '(', ')' metacharacters, grouping expressions contained inside the brackets
# You can repeat the contents of a group with a repeating qualifier, such as *,+,?, or {m,n}
match = re.match(r'(dh)*', sentence7, re.I)
match.span()

(0, 8)

In [74]:
match = re.search(r'(dh)*', sentence7, re.I)
match.span()

(0, 8)

In [75]:
# Group 0 is always present
match = re.search('(short )duration', sentence, re.I)
match.group(1)

'short '

In [76]:
#Subgroups are numbered from left to right, from 1 upward.
match = re.search('(you (will) not) be', sentence, re.I)
match.group(0) # 'you will not be'
match.group(1) # 'you will not'
match.group(2) # 'will'

'will'

In [77]:
# Groups can be passed multiple numbers at a time
match.group(2, 1, 1)

('will', 'you will not', 'you will not')

In [78]:
#The groups() method returns a tuple containing the stringss for all subgroups
match.groups()

('you will not', 'will')

In [79]:
# Backreferences in a pattern allow you specify that the current contents of an earlier capturing group
# must also be found at the current location in the string
# The following detects doubled words in a string
# \b denote boundary \w denotes word \s denotes whitespace \1 denotes the first word in the group hence repeated
match = re.search(r'\b(\w+)\s+\1\b', sentence8, re.I)
match.group()

'of of'

In [80]:
# Sometimes you'll want to use a group to denote a part of a regular expression, but 
# aren't interested in retrieving the group's contents
match = re.search(r'([abc])+', sentence, re.I)
match.groups()

('a',)

In [81]:
# You can make this fact explicit by using a non-capturing group (?:...) replace ... with any RE ????? Not sure
# Can't retrieve contents of what group matched?
match = re.search(r'(?:[abc])+', sentence, re.I)
match.groups()

()

In [82]:
# Groups can be referenced by name, still given numbers can retrieve both ways
# The syntax for a named group is (?P<name>)
match = re.search(r'(?P<test>\b\w+\b)', sentence)
match.group('test')

'due'

In [83]:
# Lookahead Assertions
# Positive lookahead assertion (?=...)   ... represents the RE      (succeeds if matches)
match = re.search(r'^(?=.*?not)(?=.*?short).*$', sentence, re.I)
match

<_sre.SRE_Match object; span=(0, 110), match='due to short duration of sentence, you will not b>

In [84]:
# Negative lookahead assertion (?!...)  ... represents the RE       (succeeds if doesn't match)
# search for a pattern that contains 'not' and does not contain 'zzzz'
# Note: Use '?!' for negative and '?=' for positive
match = re.search(r'^(?=.*?not)(?!.*?zzzz).*$', sentence, re.I)
match

<_sre.SRE_Match object; span=(0, 110), match='due to short duration of sentence, you will not b>

In [85]:
# lookahead example
# Use [.] to match the specific character class
# The trailing '$' is added to ensure the rest of the string is included 
match = re.search(r'.*[.].*$', sentence9, re.I)
match.group()
match

<_sre.SRE_Match object; span=(0, 110), match='due to short.duration of sentence, you will not b>

In [86]:
# #pattern = re.compile(r'^(?=.*?(treatment need|extensive criminal))(?=.*?unmet treatment need).*$', re.I)


### RE Parole reason tests

In [87]:
sentence = 'due to short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence2 = 'due to sshort duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence3 = 'due to hort duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence4 = 'due to -short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence5 = 'due to///short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence6 = 'due to 10 short duration of 9 sentence, you will not be 8 treatment assessed'
sentence7 = 'dhdhdhdhue to short duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence8 = 'due to short duration of of sentence, you will not be treatment assessed and you have outstanding treatment needs'
sentence9 = 'due to short.duration of sentence, you will not be treatment assessed and you have outstanding treatment needs'
