In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from IPython.display import display

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures created below.
plt.style.use('ggplot')

In [None]:
# Load the raw CSV export into `df`.
# NOTE(review): this is an absolute path on a single machine, so the notebook cannot be
# re-run elsewhere as-is; consider a path relative to the project/data directory.
df = pd.read_csv('/Users/jenniferkaufman/Desktop/dsi_galvanize/capstones/capstone1/pretrial_services_success/ahAC21Q3.csv')

In [None]:
# Remove the column display limit (None) so wide frames are shown in full,
# then preview the first rows of the raw data.
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Column names, dtypes and non-null counts for the raw data.
df.info()

In [None]:
# Summary statistics for the numeric columns of the raw data.
df.describe()

### Data Cleaning

In [None]:
# Show the records whose Age is recorded as 0.
# Author's observation: two rows, sharing the same STAYID and CASEID, have Age == 0.
df[df['Age']==0]

In [None]:
# Drop the rows whose Age is 0 (the two rows identified in the check above).
# .copy() makes `df` an independent frame rather than a filtered slice, so the
# in-place recodes and new-column assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[df['Age'] != 0].copy()

In [None]:
# Summary statistics again, now that the Age == 0 rows have been removed.
df.describe()

In [None]:
# Frequency of each termination reason code.
df['TERMREASONID'].value_counts()

In [None]:
# Frequency of each termination outcome code (60 = successful, 61 = unsuccessful,
# per the recoding comments further down).
df['TERMINATIONID'].value_counts()

In [None]:
# Author's note: some rows with TERMREASONID == 'NC' are miscoded as 60 in
# TERMINATIONID; they should be 61 (unsuccessful), so every 'NC' row is set to 61.
df.loc[df['TERMREASONID'].eq('NC'), 'TERMINATIONID'] = 61

In [None]:
# Re-count outcome codes to see the effect of forcing the 'NC' rows to 61.
df['TERMINATIONID'].value_counts()

In [None]:
# Recode the outcome as a binary flag: successful (60) -> 1, unsuccessful (61) -> 0.
for old_code, new_flag in ((60, 1), (61, 0)):
    df.loc[df['TERMINATIONID'].eq(old_code), 'TERMINATIONID'] = new_flag

In [None]:
# Confirm the outcome column now holds only the recoded values (1 = successful, 0 = unsuccessful).
df['TERMINATIONID'].value_counts()

In [None]:
# Parse both segment boundaries as datetimes.
for date_col in ('SegmentStart', 'SegmentEnd'):
    df[date_col] = pd.to_datetime(df[date_col])

# Length of each segment in whole days (end minus start), stored as int16,
# added as a column for further analysis.
segment_span = df['SegmentEnd'] - df['SegmentStart']
df['length_of_segments'] = segment_span.dt.days.astype('int16')

In [None]:
# Verify the new dtypes: datetime segment boundaries and the int16 length_of_segments column.
df.info()

In [None]:
# Each bond condition (CONDITIONID) has its own row, so the same segment is repeated
# for a CASEID; keep one row per (case, segment start, segment end, supervision level).
df2 = df.drop_duplicates(
    subset=['CASEID', 'SegmentStart', 'SegmentEnd', 'SUPERVISIONLEVELID']
)
df2.info()

In [None]:
# Preview the de-duplicated, segment-level frame.
df2.head()

In [None]:
# Collapse the segment-level rows to one row per combination of CASEID and the
# case-level attributes below, totalling length_of_segments (days) within each group.
# The built-in 'sum' aggregation replaces the original per-group Python lambda: same
# totals, computed by pandas' optimized groupby path.
# Note: groupby drops rows with a NaN in any grouping column by default.
df3 = (
    df2.groupby(['CASEID', 'STAYID', 'Age', 'RACE', 'ETHNICID', 'OSEX',
                 'CPATCATID', 'OFFENSECATID', 'TERMINATIONID', 'TERMREASONID'])
    .agg({'length_of_segments': 'sum'})
    .reset_index()
)


In [None]:
# Column dtypes and non-null counts for the aggregated frame.
df3.info()

### Observations

+ there are no nan values for the columns I will use in data analysis
+ all columns are categorical other than "length_of_segments" and "Age"

In [None]:
# First rows of the aggregated frame.
df3.head()

In [None]:
# Last rows of the aggregated frame.
df3.tail()

In [None]:
# Summary statistics for the numeric columns of the aggregated frame.
df3.describe()

In [None]:
# Test whether the extreme value of 1883 days is accurate: show the row(s) with that total.
df3[df3['length_of_segments']==1883.000000]

In [None]:
# Same test at segment level: list the individual segments for CASEID '15M05929'
# (presumably the case returned by the previous cell).
df2[df2['CASEID']=='15M05929']

In [None]:
# Rows whose total supervision length is under 5 days.
df3[df3['length_of_segments']<5]

In [None]:
# Drop the rows with fewer than 5 total days of supervision (those listed by the
# `< 5` check above). The original `> 5` also removed rows of exactly 5 days,
# contradicting both that check and the stated rule, so the bound is `>= 5`.
df3 = df3[df3['length_of_segments'] >= 5]

In [None]:
# Summary statistics after removing the short-supervision rows.
df3.describe()

In [None]:
# Row count and dtypes after filtering.
df3.info()

In [None]:
# Outcome balance (1 = successful, 0 = unsuccessful).
df3['TERMINATIONID'].value_counts()

In [None]:
# Distinct RACE codes present.
df3.RACE.unique()

In [None]:
# Rows per RACE code.
df3['RACE'].value_counts()

In [None]:
# Distinct ETHNICID codes present.
df3.ETHNICID.unique()

In [None]:
# Rows per ETHNICID code.
df3['ETHNICID'].value_counts()

In [None]:
# Distinct outcome values (expected to be only 0 and 1 after recoding).
df3.TERMINATIONID.unique()

In [None]:
# Distinct termination reason codes present.
df3.TERMREASONID.unique()

In [None]:
# Distinct offense category codes present.
df3.OFFENSECATID.unique()

In [None]:
# Rows per offense category code.
df3['OFFENSECATID'].value_counts()

In [None]:
# Distinct CPATCATID codes present (plotted as "Risk Level" later on).
df3.CPATCATID.unique()

In [None]:
# Persist df3 with IPython's %store magic so another notebook can restore it with `%store -r df3`.
%store df3

### Data Visualization

In [None]:
# Visual check for null values: heatmap of the boolean null mask of df3
# (row tick labels and colour bar hidden).
sns.heatmap(
    df3.isnull(),
    cmap="YlGnBu",
    cbar=False,
    yticklabels=False,
)

+ There are no nans in the categories in the graph above
+ I will not be using categories with nan values: ODARASCORE, NewChargesCat, NewChargesClass, CCNOTES, or MONITORTYPEID

### Age

In [None]:
# Age against the binary outcome (1 = successful, 0 = unsuccessful); the low alpha
# lets overlapping points build up so density is visible.
sns.scatterplot(data=df3, x='Age', y='TERMINATIONID', alpha=0.1)
plt.title('Pretrial Services Success by Age')

In [None]:
fig, ax = plt.subplots()

# Overlay the age distribution of each outcome group
# (TERMINATIONID 0 = unsuccessful, 1 = successful).
for outcome, colour, opacity, group_name in (
    (0, "blue", 0.5, "Unsuccessful"),
    (1, "green", 0.25, "Successful"),
):
    group_ages = df3.loc[df3['TERMINATIONID'] == outcome, "Age"]
    ax.hist(group_ages, bins=25, alpha=opacity, color=colour, label=group_name)

ax.set(xlabel="Age", ylabel="Count")
fig.suptitle("Age vs. Pretrial Success")
ax.legend(title="Termination Type")

# Written relative to the working directory; the EDA_images folder must already exist.
fig.savefig('EDA_images/Age_EDA.png', bbox_inches='tight');


### Gender

In [None]:
# Row counts per OSEX value, split by outcome.
ax = sns.countplot(data=df3, x="OSEX", hue="TERMINATIONID",
                   palette='Paired', saturation=1.5)

# NOTE(review): the tick and legend labels are assigned by position, so they assume
# the OSEX categories plot as Male then Female and the hue levels as 0 then 1 — confirm.
ax.set_xticklabels(["Male", "Female"])
ax.set_xlabel('Gender')
ax.set_title("Gender vs. Pretrial Success")
ax.legend(title="Termination Type", labels=["Unsuccessful", "Successful"])

# Written relative to the working directory; the EDA_images folder must already exist.
ax.figure.savefig('EDA_images/Gender_EDA.png', bbox_inches='tight')

### Race

In [None]:
# Row counts per RACE code, split by outcome (0 = unsuccessful, 1 = successful).
ax = sns.countplot(x="RACE", hue="TERMINATIONID", data=df3)

# Title this plot's own axes. The original called fig.suptitle(), but `fig` is the
# figure left over from an earlier plt.subplots() cell, so this chart stayed untitled.
ax.set_title("Race vs. Pretrial Success");

### Ethnicity

In [None]:
# Row counts per ETHNICID code, split by outcome.
ax = sns.countplot(x="ETHNICID", hue="TERMINATIONID", data=df3, palette='Paired', saturation=1.5)

# NOTE(review): the tick and legend labels are assigned by position, so they assume
# the plotted category order matches these lists — confirm against the ETHNICID coding.
ax.set_xticklabels(["Not-Hispanic", "Hispanic", "Unknown"])
# The x-axis shows ethnicity; the original mislabelled it 'Termination Type'
# (that is the hue, already named in the legend title).
ax.set(xlabel='Ethnicity')
ax.legend(['Unsuccessful','Successful'],
          title='Termination Type')
ax.set_title("Ethnicity vs. Pretrial Success")
# Written relative to the working directory; the EDA_images folder must already exist.
ax.figure.savefig('EDA_images/Ethnicity_EDA.png', bbox_inches='tight')

### Offense Category

In [None]:
# Row counts per offense category, split by outcome (0 = unsuccessful, 1 = successful).
ax = sns.countplot(x="OFFENSECATID", hue="TERMINATIONID", data=df3)

# Title this plot's own axes. The original called fig.suptitle(), but `fig` is the
# figure left over from an earlier plt.subplots() cell, so this chart stayed untitled.
ax.set_title("Offense Category vs. Pretrial Success");

### Supervision Level

In [None]:
# Counts per supervision level, split by outcome (0 = unsuccessful, 1 = successful).
# This uses df2 (one row per de-duplicated segment) because SUPERVISIONLEVELID is not
# carried into df3, so the bars count segments rather than cases.
ax = sns.countplot(x="SUPERVISIONLEVELID", hue="TERMINATIONID", data=df2)

# Title this plot's own axes. The original called fig.suptitle(), but `fig` is the
# figure left over from an earlier plt.subplots() cell, so this chart stayed untitled.
ax.set_title("Supervision Level vs. Pretrial Success");

### Length of Supervision

In [None]:
fig, ax = plt.subplots()

# Overlay the supervision-length distribution of each outcome group
# (TERMINATIONID 0 = unsuccessful, 1 = successful).
for outcome, colour, opacity, group_name in (
    (0, "blue", 0.5, "Unsuccessful"),
    (1, "green", 0.25, "Successful"),
):
    group_days = df3.loc[df3['TERMINATIONID'] == outcome, "length_of_segments"]
    ax.hist(group_days, bins=300, alpha=opacity, color=colour, label=group_name)

# Only the 0-400 day range is shown; longer totals fall outside the visible axis.
ax.set_xlim(0, 400)
ax.set(xlabel="Days", ylabel="Count")
ax.legend(title="Termination Type")

fig.suptitle("Length of Supervision vs. Pretrial Success")
# Written relative to the working directory; the EDA_images folder must already exist.
fig.savefig('EDA_images/Length_EDA.png', bbox_inches='tight');

### Risk Level

In [None]:
# Row counts per CPATCATID (risk level), split by outcome (0 = unsuccessful, 1 = successful).
ax = sns.countplot(x="CPATCATID", hue="TERMINATIONID", data=df3)

# Title this plot's own axes. The original called fig.suptitle(), but `fig` is the
# figure created by the preceding plt.subplots() cell, so this chart stayed untitled.
ax.set_title("Risk Level vs. Pretrial Success");

In [None]:
# Counts per CPATCATID (risk level) split by outcome, drawn as one panel per OSEX
# value; the x-axis order is fixed to C1..C4.
g = sns.catplot(x="CPATCATID", hue="TERMINATIONID", col="OSEX",
                data=df3, kind="count",
                height=4, aspect=.7, order=['C1','C2','C3', 'C4'])

# The hue is TERMINATIONID, so title the grid's own legend accordingly. The original
# called ax.legend(title="Risk Level") on `ax`, which belongs to the previous cell's
# countplot rather than this catplot, and its title named the x variable, not the hue.
if g.legend is not None:
    g.legend.set_title("Termination Type")

### Hypotheses to test based on EDA

Age <br>
* $H_0$: Successful and unsuccessful groups do not vary by age.<br>
* $H_a$: Successful and unsuccessful groups do vary by age.<br>
alpha=0.05<br>
statistical test=Mann–Whitney U test<br>
Bonferroni correction: look into this and possibly adjust the alpha value

Gender <br>
* $H_0$: Pretrial services success and gender are independent.<br>
* $H_a$: Pretrial services success and gender are not independent.<br>
alpha=0.05<br>
statistical test=chi-squared test of independence<br>

Race <br>
Null Hypothesis:Pretrial services success rates do not vary by race.<br>
Alternate Hypothesis: Pretrial services success rates do vary by race.<br>
alpha=0.05<br>
statistical test=chi-squared test of independence<br>

Ethnicity <br>
* $H_0$: Pretrial services success and ethnicity are independent.<br>
* $H_a$: Pretrial services success and ethnicity are not independent.<br>
alpha=0.05<br>
statistical test=chi-squared test of independence<br>

Offense Category <br>
Null Hypothesis:Pretrial services success rates do not vary by offense type.<br>
Alternate Hypothesis: Pretrial services success rates do vary by offense type.<br>
alpha=0.05<br>
statistical test=chi-squared test of independence<br>

Length of Supervision <br>
* $H_0$: Successful and unsuccessful groups do not vary by length of supervision.<br>
* $H_a$: Successful and unsuccessful groups do vary by length of supervision.<br>
alpha=0.05<br>
statistical test=Mann–Whitney U test<br>


Risk Category <br>
Null Hypothesis: Pretrial services success rates do not vary by risk category.<br>
Alternate Hypothesis: Pretrial services success rates do vary by risk category.<br>
alpha=0.05<br>
statistical test=chi-squared test of independence<br>