In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
warnings.filterwarnings('ignore')

There are four datasets: case.csv, demo.csv, prior_arrests.csv, and grades.csv. case.csv is the main
dataset and reflects dates of arrest and disposition (trial or court appearance) during the period in
which the program operated. The file also contains an indicator of whether the arrestee was referred
to the intervention program for that arrest (i.e. whether they were treated), whether the person was rearrested while awaiting trial, the number of prior arrests at the time of program entry, and the arrest
location. demo.csv contains demographic information about arrestees, including some who were not
included in the program evaluation. prior_arrests.csv reflects pre-period arrests among individuals in
case.csv; the pre-period ran from 2008-2011. Finally, grades.csv includes 9th and 10th grade course
grades for a subset of individuals in case.csv. Further description of the datasets is included at the end
of this document.

| Tables   |      Are      |  Cool |
|----------|:-------------:|------:|
| case|  main dataset | dates of arrest, diposition and referred indicator treatment |
| demo |    demographic info   |   arrestees info |
| prior arrests | pre period arrest |    from 2008 - 2011 |
|grades|9th to 10th grades|

## 1. Data Management


1. The demographic data were extracted from a system that inconsistently coded gender.
Recode it so that males are consistently coded as “M” and females are consistently coded as
“F”


In [33]:
demo = pd.read_csv('/content/drive/MyDrive/upen/demo.csv')
demo.head()

Unnamed: 0,person_id,race,gender,bdate
0,1,WHITE,F,1985-07-03
1,5,BLACK,M,1986-09-27
2,6,BLACK,M,1991-06-07
3,7,BLACK,F,1994-08-24
4,8,BLACK,M,1978-04-04


In [34]:
demo.gender.value_counts()#we can see that there more than 2 values for the gender, that must change

M         15241
F          3804
male       1164
female      227
Name: gender, dtype: int64

In [35]:
demo['gender'] = demo.gender.replace({'male':'M','female':'F'})

In [36]:
demo.gender.value_counts()
#now we only got the desired values

M    16405
F     4031
Name: gender, dtype: int64

2. Merge the case and demo datasets together so that each row in the case dataset also contains
the demographics of the defendant. Keep in mind that the populations in the case and demo
data may not be 100% aligned.

In [37]:
df_case = pd.read_csv('/content/drive/MyDrive/upen/case.csv')

In [8]:
print(f'Rows:{demo.shape[0]}\nColumns: {demo.shape[1]}')
demo.head(5)


Rows:20436
Columns: 4


Unnamed: 0,person_id,race,gender,bdate
0,1,WHITE,F,1985-07-03
1,5,BLACK,M,1986-09-27
2,6,BLACK,M,1991-06-07
3,7,BLACK,F,1994-08-24
4,8,BLACK,M,1978-04-04


In [38]:
#we have to verify if the demo dataset have any duplicated row
demo.duplicated().sum()
#there are 4721 duplicated rows so we have to drop this rows in orde to erge our dataset

4721

In [39]:
demo.drop_duplicates(inplace=True)
demo.shape
#now we got unique rows

(15715, 4)

In [40]:
print(f'Rows:{df_case.shape[0]}\nColumns: {df_case.shape[1]}')

df_case.head(5)

Rows:26000
Columns: 8


Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO"
1,39970,1,2012-07-11,2012-10-20,1,0,3,"4866 S CORNELL AVE, CHICAGO"
2,88413,1,2013-04-04,2013-06-22,0,0,4,"2543 N WILLETTS CT, CHICAGO"
3,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO"
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO"


In [41]:
df_case.duplicated().sum()
#no duplicated rows

0

In [65]:
df = df_case.merge(demo, on = 'person_id', how = 'left')

In [66]:
df.shape

(26000, 11)

In [44]:
df.head(5)

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address,race,gender,bdate
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO",WHITE,F,1985-07-03
1,39970,1,2012-07-11,2012-10-20,1,0,3,"4866 S CORNELL AVE, CHICAGO",WHITE,F,1985-07-03
2,88413,1,2013-04-04,2013-06-22,0,0,4,"2543 N WILLETTS CT, CHICAGO",WHITE,F,1985-07-03
3,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO",BLACK,M,1986-09-27
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO",BLACK,M,1991-06-07


3. While the program was mostly rolled out to defendants in Chicago, the State’s Attorney’s
Office also ran a pilot serving a small number of individuals arrested in other parts of Cook
County. For the purpose of this analysis, please restrict the data to only individuals who were
arrested in Chicago.

- we filter by the address column the values that contains chicago but with no case sensitive

In [67]:
df['street']  = df.address.str.split(',',expand = True)[0]
df['city']  = df.address.str.split(',',expand = True)[1]



In [68]:
df.city.value_counts()

 CHICAGO     21796
 Chicago      3204
 OAK LAWN      337
 CICERO        303
 Oak Lawn      262
 Cicero         98
Name: city, dtype: int64

In [71]:
data = df.loc[df["city"].str.contains('chicago',case = False)]

In [69]:
df.loc[df["city"].str.contains('chicago',case = False) ==False].shape[0]
# 985 were deleted

1000

## Part 2: Variable Creation

1. Create an age variable equal to the defendant’s age at the time of arrest for each case.

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25016 entries, 0 to 25894
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   caseid         25016 non-null  int64 
 1   person_id      25016 non-null  int64 
 2   arrest_date    25016 non-null  object
 3   dispos_date    25016 non-null  object
 4   treat          25016 non-null  int64 
 5   re_arrest      25016 non-null  int64 
 6   prior_arrests  25016 non-null  int64 
 7   address        25016 non-null  object
 8   race           25016 non-null  object
 9   gender         25016 non-null  object
 10  bdate          25016 non-null  object
dtypes: int64(5), object(6)
memory usage: 2.3+ MB


In [72]:
data['bdate']= pd.to_datetime(data['bdate'])
data['arrest_date']= pd.to_datetime(data['arrest_date'])
data['dispos_date']= pd.to_datetime(data['dispos_date'])

In [73]:
data['age_of_arrest'] = ((data['arrest_date'] - data['bdate'])/ np.timedelta64(1, 'Y')).apply(math.floor)

In [80]:
data.shape

(25000, 14)

2. The State’s Attorney is interested in pursuing a partnership with the Chicago Public Schools
to investigate the relationship between high school achievement and criminal justice
outcomes in early adulthood. To that end, the State’s Attorney’s Office has requested 9th and
10th grade course grade data from defendants between the ages of 18 and 24. These data are
included in grades.csv. Please construct measures for 9th and 10th grade GPA for this target
population. When constructing GPA, please use a 4 point scale, where: A=4, B=3, C=2, D=1,
and F=0

In [75]:
grades = pd.read_csv('/content/drive/MyDrive/upen/grades.csv')
print(grades.shape)
grades.head()

(11251, 17)


Unnamed: 0,person_id,gr9_fall_math,gr9_fall_sci,gr9_fall_eng,gr9_fall_hist,gr9_spring_math,gr9_spring_sci,gr9_spring_eng,gr9_spring_hist,gr10_fall_math,gr10_fall_sci,gr10_fall_eng,gr10_fall_hist,gr10_spring_math,gr10_spring_sci,gr10_spring_eng,gr10_spring_hist
0,1,D,,A,A,A,D,A,,D,A,A,D,A,A,A,A
1,5,D,,A,A,A,D,D,A,D,A,A,A,D,F,A,A
2,8,,D,F,A,A,D,D,D,A,D,,D,D,D,A,A
3,10,,A,A,A,,A,A,D,A,A,A,A,A,A,A,F
4,11,,A,,A,F,,D,D,D,D,A,F,A,,D,D


In [89]:
grades = grades.replace({'A':4,'B':3,'C':2,'D':1,'F':0})
grades.columns

Index(['person_id', 'gr9_fall_math', 'gr9_fall_sci', 'gr9_fall_eng',
       'gr9_fall_hist', 'gr9_spring_math', 'gr9_spring_sci', 'gr9_spring_eng',
       'gr9_spring_hist', 'gr10_fall_math', 'gr10_fall_sci', 'gr10_fall_eng',
       'gr10_fall_hist', 'gr10_spring_math', 'gr10_spring_sci',
       'gr10_spring_eng', 'gr10_spring_hist'],
      dtype='object')

In [90]:
gr_9th = ['gr9_fall_math', 'gr9_fall_sci', 'gr9_fall_eng',
       'gr9_fall_hist', 'gr9_spring_math', 'gr9_spring_sci', 'gr9_spring_eng',
       'gr9_spring_hist']
gr_10th = ['gr10_fall_math', 'gr10_fall_sci', 'gr10_fall_eng',
       'gr10_fall_hist', 'gr10_spring_math', 'gr10_spring_sci',
       'gr10_spring_eng', 'gr10_spring_hist']

In [93]:
grades[gr_9th]

Unnamed: 0,gr9_fall_math,gr9_fall_sci,gr9_fall_eng,gr9_fall_hist,gr9_spring_math,gr9_spring_sci,gr9_spring_eng,gr9_spring_hist
0,1.0,,4.0,4.0,4.0,1.0,4.0,
1,1.0,,4.0,4.0,4.0,1.0,1.0,4.0
2,,1.0,0.0,4.0,4.0,1.0,1.0,1.0
3,,4.0,4.0,4.0,,4.0,4.0,1.0
4,,4.0,,4.0,0.0,,1.0,1.0
...,...,...,...,...,...,...,...,...
11246,1.0,4.0,0.0,1.0,4.0,,0.0,4.0
11247,4.0,,0.0,4.0,4.0,1.0,4.0,
11248,4.0,4.0,1.0,,1.0,4.0,4.0,4.0
11249,1.0,4.0,4.0,,,,4.0,


In [94]:
grades[gr_9th].mean(axis =1,  skipna = False)
grades[gr_10th].mean(axis =1,  skipna = False)



0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
11246     NaN
11247     NaN
11248     NaN
11249     NaN
11250    3.25
Length: 11251, dtype: float64

In [82]:
data_1 = data.merge(grades, on = 'person_id', how = 'left')

In [86]:
dataset.shape

(5616, 30)

In [96]:
data[data_1.age_of_arrest <=24]

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address,race,gender,bdate,street,city,age_of_arrest
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO",BLACK,M,1991-06-07,5111 S SANGAMON ST,CHICAGO,21
5,26516,7,2012-02-25,2012-03-26,0,0,0,"7162 W PALATINE AVE, Chicago",BLACK,F,1994-08-24,7162 W PALATINE AVE,Chicago,17
8,82277,9,2012-01-12,2012-11-08,0,0,1,"2942 E 78TH PL, CHICAGO",ASIAN,F,1994-07-11,2942 E 78TH PL,CHICAGO,17
9,31881,9,2013-09-25,2013-12-29,1,0,2,"528 S JENSEN BLVD, CHICAGO",ASIAN,F,1994-07-11,528 S JENSEN BLVD,CHICAGO,19
16,56468,14,2013-04-05,2013-10-09,1,0,2,"3309 N MAJOR AVE, CHICAGO",BLACK,M,1988-10-02,3309 N MAJOR AVE,CHICAGO,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24983,96089,19989,2012-05-29,2013-01-10,0,0,4,"667 W BROMPTON AVE, CHICAGO",BLACK,F,1988-11-25,667 W BROMPTON AVE,CHICAGO,23
24984,61944,19989,2013-10-13,2013-11-05,1,0,5,"7024 S ANTHONY AVE, CHICAGO",BLACK,F,1988-11-25,7024 S ANTHONY AVE,CHICAGO,24
24993,42724,19994,2012-10-29,2016-08-06,0,0,1,"1274 W SURF ST, CHICAGO",WHITE,F,1988-04-26,1274 W SURF ST,CHICAGO,24
24998,21029,19999,2013-08-17,2014-10-19,0,0,2,"3032 E 136TH ST, CHICAGO",BLACK,M,1989-02-13,3032 E 136TH ST,CHICAGO,24


In [97]:
dataset = data_1[(data_1.age_of_arrest <=24) & (18 <= data_1.age_of_arrest)]

In [98]:
dataset.shape

(5616, 30)

- econ ml
- dowhy
double machine learning
anovas
- sin asumir normalidad man whitney
- wincolson
- t de student
- pinguin stats


3.

a. The provided case.csv file includes a variable that indicates the number of arrests
prior to that case for each individual. Please reconstruct the variable using the
prior_arrests.csv file. Assume that all of the individual’s arrests prior to the study
period are contained in prior_arrest.csv. If someone is not included in
prior_arrests.csv, assume they had zero arrests at the start of the study period. Also
note that some individuals were arrested multiple times during the study period and
that this should be accounted for in your prior arrest count. For example, if individual
A was arrested 5 times prior to the study period and appears twice in the case file,
their first arrest in the case file should have a prior arrest count of “5” and their
second arrest should have a prior arrest count of “6”. One final note, some people
really do get arrested multiple times on the same day. Count each arrest separately,
regardless of whether another arrest occurred on the same day.


b. The case file also includes a variable re_arrest which indicates whether individuals
were arrested during their case period (i.e. after the case’s arrest date and before the
case’s disposition date). Please reconstruct this indicator. Assume that all arrests
during the study period are reflected in the case file.


c. Please show that the variables you reconstructed are equal to the versions in the
provided datasets.

In [25]:
prior = pd.read_csv('/content/drive/MyDrive/upen/prior_arrests.csv')

In [26]:
df_case

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO"
1,39970,1,2012-07-11,2012-10-20,1,0,3,"4866 S CORNELL AVE, CHICAGO"
2,88413,1,2013-04-04,2013-06-22,0,0,4,"2543 N WILLETTS CT, CHICAGO"
3,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO"
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO"
...,...,...,...,...,...,...,...,...
25995,97535,12413,2012-01-13,2013-07-19,0,0,1,"3830 N ALTA VISTA TER, OAK LAWN"
25996,2943,2599,2012-11-23,2015-06-03,0,0,5,"5328 N OAKVIEW AVE, CICERO"
25997,29001,20675,2013-02-04,2013-11-05,0,0,5,"4546 W 57TH ST, Oak Lawn"
25998,23486,23937,2012-01-22,2014-04-23,1,0,2,"398 W 29TH ST, Cicero"


In [27]:
df_case[df_case.person_id == 8]

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address
6,2913,8,2012-10-06,2013-12-29,1,1,4,"15 W QUINCY CT, CHICAGO"
7,6304,8,2013-04-06,2013-07-07,0,0,5,"5742 N NICOLET AVE, CHICAGO"


In [28]:
prior[prior.person_id == 1]


Unnamed: 0,person_id,arrest_date
0,1,2008-06-14
1,1,2010-04-09


In [117]:
df_case[df_case.person_id.duplicated('first') == False]

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO"
3,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO"
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO"
5,26516,7,2012-02-25,2012-03-26,0,0,0,"7162 W PALATINE AVE, Chicago"
6,2913,8,2012-10-06,2013-12-29,1,1,4,"15 W QUINCY CT, CHICAGO"
...,...,...,...,...,...,...,...,...
25995,97535,12413,2012-01-13,2013-07-19,0,0,1,"3830 N ALTA VISTA TER, OAK LAWN"
25996,2943,2599,2012-11-23,2015-06-03,0,0,5,"5328 N OAKVIEW AVE, CICERO"
25997,29001,20675,2013-02-04,2013-11-05,0,0,5,"4546 W 57TH ST, Oak Lawn"
25998,23486,23937,2012-01-22,2014-04-23,1,0,2,"398 W 29TH ST, Cicero"


In [118]:
prior.groupby('person_id').arrest_date.count().to_frame().reset_index().rename(columns = {"arrest_date": "prior_arrests2"})

Unnamed: 0,person_id,prior_arrests2
0,1,2
1,5,2
2,6,3
3,8,4
4,9,1
...,...,...
13555,19995,1
13556,19996,5
13557,19997,2
13558,19999,2


In [139]:
prior

Unnamed: 0,person_id,arrest_date
0,1,2008-06-14
1,1,2010-04-09
2,5,2009-06-07
3,5,2010-05-20
4,6,2008-03-16
...,...,...
43597,19997,2008-11-21
43598,19999,2009-07-26
43599,19999,2010-06-28
43600,20000,2008-06-01


In [141]:
df_case.person_id.value_counts()

6957     7
8038     7
19846    6
15703    6
6253     6
        ..
12612    1
4471     1
4482     1
12608    1
21906    1
Name: person_id, Length: 15353, dtype: int64

In [135]:
prior.groupby('person_id').arrest_date.count()

person_id
1        2
5        2
6        3
8        4
9        1
        ..
19995    1
19996    5
19997    2
19999    2
20000    2
Name: arrest_date, Length: 13560, dtype: int64

In [133]:
df_case

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests,address
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO"
1,39970,1,2012-07-11,2012-10-20,1,0,3,"4866 S CORNELL AVE, CHICAGO"
2,88413,1,2013-04-04,2013-06-22,0,0,4,"2543 N WILLETTS CT, CHICAGO"
3,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO"
4,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO"
...,...,...,...,...,...,...,...,...
25995,97535,12413,2012-01-13,2013-07-19,0,0,1,"3830 N ALTA VISTA TER, OAK LAWN"
25996,2943,2599,2012-11-23,2015-06-03,0,0,5,"5328 N OAKVIEW AVE, CICERO"
25997,29001,20675,2013-02-04,2013-11-05,0,0,5,"4546 W 57TH ST, Oak Lawn"
25998,23486,23937,2012-01-22,2014-04-23,1,0,2,"398 W 29TH ST, Cicero"


In [127]:
prior.groupby('person_id').arrest_date.count()

Unnamed: 0,person_id,arrest_date
0,1,2008-06-14
1,1,2010-04-09
2,5,2009-06-07
3,5,2010-05-20
4,6,2008-03-16
...,...,...
43597,19997,2008-11-21
43598,19999,2009-07-26
43599,19999,2010-06-28
43600,20000,2008-06-01


In [126]:
df_case.person_id.drop_duplicates().to_frame().merge(prior.groupby('person_id').arrest_date.count().to_frame().reset_index().rename(columns = {"arrest_date": "prior_arrests2"}), on = 'person_id', how = 'left').fillna

Unnamed: 0,person_id,prior_arrests2
0,1,2.0
1,5,2.0
2,6,3.0
3,7,
4,8,4.0
...,...,...
15348,12413,
15349,2599,
15350,20675,
15351,23937,


In [105]:
_iddf_case[df_case.person_id.duplicated('first') == False].merge(prior.groupby('person_id').arrest_date.count().to_frame().reset_index().rename(columns = {"arrest_date": "prior_arrests"}), on = "person_id",how = "left", suffixes=('_original', '_reconstructed'))

Unnamed: 0,caseid,person_id,arrest_date,dispos_date,treat,re_arrest,prior_arrests_original,address,prior_arrests_reconstructed
0,57514,1,2012-01-04,2012-03-27,0,0,2,"1698 W 25TH PL, CHICAGO",2.0
1,40216,5,2012-03-31,2013-03-25,0,0,2,"4578 W MORSE AVE, CHICAGO",2.0
2,92255,6,2012-12-09,2013-11-09,0,0,3,"5111 S SANGAMON ST, CHICAGO",3.0
3,26516,7,2012-02-25,2012-03-26,0,0,0,"7162 W PALATINE AVE, Chicago",
4,2913,8,2012-10-06,2013-12-29,1,1,4,"15 W QUINCY CT, CHICAGO",4.0
...,...,...,...,...,...,...,...,...,...
15348,97535,12413,2012-01-13,2013-07-19,0,0,1,"3830 N ALTA VISTA TER, OAK LAWN",
15349,2943,2599,2012-11-23,2015-06-03,0,0,5,"5328 N OAKVIEW AVE, CICERO",
15350,29001,20675,2013-02-04,2013-11-05,0,0,5,"4546 W 57TH ST, Oak Lawn",
15351,23486,23937,2012-01-22,2014-04-23,1,0,2,"398 W 29TH ST, Cicero",
