## Crime and Education using Machine Learning 

In [1]:
import os
import pandas as pd
import requests
import seaborn as sns

In [2]:
def get_data_chicago(id):
    '''
    Connect to the chicago data portal API and returns a dataframe
    '''
    
    url = f'https://data.cityofchicago.org/api/views/{id}/rows.csv?accessType=DOWNLOAD'
    df = pd.read_csv(url)
    
    return df

In [3]:
# School Performance 2012
school_performance_2012 = get_data_chicago('9xs2-f89t')

In [4]:
# Change the key name in order to merge columns later
school_performance_2012=school_performance_2012.rename(columns = {'School ID':'School_ID'}, inplace = False)

In [5]:
# Crime 2016

Crime_2016 = get_data_chicago('kf95-mnd6')

In [6]:
# School Performance 2015 - 2016 (it uses the last year as a tag).
school_performance_2016 = get_data_chicago('fvrx-esxp')

In [7]:
#Crime 2018

Crime_2018 = get_data_chicago('3i3m-jwuy') 

In [8]:
# School Performance in 2017 - 2018. (it uses the last year as a tag)

school_performance_2018 = get_data_chicago('wkiz-8iya')


In [9]:
#check if  Community_Area column exists in school_performance_2016 and school_performance_2018
Community_Area_2016 = "Community Area Number" in school_performance_2016
Community_Area_2018 = "Community Area Number" in school_performance_2018

In [10]:
Community_Area_2016

False

In [11]:
Community_Area_2018

False

In [12]:
school_performance_2018.columns

Index(['School_ID', 'Short_Name', 'Long_Name', 'School_Type',
       'Primary_Category', 'Phone', 'Fax', 'CPS_School_Profile', 'Website',
       'Progress_Report_Year',
       ...
       'Progress_Toward_Graduation_Year_2', 'State_School_Report_Card_URL',
       'Mobility_Rate_Pct', 'Chronic_Truancy_Pct',
       'Empty_Progress_Report_Message', 'School_Survey_Rating_Description',
       'Supportive_School_Award', 'Supportive_School_Award_Desc',
       'Parent_Survey_Results_Year', 'Location'],
      dtype='object', length=163)

In [13]:
#Since Community Area Number does not exist in school_performance_2016 neither 2018, we merge that column 
#using School_ID as key

In [14]:
school_performance_2016=school_performance_2016.merge(school_performance_2012[['School_ID', 'Community Area Name','Community Area Number']],
                                                      on='School_ID')

In [15]:
school_performance_2018=school_performance_2018.merge(school_performance_2012[['School_ID', 'Community Area Name','Community Area Number']],
                 on='School_ID')

In [16]:
Crime_2016['Primary Type'].unique()

array(['CRIMINAL SEXUAL ASSAULT', 'DECEPTIVE PRACTICE',
       'OFFENSE INVOLVING CHILDREN', 'HOMICIDE', 'BURGLARY', 'BATTERY',
       'SEX OFFENSE', 'ASSAULT', 'NARCOTICS', 'THEFT', 'CRIMINAL DAMAGE',
       'ROBBERY', 'WEAPONS VIOLATION', 'OTHER OFFENSE',
       'CRIMINAL TRESPASS', 'PUBLIC PEACE VIOLATION',
       'MOTOR VEHICLE THEFT', 'KIDNAPPING',
       'INTERFERENCE WITH PUBLIC OFFICER', 'CRIM SEXUAL ASSAULT',
       'NON-CRIMINAL', 'ARSON', 'CONCEALED CARRY LICENSE VIOLATION',
       'STALKING', 'LIQUOR LAW VIOLATION', 'PROSTITUTION', 'INTIMIDATION',
       'NON-CRIMINAL (SUBJECT SPECIFIED)', 'OBSCENITY', 'GAMBLING',
       'NON - CRIMINAL', 'OTHER NARCOTIC VIOLATION', 'PUBLIC INDECENCY',
       'HUMAN TRAFFICKING'], dtype=object)

In [17]:
# recode according to type of crime: 1: Violent, 0: No Violent, 99: undefined

def type_to_binary(x):
        if x=='BATTERY': return 1
        if x=='MOTOR VEHICLE THEFT': return 1
        if x=='ROBBERY': return 1
        if x=='THEFT': return 1
        if x=='ASSAULT': return 1
        if x=='CRIMINAL SEXUAL ASSAULT': return 1
        if x=='INTIMIDATION': return 1
        if x=='WEAPONS VIOLATION': return 1
        if x=='HOMICIDE': return 1
        if x=='PUBLIC PEACE VIOLATION': return 1
        if x=='KIDNAPPING': return 1
        if x=='SEX OFFENSE': return 1
        if x=='CRIM SEXUAL ASSAULT': return 1
        if x=='CRIMINAL SEXUAL ASSAULT': return 1
        if x=='DECEPTIVE PRACTICE': return 0
        if x=='NARCOTICS': return 0
        if x=='INTERFERENCE WITH PUBLIC OFFICER': return 0
        if x=='PROSTITUTION': return 0
        if x=='GAMBING': return 0
        if x=='OBSCENITY': return 0
        if x=='RITUALISM': return 0
        if x=='CONCEALED CARRY LICENSE VIOLATION': return 0
        if x=='LIQUOR LAW VIOLATION': return 0
        if x=='HUMAN TRAFFICKING': return 0
        if x=='PUBLIC INDECENCY': return 0
        if x=='OTHER NARCOTIC VIOLATION': return 0
        if x=='OTHER OFFENSE': return 99
        if x=='CRIMINAL TRESPASS': return 99
        if x=='CRIMINAL DAMAGE': return 99
        if x=='BURGLARY': return 99
        if x=='OFFENSE INVOLVING CHILDREN': return 99
        if x=='ARSON': return 99
        if x=='STALKING': return 99
        if x=='NON-CRIMINAL': return 99
        if x=='CRIMINAL TRESPASS': return 99
        if x=='NON-CRIMINAL (SUBJECT SPECIFIED)': return 99


# Applying the function
Crime_2016['binary'] = Crime_2016['Primary Type'].apply(type_to_binary)
Crime_2018['binary'] = Crime_2018['Primary Type'].apply(type_to_binary)

In [18]:
Crime_2016['binary'].value_counts()

1.0     162681
99.0     72013
0.0      34588
Name: binary, dtype: int64

In [19]:
Crime_2018['binary'].value_counts()

1.0     165726
99.0     66647
0.0      35675
Name: binary, dtype: int64

In [20]:
Crime_2016 = Crime_2016[Crime_2016.binary != 99]

In [21]:
Crime_2018 = Crime_2018[Crime_2018.binary != 99]

In [22]:
Crime_2016.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'binary'],
      dtype='object')

In [23]:
#Subset for the column of interest
Crime_2016 =Crime_2016[['Community Area','binary']]
# Rename the Community Area in order to have homogeneous names
Crime_2016=Crime_2016.rename(columns = {'Community Area':'Community Area Number'}, inplace = False)

In [24]:
#Subset for the column of interest
Crime_2018 =Crime_2018[['Community Area','binary']]
# Rename the Community Area in order to have homogeneous names
Crime_2018=Crime_2018.rename(columns = {'Community Area':'Community Area Number'}, inplace = False)

In [40]:
test2 = "nwea_reading_attainment_grade_8_pct" in school_performance_2016

In [41]:
test2 

False

In [44]:
for col in school_performance_2016.columns: 
    print(col)

School_ID
Short_Name
Long_Name
School_Type
Primary_Category
Address
City
State
Zip
Phone
Fax
CPS_School_Profile
Website
Progress_Report_Year
Blue_Ribbon_Award_Year
Excelerate_Award_Gold_Year
Spot_Light_Award_Year
Improvement_Award_Year
Excellence_Award_Year
Student_Growth_Rating
Student_Growth_Description
Growth_Reading_Grades_Tested_Pct_ES
Growth_Reading_Grades_Tested_Label_ES
Growth_Math_Grades_Tested_Pct_ES
Growth_Math_Grades_Tested_Label_ES
Student_Attainment_Rating
Student_Attainment_Description
Attainment_Reading_Pct_ES
Attainment_Reading_Lbl_ES
Attainment_Math_Pct_ES
Attainment_Math_Lbl_ES
Culture_Climate_Rating
Culture_Climate_Description
School_Survey_Student_Response_Rate_Pct
School_Survey_Student_Response_Rate_Avg_Pct
School_Survey_Teacher_Response_Rate_Pct
School_Survey_Teacher_Response_Rate_Avg_Pct
Healthy_School_Certification
Healthy_School_Certification_Description
Creative_School_Certification
Creative_School_Certification_Description
NWEA_Reading_Growth_Grade_3_Pct
NWE

In [47]:
#Subset for the column of interest-2016
school_performance_2016=school_performance_2016[['NWEA_Math_Attainment_Grade_8_Pct',
                                      'NWEA_Reading_Attainment_Grade_8_Pct', 'Student_Attendance_Avg_Pct',
                                                 'Community Area Number']]

In [48]:
#Subset for the column of interest-2018
school_performance_2018=school_performance_2018[['NWEA_Math_Attainment_Grade_8_Pct',
                                      'NWEA_Reading_Attainment_Grade_8_Pct', 'Student_Attendance_Avg_Pct',
                                                 'Community Area Number']]

In [49]:
community_performance_2016=school_performance_2016.groupby('Community Area Number').mean().reset_index()

In [51]:
community_performance_2018=school_performance_2018.groupby('Community Area Number').mean().reset_index()

In [52]:
train=pd.merge(Crime_2016,community_performance_2016,on='Community Area Number')

In [55]:
train

Unnamed: 0,Community Area Number,binary,NWEA_Math_Attainment_Grade_8_Pct,NWEA_Reading_Attainment_Grade_8_Pct,Student_Attendance_Avg_Pct,Teacher_Attendance_Avg_Pct
0,66,1.0,38.142857,39.285714,95.1,95.0
1,66,0.0,38.142857,39.285714,95.1,95.0
2,66,1.0,38.142857,39.285714,95.1,95.0
3,66,1.0,38.142857,39.285714,95.1,95.0
4,66,1.0,38.142857,39.285714,95.1,95.0
...,...,...,...,...,...,...
197458,34,1.0,97.500000,90.500000,93.0,95.0
197459,34,0.0,97.500000,90.500000,93.0,95.0
197460,34,0.0,97.500000,90.500000,93.0,95.0
197461,34,1.0,97.500000,90.500000,93.0,95.0


In [56]:
test=pd.merge(Crime_2018,community_performance_2018,on='Community Area Number')

In [57]:
test

Unnamed: 0,Community Area Number,binary,NWEA_Math_Attainment_Grade_8_Pct,NWEA_Reading_Attainment_Grade_8_Pct,Student_Attendance_Avg_Pct,Teacher_Attendance_Avg_Pct
0,29,1.0,41.0,48.9,92.8,90.1
1,29,0.0,41.0,48.9,92.8,90.1
2,29,1.0,41.0,48.9,92.8,90.1
3,29,1.0,41.0,48.9,92.8,90.1
4,29,1.0,41.0,48.9,92.8,90.1
...,...,...,...,...,...,...
201597,55,0.0,88.5,91.5,92.8,90.1
201598,55,1.0,88.5,91.5,92.8,90.1
201599,55,0.0,88.5,91.5,92.8,90.1
201600,55,1.0,88.5,91.5,92.8,90.1


In [60]:
X_train=train.drop(columns=['Community Area Number','binary'])

In [61]:
Y_train=train['binary']

In [62]:
X_test=test.drop(columns=['Community Area Number','binary'])

In [63]:
Y_test=test['binary']