In [6]:
## imports 
import pandas as pd
import numpy as np
import yaml
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# comment these out if you don't have plotnine--not essential here/only used once
import matplotlib.pyplot as plt
# import plotnine
# from plotnine import *

## way to connect to mysql 
## if you need to install
## uncomment this line:
#! pip install mysql-connector-python
import mysql.connector

## function to feed path name to load
## credentials
def load_creds(path: str):
    """Read a YAML credentials file and return its parsed contents.

    Parameters
    ----------
    path : str
        Path to the YAML file to open.

    Returns
    -------
    Whatever ``yaml.safe_load`` produces for the file.

    Raises
    ------
    yaml.YAMLError
        If the file cannot be parsed as YAML. The error is printed and then
        re-raised.
    """
    with open(path, 'r') as stream:
        try:
            creds = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            ## re-raise after printing: without this, `creds` is never bound
            ## and the return below fails with an unrelated UnboundLocalError
            print(exc)
            raise
    return creds

## notebook display settings
from IPython.core.interactiveshell import InteractiveShell

## let IPython display the value of all expression statements in a cell
InteractiveShell.ast_node_interactivity = "all"
## raise the pandas row-display limit to 999
pd.options.display.max_rows = 999

# Preliminary: define connection and read sample of data

In [7]:
## read in creds; change the path name if stored
## elsewhere
creds = load_creds("09_db_cred.yaml")

In [8]:
## show the loaded settings with the password masked, so the secret
## never ends up in the saved cell output
{name: {key: ('********' if key == 'password' else value)
        for key, value in settings.items()}
 for name, settings in creds.items()}

{'practice_database': {'user': 'qss20',
  'password': '********',
  'port': 3306,
  'database': 'sentencing',
  'host': 'qss20-7570-db.c.dartmouth.edu'}}

In [12]:
## open the MySQL connection using the 'practice_database' entry of the
## loaded credentials
db_cfg = creds['practice_database']
cnx = mysql.connector.connect(
    user=db_cfg['user'],
    password=db_cfg['password'],
    port=db_cfg['port'],
    database=db_cfg['database'],
    host=db_cfg['host'],
)
cnx

<mysql.connector.connection.MySQLConnection at 0x15b077440>

# Activity 1

1. Create a new column -- `in_chicago` when pulling from the `caseinit` table that takes on the value of "YES" if INCIDENT_CITY = Chicago; "NO" otherwise (which represents incidents in Cook County suburbs outside the city limits);  and pull the table. Use `crosstabs` to confirm that this worked
2. Repeat step 1 but also filter out blank strings (`INCIDENT_CITY` == "")
3. Use `where` to row filter to initiations in Chicago and use group by to find the count of cases diverted and not diverted (`is_in_diversion`); pull the table with those counts
4. Modify the query in step 3 to find the proportion of cases in Chicago diverted (hint: you may need to use `CASE WHEN` in a subquery)
5. Modify the query in step 4 to find the proportion of cases in Chicago versus cases not in Chicago sent to diversion


In [32]:
# your code here 1
## step 1: pull every caseinit column plus a YES/NO flag that is YES only
## when INCIDENT_CITY is Chicago
chicago_q = """
SELECT *,
       CASE
         WHEN INCIDENT_CITY = "Chicago" THEN 'YES'
         ELSE 'NO'
       END AS in_chicago_1
FROM   caseinit
"""

chicago_d = pd.read_sql_query(sql=chicago_q, con=cnx)

In [33]:
## look at the new flag column
chicago_d['in_chicago_1']

0          NO
1          NO
2          NO
3          NO
4          NO
         ... 
272289    YES
272290     NO
272291    YES
272292    YES
272293     NO
Name: in_chicago_1, Length: 272294, dtype: object

In [36]:
## cross-tabulate city (rows) against the flag (columns) to check the recode
pd.crosstab(index=chicago_d['INCIDENT_CITY'],
            columns=chicago_d['in_chicago_1'])

in_chicago_1,NO,YES
INCIDENT_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1
,20557,0
Addison,2,0
Albers,1,0
Algonquin,3,0
Alsip,568,0
Antioch,2,0
Arlington Heights,1291,0
Aurora,3,0
Barrington,136,0
Barrington Hills,16,0


In [28]:
# your code here 2
## step 2: same flag as step 1, but drop rows whose city is a blank string.
## String literals use single quotes, which MySQL treats as strings in every
## sql_mode (double quotes become identifiers under ANSI_QUOTES).
filter_blank_q = """
SELECT INCIDENT_CITY,
       CASE
         WHEN INCIDENT_CITY = 'Chicago' THEN 'YES'
         ELSE 'NO'
       END AS in_chicago_1
FROM   caseinit
WHERE  INCIDENT_CITY <> ''
"""

## run the query and cross-tabulate city against the flag, so the result can
## be checked for a blank-city row
filter_blank_d = pd.read_sql_query(filter_blank_q, cnx)
pd.crosstab(index=filter_blank_d['INCIDENT_CITY'],
            columns=filter_blank_d['in_chicago_1'])

In [37]:
# your code here 3
## NOTE(review): abandoned draft of step 3, left commented out. As written it
## would not run (there is a comma directly before FROM), and joining caseinit
## to a copy of itself on INCIDENT_CITY would pair each case with every case
## in the same city instead of simply filtering to Chicago.
# filter_chicago_q = """
# SELECT *,
# FROM caseinit
#     INNER JOIN (SELECT INCIDENT_CITY,
#                 CASE
#                 WHEN INCIDENT_CITY = "Chicago" THEN 'YES'
#                 ELSE "NO"
#                 END AS in_chicago_1
#                 FROM caseinit) AS tmp
#             ON tmp.INCIDENT_CITY = caseinit.INCIDENT_CITY
# WHERE in_chicago_1 = "YES"
# """

In [51]:
# your code here 3
## step 3: keep only Chicago initiations and count cases by diversion status.
## The query is stored under the same name that read_sql_query executes, so
## this cell runs on a fresh kernel without relying on a later cell.
filter_chicago_q = """
SELECT is_in_diversion,
       COUNT(*) AS count
FROM   caseinit
WHERE  INCIDENT_CITY = 'Chicago'
GROUP BY is_in_diversion
"""

filter_chicago_d = pd.read_sql_query(filter_chicago_q, cnx)

In [52]:
filter_chicago_d

Unnamed: 0,is_in_diversion,count
0,False,167171
1,True,6402


In [None]:
# your code here 4
## step 4: proportion of Chicago cases sent to diversion. The subquery turns
## is_in_diversion into a 0/1 indicator for Chicago rows only; the outer
## AVG of that indicator is the proportion. MySQL requires the derived table
## to have an alias.
## NOTE(review): assumes is_in_diversion is stored as the text 'True'/'False'
## -- confirm against the table schema. If so, prop should match the step-3
## counts: 6402 / (167171 + 6402), roughly 0.037.
prop_chicago_q = """
SELECT AVG(is_div_bool) AS prop
FROM (
    SELECT CASE
               WHEN is_in_diversion = 'True' THEN 1
               ELSE 0
           END AS is_div_bool
    FROM   caseinit
    WHERE  INCIDENT_CITY = 'Chicago'
) AS chicago_cases
"""

prop_chicago_d = pd.read_sql_query(prop_chicago_q, cnx)
prop_chicago_d

In [None]:
# your code here 5
## step 5: proportion of cases sent to diversion, Chicago vs. not Chicago.
## The subquery builds a 0/1 diversion indicator and the YES/NO city flag,
## dropping blank cities; the outer query averages the indicator per flag.
## NOTE(review): assumes is_in_diversion is stored as the text 'True'/'False'
## -- confirm against the table schema.
prop_by_city_q = """
SELECT AVG(is_div_bool) AS prop,
       in_chicago_1
FROM (
    SELECT CASE
               WHEN is_in_diversion = 'True' THEN 1
               ELSE 0
           END AS is_div_bool,
           CASE
               WHEN INCIDENT_CITY = 'Chicago' THEN 'YES'
               ELSE 'NO'
           END AS in_chicago_1
    FROM   caseinit
    WHERE  INCIDENT_CITY <> ''
) AS d
GROUP BY in_chicago_1
"""

prop_by_city_d = pd.read_sql_query(prop_by_city_q, cnx)
prop_by_city_d

In [None]:
## same comparison as step 5, written with a common table expression.
## The SQL lives in a Python string so the cell is valid Python; WITH needs
## MySQL 8.0 or later.
## NOTE(review): the table is referenced as plain `caseinit` (the connected
## database) and is_in_diversion is compared to the text 'True', to match the
## other queries in this notebook -- confirm both against the server.
prop_by_city_cte_q = """
WITH s1 AS (
    SELECT
        *,
        CASE
            WHEN is_in_diversion = 'True' THEN 1
            ELSE 0
        END AS is_div_bool,
        CASE
            WHEN INCIDENT_CITY = 'Chicago' THEN 'Yes'
            ELSE 'No'
        END AS in_chicago_1
    FROM caseinit)
SELECT
    AVG(is_div_bool) AS prop_div,
    in_chicago_1
FROM s1
WHERE INCIDENT_CITY <> ''
GROUP BY in_chicago_1
"""

prop_by_city_cte_d = pd.read_sql_query(prop_by_city_cte_q, cnx)
prop_by_city_cte_d

# Activity 2 

1. Use the following crosswalk and the `CASE` variable in the `divert` table to create a new variable `DIVERSION_PROGRAM_TEXT` that spells out the diversion programs
    - DC: Drug Court

    - DDPP: Drug Deferred Prosecution

    - DS: Drug School

    - RJCC: Restorative Justice

    - MHC: Mental Health Court

    - VC: Veteran Court

2. Build on the query from step 1 to filter to Narcotics as the `UPDATED_OFFENSE_CATEGORY` and Black or White defendants (based on race in the diversions table). Hint: you'll need to join with the caseinit table based on case_id and case_participant_id; an inner join keeps only those diverted. Select the case_id, case_participant_id, case, race, and diversion_program_text columns

In [59]:
# your code here 1
## step 1: pull the divert table and add DIVERSION_PROGRAM_TEXT, which spells
## out each DIVERSION_PROGRAM code using the crosswalk listed for this
## activity. Codes that are not in the crosswalk (e.g. BR9) are passed
## through unchanged so they stay visible.
divert_q = """
SELECT *,
       CASE DIVERSION_PROGRAM
         WHEN 'DC'   THEN 'Drug Court'
         WHEN 'DDPP' THEN 'Drug Deferred Prosecution'
         WHEN 'DS'   THEN 'Drug School'
         WHEN 'RJCC' THEN 'Restorative Justice'
         WHEN 'MHC'  THEN 'Mental Health Court'
         WHEN 'VC'   THEN 'Veteran Court'
         ELSE DIVERSION_PROGRAM
       END AS DIVERSION_PROGRAM_TEXT
FROM   divert
"""

divert_d = pd.read_sql_query(divert_q, cnx)

In [61]:
## look at the raw program codes
divert_d['DIVERSION_PROGRAM']

0          DS
1          VC
2         MHC
3         MHC
4         MHC
         ... 
25490    DDPP
25491    DDPP
25492    DDPP
25493    DDPP
25494     BR9
Name: DIVERSION_PROGRAM, Length: 25495, dtype: object

In [None]:
# your code here 2