In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Removes columns that are entirely null, so you don't have to check each column by hand

In [2]:
def null_dropper(df):
    """Return ``df`` without the columns that contain no non-null values.

    The previous implementation scraped the text printed by ``df.info()`` and
    looked for lines starting with ``0 non-null``.  That text layout is not a
    stable interface (later pandas releases add an index column and a header
    row), so the check silently stopped matching.  Asking pandas directly is
    both shorter and independent of the report format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns with at least one non-null value.
    """
    # axis=1 -> operate on columns; how="all" -> drop only when every value is null
    return df.dropna(axis=1, how="all")

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    """Build a one-row-per-column metadata table for ``df``.

    The previous implementation parsed the text printed by ``df.info()`` and
    unpacked the ``.str`` accessor as a tuple; both depend on behaviour that
    later pandas releases changed or removed.  The same facts are now read
    straight from the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The data set to describe.
    file : str
        Name of the source file; copied into every row as ``Original_Dataset``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Original_Dataset``, ``Column_Name``,
        ``Non_Null_Count`` (integer count of non-null values),
        ``Unique_Count`` (distinct values, a missing value counts as one) and
        ``Object_Type`` (the column dtype as text).  Rows follow the column
        order of ``df``.
    """
    metadata_df = pd.DataFrame({
        "Column_Name": list(df.columns),
        "Non_Null_Count": df.count().values,
        # dropna=False mirrors len(x.unique()), which counts NaN as a value
        "Unique_Count": df.nunique(dropna=False).values,
        "Object_Type": [str(dtype) for dtype in df.dtypes],
    })
    metadata_df["Original_Dataset"] = file
    return metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]

### Converts a single column named City_State_Zip into separate City, State and Zip columns

In [4]:
def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit character."""
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    """Split ``df["City_State_Zip"]`` into ``City``, ``State`` and ``Zip`` columns.

    A value containing a digit is treated as ending in "<state> <zip>"; a
    value without digits is treated as ending in "<state>" only.  Everything
    before those trailing tokens is the city, so multi-word city names stay
    together.

    Compared with the earlier version this always returns exactly three
    columns.  Previously the frame was built from ragged rows and then given
    three names, which raised when no row had a zip code (or the input was
    empty), and a missing value crashed ``hasNumbers``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``City_State_Zip`` column of strings (missing values
        are allowed).

    Returns
    -------
    pandas.DataFrame
        Columns ``City``, ``State``, ``Zip`` with a fresh 0..n-1 index, one
        row per input row.  Fields that are not present are ``None``.
    """
    columns = ["City","State","Zip"]
    new_states_list=[]
    for value in df["City_State_Zip"]:
        if pd.isnull(value):
            # nothing to split; keep the row so the output stays aligned with the input
            new_states_list.append([None] * len(columns))
            continue
        ## a digit means a zipcode is present: peel off two trailing tokens, otherwise one
        trailing_tokens = 2 if hasNumbers(value) else 1
        # rsplit keeps the whole leading text (the city) as a single piece
        split_state = value.rsplit(" ", trailing_tokens)
        ## pad short rows so every row has City, State and Zip
        split_state = split_state + [None] * (len(columns) - len(split_state))
        new_states_list.append(split_state)
    return pd.DataFrame(new_states_list, columns=columns)

### Establishes General Path

In [5]:
# Root folder of the local "import" working tree.  The default is the original
# author's checkout; set the CHICAGO_POLICE_DATA_IMPORT environment variable to
# run the notebook against a checkout that lives somewhere else.
path = os.environ.get(
    "CHICAGO_POLICE_DATA_IMPORT",
    "/Users/thudson/Documents/Github/chicago-police-data/import",
)

In [6]:
# Input and output folders for this release, both rooted at `path`.
in_path, out_path = (
    path + '/input/shootings-ipra-may2016_copy_20170112/',
    path + '/output/shootings-ipra-may2016_copy_20170112/',
)

In [7]:
# Load the location-code dictionary, discard rows that are entirely empty and
# store the code as text without a decimal part (int first, then str).
location_code = (
    pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

In [8]:
# Preview the first rows of the cleaned location-code dictionary.
location_code.head()

Unnamed: 0,Location_Code,Location_Value
0,1,Food Sales/Restaurant
1,2,Tavern/Liquor Store
2,3,Other Business Establishment
3,4,Police Building
4,5,Lockup Facility


## May 2016 Data

### Report 1

In [9]:
# Folder aliases used by every cell of the May 2016 report section.
in_path_may_2016_report = in_path
out_path_may_2016_report = out_path

# os.listdir() returns names in arbitrary order and later cells read files[0],
# so sort the matches to make the chosen workbook deterministic.
files = sorted(x for x in os.listdir(in_path_may_2016_report) if '.xls' in x)
files

['IPRA Shootings Data.xls']

In [10]:
# Read the sheet names and release the workbook handle as soon as we have them.
with pd.ExcelFile(in_path_may_2016_report + files[0]) as workbook:
    sheets = workbook.sheet_names

In [11]:
# Display the sheet names found in the workbook.
sheets

['qry_II_217_Data163_2016',
 'qry_II_217_Data163_2015',
 'qry_II_217_Data163_2014',
 'qry_II_217_Data163_2013',
 'qry_II_217_2012_Incid',
 'qry_II_217_2012_Parties',
 'qry_II_217_2011_Incid',
 'qry_II_217_2011_Parties',
 'qry_II_217_2010_Incid',
 'qry_II_217_2010_Parties',
 'qry_II_217_2009_Incid',
 'qry_II_217_2009_Parties',
 'qry_II_217_200807_Incid',
 'qry_II_217_200807_Parties',
 'Sheet3']

### Data Sheets

In [12]:
file = files[0]

## Column names applied, in sheet order, to every 'Data163' sheet
raw_columns = ['CR_Required','Initial_Category_Code','Initial_Category','Current_Category_Code',
               'Current_Category','Current_Status','Complaint_Number','Beat','Street','Address',
               'Location_Value','Incident_Time_Start','Incident_Time_End','Complaint_Date',
               'IPRA_Closed_Date','IPRA_Investigator_Last_Name','IPRA_Investigator_First_Name',
               'IPRA_Assign_Date','IPRA_Investigate_Begin_Date','IPRA_Investigate_End_Date',
               'IPRA_Investigator_Type','Report_Status','Finding_Code','Finding_Id','Penalty_Code',
               'No_Days','Penalty_Status','Accused_First_Name','Accused_Last_Name','Accused_Star',
               'Accused_Assignment','Accused_Detail','Accused_Appointment_Date','Accused_Position',
               'IAD_Ops','Converted_Rec_I','Involved_Party_Description','Involved_Party_Type',
               'Involved_Party_Last_Name','Involved_Party_First_Name','Involved_Party_Assignment',
               'Involved_Party_Detail','Involved_Party_Position','Involved_Party_Age',
               'Involved_Party_Sex','Involved_Party_Race','Police_Shooting_No','Police_Shooting']

## Output layout: 'Street' and 'Address' are replaced by a single 'Incident_Address'
## column that sits where 'Street' used to be
output_columns = ['Incident_Address' if column == 'Street' else column
                  for column in raw_columns if column != 'Address']

## Adding File Metadata
## Report_Produced_Date is not assigned in any cell visible before this one, so
## this normally falls back to ''.  Exception (rather than a bare except) keeps
## the fallback but lets KeyboardInterrupt through.
try:
    report_produced_date = Report_Produced_Date[0].date()
except Exception:
    report_produced_date = ''

sheet_frames = []
sheet_metadata = []
data_sheets = [sheet for sheet in sheets if '163' in sheet]
## Open the workbook once; the sheet name is passed positionally so the call does
## not depend on the spelling of the keyword (sheetname vs sheet_name)
with pd.ExcelFile(in_path_may_2016_report + file) as workbook:
    for sheet in data_sheets:
        df = workbook.parse(sheet)
        df.columns = raw_columns

        ## Both parts are cast to text, so a missing part shows up as the text 'nan'
        ## NOTE(review): blank rows therefore get 'nan nan' as address - confirm this is wanted
        df["Incident_Address"] = df["Address"].astype(str) + ' ' + df["Street"].astype(str)

        ## .copy() so the assignments below write to an independent frame, not a slice
        df = df[output_columns].copy()

        df["Sheet"] = sheet
        df["FOIA_Request_Number"] = ''
        df["Report_Produced_Date"] = report_produced_date

        sheet_frames.append(df)
        sheet_metadata.append(metadata_dataset(df,file))

## One concat at the end instead of growing the frames inside the loop
final_df = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()
metadata_df = pd.concat(sheet_metadata, ignore_index=True) if sheet_metadata else pd.DataFrame()

In [13]:
saving_files = "IPRA_Shootings_Data_qry_II_217_Data163"

# Common prefix (output folder + base name) of the three files written below.
output_stem = out_path_may_2016_report + saving_files

# Combined data, once as CSV and once as an Excel workbook.
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

# Per-column metadata collected while the sheets were read.
metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Incidents

In [14]:
# `df` is still the frame of the last sheet handled by the preceding loop cell;
# list its column names for reference.
df.columns.tolist()

['CR_Required',
 'Initial_Category_Code',
 'Initial_Category',
 'Current_Category_Code',
 'Current_Category',
 'Current_Status',
 'Complaint_Number',
 'Beat',
 'Incident_Address',
 'Location_Value',
 'Incident_Time_Start',
 'Incident_Time_End',
 'Complaint_Date',
 'IPRA_Closed_Date',
 'IPRA_Investigator_Last_Name',
 'IPRA_Investigator_First_Name',
 'IPRA_Assign_Date',
 'IPRA_Investigate_Begin_Date',
 'IPRA_Investigate_End_Date',
 'IPRA_Investigator_Type',
 'Report_Status',
 'Finding_Code',
 'Finding_Id',
 'Penalty_Code',
 'No_Days',
 'Penalty_Status',
 'Accused_First_Name',
 'Accused_Last_Name',
 'Accused_Star',
 'Accused_Assignment',
 'Accused_Detail',
 'Accused_Appointment_Date',
 'Accused_Position',
 'IAD_Ops',
 'Converted_Rec_I',
 'Involved_Party_Description',
 'Involved_Party_Type',
 'Involved_Party_Last_Name',
 'Involved_Party_First_Name',
 'Involved_Party_Assignment',
 'Involved_Party_Detail',
 'Involved_Party_Position',
 'Involved_Party_Age',
 'Involved_Party_Sex',
 'Involved_P

In [15]:
file = files[0]

## Column names applied, in sheet order, to every 'Incid' sheet
raw_columns = ['CR_Required','Initial_Category_Code','Initial_Category','Current_Category_Code',
               'Current_Category','Current_Status','Complaint_Number','Beat','Street','Address',
               'Location_Value','Incident_Time_Start','Incident_Time_End','Complaint_Date',
               'IPRA_Closed_Date','Report_Status','Finding_Code','Finding_Id','Penalty_Code',
               'No_Days','Penalty_Status','Accused_First_Name','Accused_Last_Name','Accused_Star',
               'Accused_Assignment','Accused_Detail','Accused_Appointment_Date','Accused_Position',
               'IAD_Ops','Converted_Rec_I','Police_Shooting_No','Police_Shooting']

## Output layout: 'Street' and 'Address' are replaced by a single 'Incident_Address'
## column that sits where 'Street' used to be
output_columns = ['Incident_Address' if column == 'Street' else column
                  for column in raw_columns if column != 'Address']

## Adding File Metadata
## Report_Produced_Date is not assigned in any cell visible before this one, so
## this normally falls back to ''.  Exception (rather than a bare except) keeps
## the fallback but lets KeyboardInterrupt through.
try:
    report_produced_date = Report_Produced_Date[0].date()
except Exception:
    report_produced_date = ''

sheet_frames = []
sheet_metadata = []
data_sheets = [sheet for sheet in sheets if 'Incid' in sheet]
## Open the workbook once; the sheet name is passed positionally so the call does
## not depend on the spelling of the keyword (sheetname vs sheet_name)
with pd.ExcelFile(in_path_may_2016_report + file) as workbook:
    for sheet in data_sheets:
        df = workbook.parse(sheet).reset_index(drop=True)
        df.columns = raw_columns

        ## Both parts are cast to text, so a missing part shows up as the text 'nan'
        ## NOTE(review): blank rows therefore get 'nan nan' as address - confirm this is wanted
        df["Incident_Address"] = df["Address"].astype(str) + ' ' + df["Street"].astype(str)

        ## .copy() so the assignments below write to an independent frame, not a slice
        df = df[output_columns].copy()

        df["Sheet"] = sheet
        df["FOIA_Request_Number"] = ''
        df["Report_Produced_Date"] = report_produced_date

        sheet_frames.append(df)
        sheet_metadata.append(metadata_dataset(df,file))

## One concat at the end instead of growing the frames inside the loop
final_df = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()
metadata_df = pd.concat(sheet_metadata, ignore_index=True) if sheet_metadata else pd.DataFrame()

In [16]:
# Display the per-column metadata accumulated for the 'Incid' sheets.
metadata_df

Unnamed: 0,Original_Dataset,Column_Name,Non_Null_Count,Unique_Count,Object_Type
0,IPRA Shootings Data.xls,CR_Required,165,3,object
1,IPRA Shootings Data.xls,Initial_Category_Code,53,5,object
2,IPRA Shootings Data.xls,Initial_Category,53,4,object
3,IPRA Shootings Data.xls,Current_Category_Code,165,6,object
4,IPRA Shootings Data.xls,Current_Category,165,5,object
5,IPRA Shootings Data.xls,Current_Status,165,6,object
6,IPRA Shootings Data.xls,Complaint_Number,165,48,float64
7,IPRA Shootings Data.xls,Beat,165,39,float64
8,IPRA Shootings Data.xls,Incident_Address,170,50,object
9,IPRA Shootings Data.xls,Location_Value,165,11,object


In [17]:
saving_files = "qry_II_217_Incid"

# Common prefix (output folder + base name) of the three files written below.
output_stem = out_path_may_2016_report + saving_files

# Combined data, once as CSV and once as an Excel workbook.
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

# Per-column metadata collected while the sheets were read.
metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Parties

In [18]:
# `df` is still the frame of the last sheet handled by the preceding loop cell;
# list its column names for reference.
df.columns.tolist()

['CR_Required',
 'Initial_Category_Code',
 'Initial_Category',
 'Current_Category_Code',
 'Current_Category',
 'Current_Status',
 'Complaint_Number',
 'Beat',
 'Incident_Address',
 'Location_Value',
 'Incident_Time_Start',
 'Incident_Time_End',
 'Complaint_Date',
 'IPRA_Closed_Date',
 'Report_Status',
 'Finding_Code',
 'Finding_Id',
 'Penalty_Code',
 'No_Days',
 'Penalty_Status',
 'Accused_First_Name',
 'Accused_Last_Name',
 'Accused_Star',
 'Accused_Assignment',
 'Accused_Detail',
 'Accused_Appointment_Date',
 'Accused_Position',
 'IAD_Ops',
 'Converted_Rec_I',
 'Police_Shooting_No',
 'Police_Shooting',
 'Sheet',
 'FOIA_Request_Number',
 'Report_Produced_Date']

In [19]:
file = files[0]

## Column names applied, in sheet order, to every 'Parties' sheet.  The output
## keeps exactly this layout, so no separate re-selection step is needed.
raw_columns = ['CR_Required','Initial_Category_Code','Initial_Category','Current_Category_Code',
               'Current_Category','Current_Status','Complaint_Number','Complaint_Date',
               'IPRA_Investigator_Last_Name','IPRA_Investigator_First_Name',
               'IPRA_Assign_Date','IPRA_Investigate_Begin_Date','IPRA_Investigate_End_Date',
               'IPRA_Investigator_Type','Accused_First_Name','Accused_Last_Name','Accused_Star',
               'Accused_Assignment','Accused_Detail','Accused_Appointment_Date','Accused_Position',
               'IAD_Ops','Converted_Rec_I','Involved_Party_Description','Involved_Party_Type',
               'Involved_Party_Last_Name','Involved_Party_First_Name','Involved_Party_Assignment',
               'Involved_Party_Detail','Involved_Party_Position','Involved_Party_Age',
               'Involved_Party_Sex','Involved_Party_Race','Police_Shooting_No','Police_Shooting']

## Adding File Metadata
## Report_Produced_Date is not assigned in any cell visible before this one, so
## this normally falls back to ''.  Exception (rather than a bare except) keeps
## the fallback but lets KeyboardInterrupt through.
try:
    report_produced_date = Report_Produced_Date[0].date()
except Exception:
    report_produced_date = ''

sheet_frames = []
sheet_metadata = []
data_sheets = [sheet for sheet in sheets if 'Parties' in sheet]
## Open the workbook once; the sheet name is passed positionally so the call does
## not depend on the spelling of the keyword (sheetname vs sheet_name)
with pd.ExcelFile(in_path_may_2016_report + file) as workbook:
    for sheet in data_sheets:
        df = workbook.parse(sheet).reset_index(drop=True)
        print(sheet)
        df.columns = raw_columns

        df["Sheet"] = sheet
        df["FOIA_Request_Number"] = ''
        df["Report_Produced_Date"] = report_produced_date

        sheet_frames.append(df)
        sheet_metadata.append(metadata_dataset(df,file))

## One concat at the end instead of growing the frames inside the loop
final_df = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()
metadata_df = pd.concat(sheet_metadata, ignore_index=True) if sheet_metadata else pd.DataFrame()

qry_II_217_2012_Parties
qry_II_217_2011_Parties
qry_II_217_2010_Parties
qry_II_217_2009_Parties
qry_II_217_200807_Parties


In [20]:
saving_files = "qry_II_217_Parties"

# Common prefix (output folder + base name) of the three files written below.
output_stem = out_path_may_2016_report + saving_files

# Combined data, once as CSV and once as an Excel workbook.
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

# Per-column metadata collected while the sheets were read.
metadata_df.to_csv(output_stem + "_metadata.csv", index=False)