In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Can be used to remove columns that are all nulls so you don't have to check 

In [2]:
def null_dropper(df):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    info_values = [x[0] for x in info_values if x[1].startswith('0 non-null')]
    df = df.drop(info_values,axis=1)
    return df

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    metadata_df = pd.DataFrame(info_values)
    metadata_df["File"] = file
    metadata_df.columns = ["Column_Name","Column_Info","Original_Dataset"]
    ## Column Info Split
    metadata_df['Non_Null_Count'], metadata_df['Object_Type'] = metadata_df['Column_Info'].str.split(' ', 1).str
    metadata_df["Object_Type"] = metadata_df["Object_Type"].str.replace("non-null ","")
    ## unique counts for each variable
    uniques_df = df.apply(lambda x: len(x.unique())).reset_index()
    uniques_df.columns = ["Column_Name","Unique_Count"]
    metadata_df["Unique_Count"] = uniques_df["Unique_Count"]
    metadata_df = metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]
    return metadata_df

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    new_states_list=[]
    for value in df["City_State_Zip"]:
        ## check if it contains a number (zipcode)
        if hasNumbers(value):
            split_state = value.split(" ")
            ## join city names until we have 3 values
            while len(split_state)>3:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        else:
            split_state = value.split(" ")
            ## join city names until we have 2 values
            while len(split_state)>2:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        new_states_list.append(split_state)
    city_state_zip =  pd.DataFrame(new_states_list)
    city_state_zip.columns = ["City","State","Zip"]
    return city_state_zip

### Establishes General Path

In [5]:
path = "/Users/thudson/Documents/Github/chicago-police-data/import"

In [6]:
in_path = path + '/input/complaints-ipra-2016-apr_copy_20170112/'
out_path =path + '/output/complaints-ipra-2016-apr_copy_20170112/'

In [7]:
location_code = pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
location_code.dropna(how='all', inplace=True)
location_code['Location_Code']=location_code['Location_Code'].astype(int).astype(str)

In [8]:
location_code.head()

Unnamed: 0,Location_Code,Location_Value
0,1,Food Sales/Restaurant
1,2,Tavern/Liquor Store
2,3,Other Business Establishment
3,4,Police Building
4,5,Lockup Facility


## Apr 2016 Data

### Report 1

In [9]:
in_path_apr_2016_report = in_path
out_path_apr_2016_report = out_path

files = [x for x in os.listdir(in_path_apr_2016_report) if '.xls' in x]
files

['All complaints during 2012.xls',
 'All complaints during 2013.xls',
 'All complaints during 2014.xls',
 'All complaints during 2015.xls',
 'All complaints during 2016YTD.xls',
 "IPRA Investigators' Info.xls",
 'Shootings Victim Data.xls']

In [10]:
comp_files = [file for file in files if 'complaints' in file]

final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
for file in comp_files:
    df = pd.read_excel(in_path_apr_2016_report + file)
    ## Making Sure Every File contains date the file was created and the foia that created it
    #col_list = df.columns.tolist()
    #Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    #col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
    #FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    
    df.dropna(how='all', inplace=True)
    #print(df.head())

    df.columns = ["CRID","Beat","Street","Address","Location_Value","Incident_Time_Start","Incident_Time_End",
                  "Notification_Date","Closed_Date","Investigator_Last_Name","Investigator_First_Name",
                  "Accused_First_Name","Accused_Last_Name","Accused_Star","Accused_Unit","Accused_Detail",
                  "Accused_Appointment_Date","Accused_Rank","Initial_Category_Code","Initial_Category_Description",
                  "Current_Category_Code","Current_Category_Description","Current_Status","Finding_Code","Disciple_Code",
                  "Discipline_Description","Involved_Officer_Last_Name","Involved_Officer_First_Name",
                  "Involved_Officer_Unit","Involved_Officer_Detail","Involved_Officer_Rank",
                  "Involved_Officer_Age","Involved_Officer_Sex","Involved_Officer_Race"]


    ## Covert Address Columns into single column
    df["Address"] = df['Address'].astype(str)
    df["Street"] = df["Street"].astype(str)
    df["Address"] = df["Address"].str.replace("nan","")
    df["Address"] = df["Address"].str.replace(".0","")
    df["Street"] = df["Street"].str.replace("nan","")

    df["Address_of_Incident"] = df[["Address","Street"]].apply(lambda x: ' '.join(x), axis=1)

    df = df[["CRID","Beat","Address","Street","Address_of_Incident","Location_Value","Incident_Time_Start",
             "Incident_Time_End","Notification_Date","Closed_Date","Investigator_Last_Name","Investigator_First_Name",
             "Accused_First_Name","Accused_Last_Name","Accused_Star","Accused_Unit","Accused_Detail",
             "Accused_Appointment_Date","Accused_Rank","Initial_Category_Code","Initial_Category_Description",
             "Current_Category_Code","Current_Category_Description","Current_Status","Finding_Code","Disciple_Code",
             "Discipline_Description","Involved_Officer_Last_Name","Involved_Officer_First_Name",
             "Involved_Officer_Unit","Involved_Officer_Detail","Involved_Officer_Rank",
             "Involved_Officer_Age","Involved_Officer_Sex","Involved_Officer_Race"]]

    
    ## Adding File Metadata
    ##df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except:
        df["Report_Produced_Date"]=''    

    final_df = final_df.append(df)
    final_df.reset_index(drop=True,inplace=True)
    metadata_df = metadata_df.append(metadata_dataset(df,file))
    metadata_df.reset_index(drop=True,inplace=True)

In [11]:
final_df.head()

Unnamed: 0,CRID,Beat,Address,Street,Address_of_Incident,Location_Value,Incident_Time_Start,Incident_Time_End,Notification_Date,Closed_Date,...,Discipline_Description,Involved_Officer_Last_Name,Involved_Officer_First_Name,Involved_Officer_Unit,Involved_Officer_Detail,Involved_Officer_Rank,Involved_Officer_Age,Involved_Officer_Sex,Involved_Officer_Race,Report_Produced_Date
0,1053129,432,99,AVENUE J,99 AVENUE J,SIDEWALK,2012-04-07 00:33:00,2012-04-07 00:33:00,2012-04-07 01:32:20,2012-04-30 08:48:00,...,,EVARUSTO,CABRERA,,,UNKNOWN,,MALE,WHITE HISPANIC,
1,1053129,432,99,AVENUE J,99 AVENUE J,SIDEWALK,2012-04-07 00:33:00,2012-04-07 00:33:00,2012-04-07 01:32:20,2012-04-30 08:48:00,...,,KAPA,CHRISTOPH,4.0,,SERGEANT OF POLICE,,MALE,WHITE,
2,1053129,432,99,AVENUE J,99 AVENUE J,SIDEWALK,2012-04-07 00:33:00,2012-04-07 00:33:00,2012-04-07 01:32:20,2012-04-30 08:48:00,...,,O BRIEN,DENNIS,312.0,,SERGEANT OF POLICE,,MALE,WHITE,
3,1053130,433,149,AVENUE M,149 AVENUE M,RESIDENTIAL YARD (FRONT/BACK),2012-04-07 00:01:00,2012-04-07 00:01:00,2012-04-07 01:39:09,2012-05-29 15:49:00,...,,ALICEA,CHRISTIAN,,,UNKNOWN,16.0,MALE,WHITE HISPANIC,
4,1053130,433,149,AVENUE M,149 AVENUE M,RESIDENTIAL YARD (FRONT/BACK),2012-04-07 00:01:00,2012-04-07 00:01:00,2012-04-07 01:39:09,2012-05-29 15:49:00,...,,KAPA,CHRISTOPH,4.0,,SERGEANT OF POLICE,,MALE,WHITE,


In [12]:
saving_files = "All_complaints_during_2012-2016YTD"

final_df.to_csv(out_path_apr_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_apr_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_apr_2016_report+saving_files+"_metadata.csv",index=False)

### Report 2

In [13]:
## FILE SEEMS TO CONTAIN 2 SEPARATE FILES
file = files[5]
df1 = pd.read_excel(in_path_apr_2016_report + file)
df2 = pd.read_excel(in_path_apr_2016_report + file, skiprows=1)
## Making Sure Every File contains date the file was created and the foia that created it
#col_list = df.columns.tolist()
#Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
#col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
#FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
df1.dropna(how='all', inplace=True)
df1 = df1[["Badge #","Last Name","First Name","Title"]]
df1.dropna(how='all', inplace=True)
df2 = df2[["Name","Position","Hire Date","Hire Date","Employment","Date"]]
df2.dropna(subset=['Name','Position'], inplace=True)
## Drop all null columns
df1 = null_dropper(df1)
df2 = null_dropper(df2)

df1.columns = ["CRID","Investigator_Last_Name","Investigator_First_name","Investigator_Rank"]
df2.columns = ["Investigator_Name","Investigator_Rank","Investigator_City_Hire_Date",
              "Investigator_IPRA_Hire_Date","Investigator_Employment_Status","Investigator_Left_Date"]

##df["FOIA_Request_Number"]=FOIA_Request
#try:
#    df["Report_Produced_Date"]=Report_Produced_Date.date()
#except:
#    df["Report_Produced_Date"]=''
final_df1 = df1
final_df1.reset_index(drop=True,inplace=True)
final_df2 = df2
final_df2.reset_index(drop=True,inplace=True)

metadata_df1 = metadata_dataset(final_df1,file)
metadata_df1.reset_index(drop=True,inplace=True)

metadata_df2 = metadata_dataset(final_df2,file)
metadata_df2.reset_index(drop=True,inplace=True)

In [14]:
saving_file = "IPRA_Investigators_Officers"
final_df1.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)
final_df1.to_excel(out_path_apr_2016_report+saving_file+".xlsx",index=False)

metadata_df1.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)

saving_file = "IPRA_Investigators_Staff"
final_df2.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)
final_df2.to_excel(out_path_apr_2016_report+saving_file+".xlsx",index=False)

metadata_df2.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)

### Report 3

In [15]:
df.columns.tolist()

['CRID',
 'Beat',
 'Address',
 'Street',
 'Address_of_Incident',
 'Location_Value',
 'Incident_Time_Start',
 'Incident_Time_End',
 'Notification_Date',
 'Closed_Date',
 'Investigator_Last_Name',
 'Investigator_First_Name',
 'Accused_First_Name',
 'Accused_Last_Name',
 'Accused_Star',
 'Accused_Unit',
 'Accused_Detail',
 'Accused_Appointment_Date',
 'Accused_Rank',
 'Initial_Category_Code',
 'Initial_Category_Description',
 'Current_Category_Code',
 'Current_Category_Description',
 'Current_Status',
 'Finding_Code',
 'Disciple_Code',
 'Discipline_Description',
 'Involved_Officer_Last_Name',
 'Involved_Officer_First_Name',
 'Involved_Officer_Unit',
 'Involved_Officer_Detail',
 'Involved_Officer_Rank',
 'Involved_Officer_Age',
 'Involved_Officer_Sex',
 'Involved_Officer_Race',
 'Report_Produced_Date']

In [16]:
file = files[6]
df = pd.read_excel(in_path_apr_2016_report + file,nrows=20)
print(df.head())
## Making Sure Every File contains date the file was created and the foia that created it
#col_list = df.columns.tolist()
#Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
#col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
#FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

df.dropna(how='all', inplace=True)
## Drop all null columns
df = null_dropper(df)
df.columns = ["CRID","Unit","Police_Shooting","Incident_Date","Address","Direction","Street","City",
              "Party","Initial_Category_Code","Initial_Category_Description","Initital_Category_Type_Code",
             "Initital_Category_Type_Description","Current_Category_Code","Current_Category_Type_Code",
              "Zip","Current_Category_Description","Current_Category_Type_Description","Victim_Age","Victim_Sex",
             "Victim_Race","Complaint_Date","CR_Required"]

## Covert Address Columns into single column
df["Address"] = df['Address'].astype(str)
df["Direction"] = df['Direction'].astype(str)
df["Street"] = df["Street"].astype(str)
df["Address"] = df["Address"].str.replace("nan","")
df["Address"] = df["Address"].str.replace(".0","")
df["Direction"] = df['Direction'].str.replace("nan","")
df["Street"] = df["Street"].str.replace("nan","")

df["Address_of_Incident"] = df[["Address","Direction","Street"]].apply(lambda x: ' '.join(x), axis=1)

df = df[["CRID","Unit","Police_Shooting","Incident_Date","Address","Direction","Street","Address_of_Incident",
         "City","Zip","Party","Initial_Category_Code","Initial_Category_Description","Initital_Category_Type_Code",
         "Initital_Category_Type_Description","Current_Category_Code","Current_Category_Description",
         "Current_Category_Type_Code","Current_Category_Type_Description","Victim_Age","Victim_Sex",
         "Victim_Race","Complaint_Date","CR_Required"]]

#df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except:
    df["Report_Produced_Date"]=''
final_df = df
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_dataset(final_df,file)
metadata_df.reset_index(drop=True,inplace=True)

      LOG_NO  LastOfOCCURANCE_UNIT_NO POLICE_SHOOTING LastOfFROM_DATETIME  \
0  1000341.0                     15.0             Yes 2006-10-09 17:40:00   
1  1000343.0                      4.0             Yes 2006-10-09 23:09:00   
2  1000343.0                      4.0             Yes 2006-10-09 23:09:00   
3  1000647.0                      9.0             Yes 2006-10-21 01:50:00   
4  1000647.0                      9.0             Yes 2006-10-21 01:50:00   

   LastOfSTREET_NO LastOfSTREET_DIRECTION LastOfSTREET_NME LastOfCITY  \
0            932.0                  North      LATROBE AVE    CHICAGO   
1           8132.0                  South       COLFAX AVE    CHICAGO   
2           8132.0                  South       COLFAX AVE    CHICAGO   
3           4505.0                  South      ASHLAND AVE    CHICAGO   
4           4505.0                  South      ASHLAND AVE    CHICAGO   

  PARTY_TYPE INITIAL_CATEGORY_CODE     ...     CURRENT_CATEGORY_TYPE_CODE  \
0     Victim         

In [17]:
final_df.head()

Unnamed: 0,CRID,Unit,Police_Shooting,Incident_Date,Address,Direction,Street,Address_of_Incident,City,Zip,...,Current_Category_Code,Current_Category_Description,Current_Category_Type_Code,Current_Category_Type_Description,Victim_Age,Victim_Sex,Victim_Race,Complaint_Date,CR_Required,Report_Produced_Date
0,1000341.0,15.0,Yes,2006-10-09 17:40:00,932,North,LATROBE AVE,932 North LATROBE AVE,CHICAGO,60651.0,...,18A,FIREARM DISCHARGE WITH HITS - HANDGUN,S249,OPS SUBCODE 18A,37.0,MALE,BLACK,2006-10-09 22:20:41,No,
1,1000343.0,4.0,Yes,2006-10-09 23:09:00,8132,South,COLFAX AVE,8132 South COLFAX AVE,CHICAGO,60617.0,...,18A,FIREARM DISCHARGE WITH HITS - HANDGUN,S249,OPS SUBCODE 18A,21.0,MALE,BLACK,2006-10-10 00:10:52,No,
2,1000343.0,4.0,Yes,2006-10-09 23:09:00,8132,South,COLFAX AVE,8132 South COLFAX AVE,CHICAGO,60617.0,...,18A,FIREARM DISCHARGE WITH HITS - HANDGUN,S249,OPS SUBCODE 18A,22.0,MALE,BLACK,2006-10-10 00:10:52,No,
3,1000647.0,9.0,Yes,2006-10-21 01:50:00,45,South,ASHLAND AVE,45 South ASHLAND AVE,CHICAGO,60609.0,...,018,FIREARM DISCHARGE WITH HITS / ON DUTY,18A,GROUP 18 - POLICE INVOLVED SHOOTING,24.0,MALE,BLACK,2006-10-21 02:55:31,No,
4,1000647.0,9.0,Yes,2006-10-21 01:50:00,45,South,ASHLAND AVE,45 South ASHLAND AVE,CHICAGO,60609.0,...,018,FIREARM DISCHARGE WITH HITS / ON DUTY,18A,GROUP 18 - POLICE INVOLVED SHOOTING,27.0,MALE,BLACK,2006-10-21 02:55:31,No,


In [18]:
saving_file = "Shootings_Victim_Data"
final_df.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)
final_df.to_excel(out_path_apr_2016_report+saving_file+".xlsx",index=False)

metadata_df.to_csv(out_path_apr_2016_report+saving_file+".csv",index=False)
