In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Drops columns that contain only null values, so they do not have to be identified by hand

In [2]:
def null_dropper(df):
    """Return a copy of ``df`` without the columns that contain only nulls.

    The previous implementation located those columns by parsing the text
    printed by ``df.info()``.  That text layout differs between pandas
    versions and cannot represent column labels that are not strings or
    that contain runs of whitespace, so the null check is now done on the
    data itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding every column of ``df`` that has at least one
        non-null value, in the original order.
    """
    return df.dropna(axis=1, how="all")

### Builds per-column metadata (non-null count, unique count, dtype) for each file as it is processed

In [3]:
def metadata_dataset(df, file):
    """Summarise every column of ``df`` as one row of metadata.

    The summary is computed directly from the frame instead of parsing the
    text printed by ``df.info()``, whose layout changes between pandas
    versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.  It is not modified.
    file : str
        Name of the source file, repeated on every output row.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns
        ``Original_Dataset``, ``Column_Name``, ``Non_Null_Count``,
        ``Unique_Count`` and ``Object_Type``.  ``Non_Null_Count`` is an
        integer (it used to be the string parsed from the ``info()`` text);
        ``Unique_Count`` counts NaN as a distinct value, as before.
    """
    output_columns = ["Original_Dataset", "Column_Name", "Non_Null_Count",
                      "Unique_Count", "Object_Type"]
    # Work by position so duplicated column labels are still described
    # one column at a time.
    columns = [df.iloc[:, position] for position in range(df.shape[1])]
    metadata_df = pd.DataFrame(
        {
            "Original_Dataset": [file] * len(columns),
            "Column_Name": list(df.columns),
            "Non_Null_Count": [int(column.count()) for column in columns],
            # unique() keeps NaN, so a column with nulls counts one extra value
            "Unique_Count": [len(column.unique()) for column in columns],
            "Object_Type": [str(column.dtype) for column in columns],
        },
        columns=output_columns,
    )
    return metadata_df

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit character."""
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    """Split the ``City_State_Zip`` column into City, State and Zip columns.

    A value containing a digit is treated as "<city> <state> <zip>"; any
    other value as "<city> <state>".  The city may contain spaces, so the
    value is split from the right.

    Compared with the earlier version this always returns the three named
    columns (``Zip`` is null when absent), so it no longer fails when no
    row carries a zip code or when the input is empty, and non-string
    values such as NaN yield an all-null row instead of a ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``City_State_Zip`` column.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``City``, ``State``, ``Zip`` and a fresh
        0..n-1 index, one row per input row in the same order.
    """
    output_columns = ["City", "State", "Zip"]
    rows = []
    for value in df["City_State_Zip"]:
        if not isinstance(value, str):
            # Missing values cannot be split; keep a placeholder row so the
            # output stays aligned with the input rows.
            rows.append([None] * len(output_columns))
            continue
        ## a number means the value ends with a zipcode -> three fields
        field_count = 3 if hasNumbers(value) else 2
        # Splitting from the right leaves every leading word in the city name.
        parts = value.rsplit(" ", field_count - 1)
        rows.append(parts + [None] * (len(output_columns) - len(parts)))
    return pd.DataFrame(rows, columns=output_columns)

### Establishes General Path

In [5]:
# Root of the project's "import" tree; every input/output/doc location used
# below is built by appending to it.  Set the CPD_IMPORT_PATH environment
# variable (without a trailing slash) to run the notebook from another
# checkout; the original author's location remains the default.
path = os.environ.get(
    "CPD_IMPORT_PATH",
    "/Users/thudson/Documents/Github/chicago-police-data/import",
)

In [6]:
# Folders for this FOIA release: the raw spreadsheets are read from
# in_path and the cleaned tables are written to out_path.
in_path = "".join([path, '/input/shootings-cpd-feb2016_copy_20170112/'])
out_path = "".join([path, '/output/shootings-cpd-feb2016_copy_20170112/'])

In [7]:
# Load the location-code dictionary, discard rows that are entirely empty,
# and store the code as text without a decimal part (int first, then str).
location_code = (
    pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

In [8]:
# Show the first rows of the lookup table as a quick check of the load above.
location_code.head()

Unnamed: 0,Location_Code,Location_Value
0,1,Food Sales/Restaurant
1,2,Tavern/Liquor Store
2,3,Other Business Establishment
3,4,Police Building
4,5,Lockup Facility


## Feb 2016 Data

### Report 1

In [9]:
in_path_feb_2016_report = in_path
out_path_feb_2016_report = out_path

# os.listdir returns entries in arbitrary order, so sort them: the report
# cells below concatenate files in this order and the output should not
# depend on the file system.  Names starting with "~$" are the lock files
# Excel creates while a workbook is open; they are not readable workbooks.
files = sorted(
    x for x in os.listdir(in_path_feb_2016_report)
    if '.xls' in x and not x.startswith('~$')
)
files

['18A - 1 - Incident(1).xls',
 '18A - 3 -  Involved Member(1).xls',
 '18A - 4 -  cpd witness(1).xls',
 '18A - 5 -  cpd Reporting Party(1).xls',
 '18a 18b 20a victim-detainee demographics(1).xls',
 '18b - 1 - Incidents(1).xls',
 '18b - 3 - Involved Member(1).xls',
 '18b - 4 - cpd witness(1)Updated Info.xls',
 '18b - 5 - cpd reporting party(1).xls',
 '20A - 1 - Incident(1).xls',
 '20A - 3 -  Involved Member(1).xls',
 '20A - 4 - cpd Witness(1).xls',
 '20A - 5 - cpd Reporting Party(1).xls',
 'Copy of 18a - 2 - Incident Address(1) Block Level (2)(1)updated.xls',
 'Copy of 18b - 2 - incident address(1) block level(1)updated.xls',
 'Copy of 20A - 2 - Incident address(1)block level X(2)updated report.xls',
 'crms - 05j complaint and investigator(1).xls',
 'crms - 05j cpd witness, reporting party, victim(1).xls',
 'crms - 05j Officer(1).xls',
 'stars for 18a - involved members(1).xls',
 'stars for 18b - involved members(1).xls',
 'stars for 20a - involved members(1).xls']

In [10]:
incident_files = [file for file in files if '- 1 -' in file]

# One cleaned frame (and one metadata frame) per workbook; they are combined
# with a single pd.concat after the loop.  DataFrame.append, used previously,
# was removed in pandas 2.0 and re-copied the accumulated data on every call.
cleaned_frames = []
metadata_frames = []
for file in incident_files:
    # First pass: read only the top of the sheet.  The report banner ends up
    # in the column labels and the real table header ("Log No") in the rows.
    df = pd.read_excel(in_path_feb_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    # Only text labels can carry the FOIA reference; a missing reference
    # still fails loudly with IndexError.
    FOIA_Request = [x for x in col_list if isinstance(x, str) and 'FOIA' in x][0]
    # The preview's own header consumed the first sheet row, so the "Log No"
    # row sits at sheet row (position + 1); skipping that many rows makes it
    # the header of the second read.
    skip = np.where(df.iloc[:,0]=="Log No")[0][0]+1
    df = pd.read_excel(in_path_feb_2016_report + file, skiprows=skip)
    ## remove end of record rows and page number row
    ## (this also removes rows that are completely empty)
    df = df.dropna(subset=["Log No","Assignment","Initial Category","Assigned Team"],how="all",axis=0)

    df.columns = ["CRID","Assignment","Initial_Category","Assigned_Team","Team_Assigned_Date",
                  "Investigator_Assigned","Investigator_Assigned_Date","Supervisor_Assigned","IPRA_Closed_Date"]

    # NOTE(review): values that do not match this format silently become NaT
    # because of errors='coerce' -- confirm the format against the raw data.
    df["Team_Assigned_Date"] = pd.to_datetime(df["Team_Assigned_Date"],format="%Y%m%d %H:%M",errors='coerce')

    df["CRID"] = df["CRID"].astype(int)

    ## Adding File Metadata
    df["FOIA_Request_Number"]=FOIA_Request
    # Files without a datetime in the banner get an empty production date.
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    cleaned_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# workbook matched the pattern.
final_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [11]:
saving_files = "1_-_Incident(1)"

# Combined Report 1 table, written as CSV and as an Excel workbook.
final_df.to_csv("".join([out_path_feb_2016_report, saving_files, ".csv"]), index=False)
final_df.to_excel("".join([out_path_feb_2016_report, saving_files, ".xlsx"]), index=False)

# Column metadata goes to its own file next to the data.
metadata_df.to_csv("".join([out_path_feb_2016_report, saving_files, "_metadata.csv"]), index=False)

### Report 2

In [12]:
incident_files = [file for file in files if '- 2 -' in file]


def row_switcher(row):
    """Move a three-part address (no apartment) one slot to the right.

    ``row`` holds the comma-separated pieces of an address in positions
    0..3.  When position 3 is missing, the pieces in 1 and 2 are shifted to
    2 and 3 and position 1 is cleared.  Missing is tested with ``pd.isna``
    so that both ``None`` and ``NaN`` padding are recognised.
    """
    if pd.isna(row[3]):
        row[3]=row[2]
        row[2]=row[1]
        row[1]=None
    return row


# One cleaned frame (and one metadata frame) per workbook; they are combined
# with a single pd.concat after the loop.  DataFrame.append, used previously,
# was removed in pandas 2.0 and re-copied the accumulated data on every call.
cleaned_frames = []
metadata_frames = []
for file in incident_files:
    # First pass: read only the top of the sheet.  The report banner ends up
    # in the column labels and the real table header ("Log No") in the rows.
    df = pd.read_excel(in_path_feb_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    # Only text labels can carry the FOIA reference; a missing reference
    # still fails loudly with IndexError.
    FOIA_Request = [x for x in col_list if isinstance(x, str) and 'FOIA' in x][0]
    # +1 because the preview's own header consumed the first sheet row, so
    # skipping (index + 1) rows makes the "Log No" row the new header.
    skip = df[df.iloc[:,0].str.contains("Log No")==True].index.values[0]+1
    df = pd.read_excel(in_path_feb_2016_report + file, skiprows=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    ## Drop all null columns
    df = null_dropper(df)
    if df.shape[1] == 5:
        # Five-column layout: a separate leading address column that is
        # joined onto the address text with a space.
        df.columns = ["CRID","Incident_Date","Incident_Number","Incident_Address","District"]
        df["Incident_Address"] = df[["Incident_Number","Incident_Address"]].apply(lambda x: ' '.join(x), axis=1)
        df = df[["CRID","Incident_Date","Incident_Address","District"]]
    else:
        # Any layout other than four columns raises ValueError here.
        df.columns = ["CRID","Incident_Date","Incident_Address","District"]

    ## Split Up Date into Start and End Date
    # reindex guarantees both pieces exist even when no row holds a range
    df1 = df["Incident_Date"].str.split(" - ",n=1,expand=True).reindex(columns=[0, 1])
    df["Incident_Start_Date"] = pd.to_datetime(df1[0],format="%d-%b-%Y %H:%M",errors='coerce')
    df["Incident_End_Date"] = pd.to_datetime(df1[1],format="%d-%b-%Y %H:%M",errors='coerce')

    ## Split up address into all the relevant fields
    df2 = df["Incident_Address"].str.split(",",expand=True)
    if df2.shape[1]==4:
        # Some rows carry an apartment piece; shift the ones that do not.
        df2 = df2.apply(row_switcher,axis=1)
    else:
        # No apartment piece (three pieces expected): insert an empty
        # apartment column in position 1.
        df2[3]=None
        df2 = df2[[0,3,1,2]]
        df2.columns = [0,1,2,3]
    # Position 3 now holds "<state> <zip>"; split it into its two parts.
    df3 = df2[3].str.strip().str.split(" ",expand=True)
    df2 = df2.merge(df3,left_index=True,right_index=True)
    df2.columns = ["Incident_Address","Incident_Apt","Incident_City","State_Zip","Incident_State","Incident_Zip"]
    df2 = df2[["Incident_Address","Incident_Apt","Incident_City","Incident_State","Incident_Zip"]]

    # Replace the raw one-line address with the split address fields.
    df = df.drop("Incident_Address",axis=1).merge(df2,left_index=True,right_index=True)
    df = df[["CRID","Incident_Date","Incident_Address","Incident_Apt",
             "Incident_City","Incident_State","Incident_Zip","District",
             "Incident_Start_Date","Incident_End_Date"]]

    df["CRID"] = df["CRID"].astype(int)
    df["FOIA_Request_Number"]=FOIA_Request
    # Files without a datetime in the banner get an empty production date.
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    cleaned_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# workbook matched the pattern.
final_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [13]:
saving_file = "Incident_Address(1)_Block_Level"
final_df.to_csv(out_path_feb_2016_report+saving_file+".csv",index=False)
final_df.to_excel(out_path_feb_2016_report+saving_file+".xlsx",index=False)

# The metadata needs its own file name: writing it to "<name>.csv" replaced
# the data CSV written two lines above.
metadata_df.to_csv(out_path_feb_2016_report+saving_file+"_metadata.csv",index=False)


### Report 3

In [14]:
incident_files = [file for file in files if '- 3 -' in file]

# One cleaned frame (and one metadata frame) per workbook; they are combined
# with a single pd.concat after the loop.  DataFrame.append, used previously,
# was removed in pandas 2.0 and re-copied the accumulated data on every call.
cleaned_frames = []
metadata_frames = []
for file in incident_files:
    # First pass: read only the top of the sheet.  The report banner ends up
    # in the column labels and the real table header ("Log No") in the rows.
    df = pd.read_excel(in_path_feb_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    # Only text labels can carry the FOIA reference; a missing reference
    # still fails loudly with IndexError.
    FOIA_Request = [x for x in col_list if isinstance(x, str) and 'FOIA' in x][0]
    # +1 because the preview's own header consumed the first sheet row, so
    # skipping (index + 1) rows makes the "Log No" row the new header.
    skip = df[df.iloc[:,0].str.contains("Log No")==True].index.values[0]+1
    df = pd.read_excel(in_path_feb_2016_report + file, skiprows=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    ## Drop all null columns
    df = null_dropper(df)
    # Exactly two columns must remain; anything else raises ValueError.
    df.columns = ["CRID","Involved_Officer"]

    df["CRID"] = df["CRID"].astype(int)
    df["FOIA_Request_Number"]=FOIA_Request
    # Files without a datetime in the banner get an empty production date.
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    cleaned_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# workbook matched the pattern.
final_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [15]:
saving_file = "Involved_Member(1)"
final_df.to_csv(out_path_feb_2016_report+saving_file+".csv",index=False)
final_df.to_excel(out_path_feb_2016_report+saving_file+".xlsx",index=False)

# The metadata needs its own file name: writing it to "<name>.csv" replaced
# the data CSV written two lines above.
metadata_df.to_csv(out_path_feb_2016_report+saving_file+"_metadata.csv",index=False)

### Report 4

In [16]:
incident_files = [file for file in files if '- 4 -' in file]

# One cleaned frame (and one metadata frame) per workbook; they are combined
# with a single pd.concat after the loop.  DataFrame.append, used previously,
# was removed in pandas 2.0 and re-copied the accumulated data on every call.
cleaned_frames = []
metadata_frames = []
for file in incident_files:
    # First pass: read only the top of the sheet.  The report banner ends up
    # in the column labels and the real table header ("Log No") in the rows.
    df = pd.read_excel(in_path_feb_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    # Only text labels can carry the FOIA reference; a missing reference
    # still fails loudly with IndexError.
    FOIA_Request = [x for x in col_list if isinstance(x, str) and 'FOIA' in x][0]
    # +1 because the preview's own header consumed the first sheet row, so
    # skipping (index + 1) rows makes the "Log No" row the new header.
    skip = df[df.iloc[:,0].str.contains("Log No")==True].index.values[0]+1
    df = pd.read_excel(in_path_feb_2016_report + file, skiprows=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    ## Drop all null columns
    df = null_dropper(df)
    # Exactly two columns must remain; anything else raises ValueError.
    df.columns = ["CRID","Officer_Witness"]

    df["CRID"] = df["CRID"].astype(int)
    df["FOIA_Request_Number"]=FOIA_Request
    # Files without a datetime in the banner get an empty production date.
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    cleaned_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# workbook matched the pattern.
final_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [17]:
# Display the metadata collected for the Report 4 files by the loop above.
metadata_df

Unnamed: 0,Original_Dataset,Column_Name,Non_Null_Count,Unique_Count,Object_Type
0,18A - 4 - cpd witness(1).xls,CRID,131,45,int64
1,18A - 4 - cpd witness(1).xls,Officer_Witness,131,128,object
2,18A - 4 - cpd witness(1).xls,FOIA_Request_Number,131,1,object
3,18A - 4 - cpd witness(1).xls,Report_Produced_Date,131,1,object
4,18b - 4 - cpd witness(1)Updated Info.xls,CRID,6,2,int64
5,18b - 4 - cpd witness(1)Updated Info.xls,Officer_Witness,5,6,object
6,18b - 4 - cpd witness(1)Updated Info.xls,FOIA_Request_Number,6,1,object
7,18b - 4 - cpd witness(1)Updated Info.xls,Report_Produced_Date,6,1,object
8,20A - 4 - cpd Witness(1).xls,CRID,47,32,int64
9,20A - 4 - cpd Witness(1).xls,Officer_Witness,47,46,object


In [18]:
saving_file = "CPD_Witness(1)"
final_df.to_csv(out_path_feb_2016_report+saving_file+".csv",index=False)
final_df.to_excel(out_path_feb_2016_report+saving_file+".xlsx",index=False)

# The metadata needs its own file name: writing it to "<name>.csv" replaced
# the data CSV written two lines above.
metadata_df.to_csv(out_path_feb_2016_report+saving_file+"_metadata.csv",index=False)

### Report 5

In [19]:
incident_files = [file for file in files if '- 5 -' in file]

# One cleaned frame (and one metadata frame) per workbook; they are combined
# with a single pd.concat after the loop.  DataFrame.append, used previously,
# was removed in pandas 2.0 and re-copied the accumulated data on every call.
cleaned_frames = []
metadata_frames = []
for file in incident_files:
    # First pass: read only the top of the sheet.  The report banner ends up
    # in the column labels and the real table header ("Log No") in the rows.
    df = pd.read_excel(in_path_feb_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    # Only text labels can carry the FOIA reference; a missing reference
    # still fails loudly with IndexError.
    FOIA_Request = [x for x in col_list if isinstance(x, str) and 'FOIA' in x][0]
    # +1 because the preview's own header consumed the first sheet row, so
    # skipping (index + 1) rows makes the "Log No" row the new header.
    skip = df[df.iloc[:,0].str.contains("Log No")==True].index.values[0]+1
    df = pd.read_excel(in_path_feb_2016_report + file, skiprows=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    ## Drop all null columns
    df = null_dropper(df)
    # Exactly two columns must remain; anything else raises ValueError.
    df.columns = ["CRID","Officer_Reporting_Party"]

    df["CRID"] = df["CRID"].astype(int)
    df["FOIA_Request_Number"]=FOIA_Request
    # Files without a datetime in the banner get an empty production date.
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    cleaned_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# workbook matched the pattern.
final_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [20]:
saving_file = "CPD_Reporting_Party(1)"
final_df.to_csv(out_path_feb_2016_report+saving_file+".csv",index=False)
final_df.to_excel(out_path_feb_2016_report+saving_file+".xlsx",index=False)

# The metadata needs its own file name: writing it to "<name>.csv" replaced
# the data CSV written two lines above.
metadata_df.to_csv(out_path_feb_2016_report+saving_file+"_metadata.csv",index=False)

### Other Files in the Works