In [1]:
import re
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Helper functions used by the report-processing cells below

### Removes columns that are entirely null, so they don't have to be checked for by hand

In [2]:
def null_dropper(df):
    """Return ``df`` without the columns whose values are all null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns that contain at least one
        non-null value. Rows and the order of the remaining columns are kept.
    """
    # Ask pandas directly rather than parsing the text printed by df.info():
    # the layout of that text differs between pandas versions, and splitting
    # it on runs of whitespace misreads column names that contain two or
    # more consecutive spaces.
    return df.dropna(axis=1, how="all")

### Builds per-column metadata (non-null count, unique count, dtype) for each file as it is processed

In [3]:
def metadata_dataset(df,file):
    """Summarise every column of ``df`` as one row of metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It is not modified.
    file : str
        Name of the file the frame came from; copied into every row.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns
        ``Original_Dataset``, ``Column_Name``, ``Non_Null_Count``,
        ``Unique_Count`` and ``Object_Type`` (the dtype name).
    """
    ## The figures are computed from the frame itself instead of being parsed
    ## out of the text printed by df.info(), whose layout depends on the
    ## pandas version.
    metadata_df = pd.DataFrame({
        "Column_Name": list(df.columns),
        ## number of non-null cells per column (stored as integers)
        "Non_Null_Count": df.notna().sum().tolist(),
        ## unique counts for each variable; a null counts as one distinct value.
        ## Columns are addressed by position so duplicated names are handled.
        "Unique_Count": [len(df.iloc[:, i].unique()) for i in range(df.shape[1])],
        "Object_Type": [str(dtype) for dtype in df.dtypes],
    })
    metadata_df["Original_Dataset"] = file
    return metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]

### Splits the single `City_State_Zip` column into separate `City`, `State` and `Zip` columns

In [4]:
def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    return any(char.isdigit() for char in inputString)


def city_state_zip_splitter(df):
    """Split ``df["City_State_Zip"]`` into ``City``, ``State`` and ``Zip``.

    Each value is split on single spaces. A value containing a digit is
    assumed to end with a zip code and is cut into three fields; any other
    value is cut into two (city and state). Leading words that do not fit
    are kept together as the (multi-word) city name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``City_State_Zip`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index and exactly the columns ``City``,
        ``State`` and ``Zip``, one row per input row. Fields that are absent
        are left as None.
    """
    columns = ["City", "State", "Zip"]
    rows = []
    for value in df["City_State_Zip"]:
        if pd.isna(value):
            ## a missing cell gives an all-empty row instead of a crash
            rows.append([None] * len(columns))
            continue
        value = str(value)
        ## check if it contains a number (zipcode)
        field_count = 3 if hasNumbers(value) else 2
        ## splitting from the right keeps multi-word city names in one piece
        parts = value.rsplit(" ", field_count - 1)
        ## pad short rows so every row has all three fields
        rows.append(parts + [None] * (len(columns) - len(parts)))
    ## Naming the columns here (rather than assigning them afterwards) keeps
    ## the result three columns wide even when no row has a zip code or the
    ## input is empty.
    return pd.DataFrame(rows, columns=columns)

### Establishes General Path

In [5]:
# Root of the project's import directory. The default is the original
# author's local checkout; set the CHICAGO_POLICE_DATA_IMPORT environment
# variable to run the notebook from a different location.
path = os.environ.get(
    "CHICAGO_POLICE_DATA_IMPORT",
    "/Users/thudson/Documents/Github/chicago-police-data/import",
)

In [6]:
# Input and output roots of the 2015 merged-complaints data, both under `path`.
in_path = ''.join([path, '/input/complaints-merged-2015_copy_20170112'])
out_path = ''.join([path, '/output/complaints-merged-2015_copy_20170112'])

In [7]:
# Location-code dictionary: blank rows are discarded and the code is stored
# as an integer-formatted string (e.g. "17").
location_code = (
    pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

## March 2015 Data

### Report 1

In [8]:
in_path_mar_2015_report1 = in_path+'/march_2015/Report_1_-_All_Complaints_in_Time_Frame/'
out_path_mar_2015_report1 = out_path+'/march_2015/Report_1_-_All_Complaints_in_Time_Frame/'

# os.listdir returns entries in arbitrary order and also lists non-report
# files (hidden files, Excel lock files). Keep only Excel workbooks, sorted,
# so the combined output is built in the same order on every run.
files = sorted(
    name for name in os.listdir(in_path_mar_2015_report1)
    if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith(('.', '~$'))
)
files

['foia 14-5509 - report 1a - all complaints in time frame.xls',
 'foia 14-5509 - report 1b - all complaints in time frame.xls',
 'foia 14-5509 - report 1c - all complaints in time frame.xls',
 'foia 14-5509 - report 1d - all complaints in time frame.xls',
 'foia 14-5509 - report 1e - all complaints in time frame.xls',
 'foia 14-5509 - report 1f - all complaints in time frame.xls',
 'foia 14-5509 - report 1g - all complaints in time frame.xls',
 'foia 14-5509 - report 1h - all complaints in time frame.xls']

In [9]:
# Combine every Report 1 workbook into one complaints table (final_df) and
# collect per-file column metadata (metadata_df).
report_frames = []
metadata_frames = []
for file in files:
    ## First pass: read only the top of the sheet to find the banner values
    ## and the row where the real table header ("Number:") sits.
    df = pd.read_excel(in_path_mar_2015_report1 + file,nrows=20)
    ## Making sure every file contains the date the file was created and the FOIA that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = col_list.pop()
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because of python indexing, +1 because of header in first df
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1+1
    ## Second pass: read the table itself
    df = pd.read_excel(in_path_mar_2015_report1 + file, skiprows=skip)
    df = df.dropna(how='all')
    ## remove end of record rows and page number row
    df = df.dropna(subset=["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 6"
                     ,"Unnamed: 9","Unnamed: 10","Unnamed: 11"],how="all",axis=0)

    ## Need to move Investigator Name to Col 12
    ## (assign the filled column back; an in-place fill on a selected column
    ## is not guaranteed to update the frame)
    df['Number:'] = df['Number:'].ffill().astype(int)

    ## Investigator rows have all others as null
    investigator_rows = (
        df["Unnamed: 1"].isnull()
        & df["Beat:"].isnull()
        & df["Location Code:"].isnull()
        & df["Unnamed: 9"].isnull()
        & df["Unnamed: 10"].isnull()
    )
    df1 = df.loc[investigator_rows, ["Number:","Unnamed: 11"]]
    df1.columns=["Number:","Investigator:"]

    ## Merge back rows
    df2 = df.merge(df1,how="left",on="Number:")

    ## Drop original all-null investigator rows
    df2 = df2.dropna(subset=["Beat:","Location Code:","Address of Incident:","Unnamed: 5","Unnamed: 6"
                     ,"Unnamed: 9","Unnamed: 10"],how="all",axis=0)

    ## Replace ---- with empty strings
    df2 = df2.replace('----', "").replace('-----', "")
    ## Drop all null columns
    df2 = null_dropper(df2)

    ## Convert address columns into a single column.
    ## Missing parts are blanked BEFORE the conversion to text, so no literal
    ## "nan" has to be stripped afterwards (which would also eat the letters
    ## "nan" inside a real street name).
    address_parts = df2[["Address of Incident:","Unnamed: 5","Unnamed: 6"]].fillna("").astype(str)
    df2["Address of Incident:"] = (
        address_parts["Address of Incident:"] + ' '
        + address_parts["Unnamed: 5"] + ' '
        + address_parts["Unnamed: 6"]
    ).str.strip()
    ## NOTE(review): a missing city/state/zip cell becomes the text "nan"
    ## here and is passed on to the splitter as such - confirm this is wanted.
    df2["Unnamed: 7"] = df2['Unnamed: 7'].astype(str)

    df2 = df2[["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 7",
               "Unnamed: 9","Unnamed: 10", "Unnamed: 11","Investigator:"]].copy()

    df2.columns = ["CRID","Beat","Location_Code","Address_of_Incident",
                      "City_State_Zip","Incident_Date","Complaint_Date",
                      "Closed_Date","Investigator"]

    ## Splitting City State Zip into three columns
    city_state_zip = city_state_zip_splitter(df2)
    df2.reset_index(drop=True, inplace=True)
    df2 = df2.merge(city_state_zip,how='left',right_index=True,left_index=True)
    ## Appending Location Type
    df2 = df2.merge(location_code,how='left',on='Location_Code')

    df2 = df2[["CRID","Beat","Location_Code","Location_Value","Address_of_Incident",
                      "City","State","Zip","Incident_Date","Complaint_Date",
                      "Closed_Date","Investigator"]].copy()

    ## Adding file metadata (to df2, the frame that is actually exported)
    df2["FOIA_Request_Number"]=FOIA_Request
    try:
        df2["Report_Produced_Date"]=Report_Produced_Date.date()
    except (AttributeError, ValueError):
        ## the last header label was not a usable date
        df2["Report_Produced_Date"]=''
    ## Collecting final file + metadata pieces
    report_frames.append(df2)
    metadata_frames.append(metadata_dataset(df2,file))

## One concat at the end instead of DataFrame.append inside the loop
final_df = pd.concat(report_frames, ignore_index=True) if report_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [10]:
# Save the combined Report 1 table (CSV and Excel) and then its metadata (CSV).
for writer, filename in (
    (final_df.to_csv, "foia_14-5509_-_report_1.csv"),
    (final_df.to_excel, "foia_14-5509_-_report_1.xlsx"),
    (metadata_df.to_csv, "foia_14-5509_-_report_1_metadata.csv"),
):
    writer(out_path_mar_2015_report1+filename, index=False)


### Report 2

In [11]:
in_path_mar_2015_report2 = in_path+'/march_2015/Report_2_-_Identified_Accused/'
out_path_mar_2015_report2 = out_path+'/march_2015/Report_2_-_Identified_Accused/'

# os.listdir returns entries in arbitrary order and also lists non-report
# files (hidden files, Excel lock files). Keep only Excel workbooks, sorted,
# so the combined output is built in the same order on every run.
files = sorted(
    name for name in os.listdir(in_path_mar_2015_report2)
    if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith(('.', '~$'))
)
files

['foia 14-5509 - report 2a - identified accused xi.xls',
 'foia 14-5509 - report 2b - identified accused xi.xls',
 'foia 14-5509 - report 2c - identified accused xi.xls',
 'foia 14-5509 - report 2d - identified accused xi.xls',
 'foia 14-5509 - report 2e - identified accused xi.xls',
 'foia 14-5509 - report 2f - identified accused xi.xls',
 'foia 14-5509 - report 2g - identified accused xi.xls',
 'foia 14-5509 - report 2h - identified accused xi.xls']

In [12]:
# Combine every Report 2 workbook into one accused-officers table (final_df)
# and collect per-file column metadata (metadata_df).
report_frames = []
metadata_frames = []
for file in files:
    ## First pass: read only the top of the sheet to find the banner values
    ## and the row where the real table header ("Number:") sits.
    df = pd.read_excel(in_path_mar_2015_report2 + file,nrows=20)
    ## Making sure every file contains the date the file was created and the FOIA that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = col_list.pop()
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

    # +1 because of python indexing, +1 because of header in first df
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1+1
    df = pd.read_excel(in_path_mar_2015_report2 + file, skiprows=skip)
    df = df.dropna(how='all')

    ## Remove leading and trailing whitespace from columns
    df.columns = [col.strip() for col in df.columns.tolist()]

    ## Need to fill in Number
    ## (assign the filled column back; an in-place fill on a selected column
    ## is not guaranteed to update the frame)
    df['Number:'] = df['Number:'].ffill().astype(int)

    ## Drops end of record
    df = df.dropna(subset=["Accused:","Gender:","Date of Appt:","Star:"],how="all",axis=0)

    ## drops the significant number of columns that are all nulls
    df = null_dropper(df)

    df.columns = ["CRID","Accused","Accused_Gender","Accused_Race_Code","Date_of_Appt","Current_Unit","Current_Rank",
                  "Star","Complaint_Category","Orig_Finding","Orig_Recommended_Discipline","Final_Finding",
                 "Final_Recommended_Discipline"]

    ## Excel reads NA as null for Orig and Final Finding: a missing finding is
    ## restored to "NA" when its discipline is present, and every other
    ## missing finding becomes a blank.
    for finding_col, discipline_col in (("Orig_Finding", "Orig_Recommended_Discipline"),
                                        ("Final_Finding", "Final_Recommended_Discipline")):
        finding_text = df[finding_col].astype(str)
        finding_missing = finding_text == 'nan'
        df[finding_col] = np.where(finding_missing & df[discipline_col].notnull(),
                                   "NA",
                                   np.where(finding_missing, "", finding_text))

    df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except (AttributeError, ValueError):
        ## the last header label was not a usable date
        df["Report_Produced_Date"]=''
    report_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

## One concat at the end instead of DataFrame.append inside the loop
final_df = pd.concat(report_frames, ignore_index=True) if report_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [13]:
# Save the combined Report 2 table (CSV and Excel) and then its metadata (CSV).
for writer, filename in (
    (final_df.to_csv, "foia_14-5509_-_report_2.csv"),
    (final_df.to_excel, "foia_14-5509_-_report_2.xlsx"),
    (metadata_df.to_csv, "foia_14-5509_-_report_2_metadata.csv"),
):
    writer(out_path_mar_2015_report2+filename, index=False)

### Report 3

In [14]:
in_path_mar_2015_report3 = in_path+'/march_2015/Report_3_-_Police_Witnesses/'
out_path_mar_2015_report3 = out_path+'/march_2015/Report_3_-_Police_Witnesses/'

# os.listdir returns entries in arbitrary order and also lists non-report
# files (hidden files, Excel lock files). Keep only Excel workbooks, sorted,
# so the combined output is built in the same order on every run.
files = sorted(
    name for name in os.listdir(in_path_mar_2015_report3)
    if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith(('.', '~$'))
)
files

['foia 14-5509 - report 3 - police officer witness data xi.xls']

In [15]:
# Combine every Report 3 workbook into one officer-witness table (final_df)
# and collect per-file column metadata (metadata_df).
report_frames = []
metadata_frames = []
for file in files:
    ## First pass: read only the top of the sheet to find the banner values
    ## and the row holding "Number:" in the first column.
    df = pd.read_excel(in_path_mar_2015_report3 + file,nrows=20)
    ## Making sure every file contains the date the file was created and the FOIA that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = col_list.pop()
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

    # +1 because of python indexing
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(in_path_mar_2015_report3 + file, skiprows=skip)
    df = df.dropna(how='all')

    ## Remove leading and trailing whitespace from columns
    df.columns = [col.strip() for col in df.columns.tolist()]

    ## Filling Number Column
    ## (assign the filled column back; an in-place fill on a selected column
    ## is not guaranteed to update the frame)
    df['Unnamed: 4'] = df['Unnamed: 4'].ffill().astype(int)

    ## Drops end of record
    df = df.dropna(subset=["Unnamed: 2","Gender","Star"],how="all",axis=0)

    ## drops the significant number of columns that are all nulls
    df = null_dropper(df)

    df.columns = ["Officer_Witness","CRID","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star"]
    ## put CRID first; copy so the columns added below go onto an independent frame
    df = df[["CRID","Officer_Witness","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star"]].copy()

    df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except (AttributeError, ValueError):
        ## the last header label was not a usable date
        df["Report_Produced_Date"]=''
    report_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

## One concat at the end instead of DataFrame.append inside the loop
final_df = pd.concat(report_frames, ignore_index=True) if report_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [16]:
# Save the combined Report 3 table (CSV and Excel) and then its metadata (CSV).
for writer, filename in (
    (final_df.to_csv, "foia_14-5509_-_report_3.csv"),
    (final_df.to_excel, "foia_14-5509_-_report_3.xlsx"),
    (metadata_df.to_csv, "foia_14-5509_-_report_3_metadata.csv"),
):
    writer(out_path_mar_2015_report3+filename, index=False)

### Report 4

In [17]:
in_path_mar_2015_report4 = in_path+'/march_2015/Report_4_-_Complaining_Witnesses/'
out_path_mar_2015_report4 = out_path+'/march_2015/Report_4_-_Complaining_Witnesses/'

# os.listdir returns entries in arbitrary order and also lists non-report
# files (hidden files, Excel lock files). Keep only Excel workbooks, sorted,
# so the combined output is built in the same order on every run.
files = sorted(
    name for name in os.listdir(in_path_mar_2015_report4)
    if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith(('.', '~$'))
)
files

['foia 14-5509 - report 4a - complaining witness data.xls',
 'foia 14-5509 - report 4b - complaining witness data.xls',
 'foia 14-5509 - report 4c - complaining witness data.xls',
 'foia 14-5509 - report 4d - complaining witness data.xls',
 'foia 14-5509 - report 4e - complaining witness data.xls',
 'foia 14-5509 - report 4f - complaining witness data.xls',
 'foia 14-5509 - report 4g - complaining witness data.xls',
 'foia 14-5509 - report 4h - complaining witness data.xls']

In [18]:
# Combine every Report 4 workbook into one complaining-witness table
# (final_df) and collect per-file column metadata (metadata_df).
report_frames = []
metadata_frames = []

for file in files:
    ## First pass: read only the top of the sheet to find the banner values
    ## and the row holding "Number:" in the first column.
    df = pd.read_excel(in_path_mar_2015_report4 + file,nrows=20)
    ## Making sure every file contains the date the file was created and the FOIA that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = col_list.pop()
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

    # +1 because of python indexing
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(in_path_mar_2015_report4 + file, skiprows=skip)
    df = df.dropna(how='all')

    ## Remove leading and trailing whitespace from columns
    df.columns = [col.strip() for col in df.columns.tolist()]

    ## Filling Number Column
    ## (assign the filled column back; an in-place fill on a selected column
    ## is not guaranteed to update the frame)
    df['Unnamed: 2'] = df['Unnamed: 2'].ffill().astype(int)

    ## Drops end of record
    df = df.dropna(subset=["Gender","Race"],how="all",axis=0)

    ## drops the significant number of columns that are all nulls
    df = null_dropper(df)

    df.columns = ["CRID","Witness_Gender","Witness_Race"]

    ## drop end of record rows; copy so the columns added below go onto an
    ## independent frame
    df = df[df["Witness_Race"]!="end of record"].copy()

    df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except (AttributeError, ValueError):
        ## the last header label was not a usable date
        df["Report_Produced_Date"]=''
    report_frames.append(df)
    metadata_frames.append(metadata_dataset(df,file))

## One concat at the end instead of DataFrame.append inside the loop
final_df = pd.concat(report_frames, ignore_index=True) if report_frames else pd.DataFrame()
metadata_df = pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()

In [19]:
# Save the combined Report 4 table (CSV and Excel) and then its metadata (CSV).
for writer, filename in (
    (final_df.to_csv, "foia_14-5509_-_report_4.csv"),
    (final_df.to_excel, "foia_14-5509_-_report_4.xlsx"),
    (metadata_df.to_csv, "foia_14-5509_-_report_4_metadata.csv"),
):
    writer(out_path_mar_2015_report4+filename, index=False)