In [1]:
import re
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Removes columns that are entirely null so you don't have to check each one by hand

In [2]:
def null_dropper(df):
    """Return a copy of ``df`` without the columns that hold no data at all.

    A column is removed only when every one of its values is null; columns
    with at least one non-null value are kept untouched.  The input frame is
    not modified.

    The null check is done on the data itself rather than by parsing the
    text printed by ``df.info()``: that text changes layout between pandas
    versions, is abbreviated for very wide frames, and cannot be split
    reliably when a column name contains consecutive spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the all-null columns dropped.
    """
    return df.dropna(axis=1, how="all")

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    """Build a one-row-per-column summary of ``df``.

    The summary is computed directly from the frame instead of being parsed
    out of the text printed by ``df.info()``, whose layout differs between
    pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    file : str
        Name of the source file; repeated on every row as
        ``Original_Dataset``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns
        ``Original_Dataset``, ``Column_Name``, ``Non_Null_Count``,
        ``Unique_Count`` and ``Object_Type`` (the dtype name as text).
        ``Non_Null_Count`` is an integer; it serialises to CSV exactly like
        the digit strings the text-parsing version produced.
    """
    # Work by position so duplicated column labels are still described one by one.
    columns = [df.iloc[:, position] for position in range(df.shape[1])]
    metadata_df = pd.DataFrame({
        "Original_Dataset": [file] * len(columns),
        "Column_Name": list(df.columns),
        "Non_Null_Count": [int(col.count()) for col in columns],
        # dropna=False: a missing value counts as one distinct value,
        # matching len(col.unique()).
        "Unique_Count": [int(col.nunique(dropna=False)) for col in columns],
        "Object_Type": [str(col.dtype) for col in columns],
    })
    metadata_df = metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]
    return metadata_df

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def city_state_zip_splitter(df):
    """Split the ``City_State_Zip`` column into City, State and Zip columns.

    Each value is parsed from the right: a trailing token that contains a
    digit is taken as the zip code, the token before it as the state, and
    whatever remains (joined with single spaces) as the city.  Parsing from
    the right keeps multi-word cities together and stops a digit inside the
    city name from being mistaken for a zip code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or any mapping) holding a ``City_State_Zip`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index and exactly the columns ``City``,
        ``State`` and ``Zip``.  Parts that are absent are left as missing
        values; a non-string input (for example NaN) yields a row of
        missing values.  The three columns are always present, even when no
        row carries a zip code or the input is empty.
    """
    parsed_rows = []
    for value in df["City_State_Zip"]:
        # split() without arguments also collapses repeated whitespace.
        tokens = value.split() if isinstance(value, str) else []
        zip_code = None
        if tokens and any(ch.isdigit() for ch in tokens[-1]):
            zip_code = tokens.pop()
        # Only treat the last remaining token as a state when a city precedes it.
        state = tokens.pop() if len(tokens) > 1 else None
        city = " ".join(tokens) if tokens else None
        parsed_rows.append([city, state, zip_code])
    return pd.DataFrame(parsed_rows, columns=["City","State","Zip"])

### Establishes General Path

In [5]:
# Root of the import tree; the input, output and doc paths used below are built from it.
# NOTE(review): this is an absolute path on one machine - it has to be edited before
# the notebook can run anywhere else.
path = "/Users/thudson/Documents/Dat/chicago-police-data/import"

In [6]:
# Input and output folders for this batch of files, both located under `path`.
in_path = path + '/input/complaints-merged-2015_copy_20170112'
out_path =path + '/output/complaints-merged-2015_copy_20170112'

In [7]:
# Location-code lookup table: drop fully empty rows, then store the code as text.
# The int -> str round trip presumably strips a float ".0" suffix - TODO confirm.
location_code = (
    pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

## August 2015 Data

### Report 1

In [8]:
in_path_aug_2015_report = in_path+'/august_2015/'
out_path_aug_2015_report = out_path+'/august_2015/'

# The report cells below pick their workbook by position (files[0], files[1], ...),
# but os.listdir() returns entries in arbitrary order.  Sort the names so the
# positions are stable across machines, and keep only Excel workbooks so a stray
# non-report file in the folder cannot shift the positions.
files = sorted(
    name for name in os.listdir(in_path_aug_2015_report)
    if name.lower().endswith(('.xls', '.xlsx'))
)
files

['FOIA 14-5509 - birth year of accused.xls',
 'foia 14-5509 - complaining witness data.xls',
 'foia 14-5509 - investigator data.xls']

In [9]:
file = files[0]
# Preview the top of the sheet to read the last header label and to locate
# the row where the real table header starts.
df = pd.read_excel(in_path_aug_2015_report + file,nrows=20)
col_list = df.columns.tolist()
# Last column label of the preview; treated below as the report's production date.
Report_Produced_Date = col_list.pop()
# First 12 characters of the file name, upper-cased (e.g. "FOIA 14-5509").
FOIA_Request = file[:12].upper()
# +1 because of python indexing, +1 because of header in first df
skip = np.where(df.iloc[:,4]=="Accused")[0][0]+1+1
df = pd.read_excel(in_path_aug_2015_report + file, skiprows=skip)
df = df.dropna(how='all')
## Drop all null columns
df = null_dropper(df)
df.columns = ["Accused_Full_Name","Year_of_Birth"]
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except AttributeError:
    # The header label was not parsed as a datetime (no .date() method):
    # leave the date blank.  Any other error is a real problem and should surface.
    df["Report_Produced_Date"]=''
df.head()

Unnamed: 0,Accused_Full_Name,Year_of_Birth,FOIA_Request_Number,Report_Produced_Date
0,"AARON, JEFFERY",1971.0,FOIA 14-5509,
1,"AARON, KARINA",1980.0,FOIA 14-5509,
2,"ABDELHADI, ABDALMAHD",1978.0,FOIA 14-5509,
3,"ABDELMAJEID, AZIZ",1984.0,FOIA 14-5509,
4,"ABRAHAM, NANCY",1986.0,FOIA 14-5509,


In [10]:
########################################
## DEPRECATED IN FAVOR OF ROSETTE API
########################################


## jr_splitter Created based on an exploration of First_Last_Name_List
## test = pd.DataFrame(First_Last_Name_list)
## test.shape[1]>2
## test[~(test[2].isnull())]


##First_Last_Name_list = df["Accused_Full_Name"].str.split(",").tolist()

def jr_splitter(row):
    """Normalise a comma-split name into [last, first, "JR"] when a JR suffix is found.

    ``row`` is the list of parts of one name.  With more than two parts the
    first and third parts are kept and "JR" is appended.  Otherwise, if the
    first part contains " JR", it is cut at that point and "JR" is appended
    after the second part.  Any other row is returned unchanged.
    """
    if len(row) > 2:
        return [row[0], row[2], "JR"]
    last_name = row[0]
    if " JR" not in last_name:
        return row
    suffix_start = last_name.find(" JR")
    return [last_name[:suffix_start], row[1], "JR"]

##df1 = pd.DataFrame([jr_splitter(row) for row in First_Last_Name_list])
##df1.columns = ["Accused_Last_Name","Accused_First_Name","Accused_Suffix"]


In [11]:
# Stamp the provenance columns onto the frame.
# NOTE(review): this stores the raw header label in Report_Produced_Date, replacing
# the value written by the loading cell; the Report 2 and Report 3 sections have no
# such step - confirm this overwrite is intended.
df["FOIA_Request_Number"] = FOIA_Request
df["Report_Produced_Date"] = Report_Produced_Date
# final_df is a second name for the same object as df, not a copy.
df.reset_index(drop=True, inplace=True)
final_df = df
metadata_df = metadata_dataset(final_df, file).reset_index(drop=True)

In [12]:
# Write the cleaned table as CSV and Excel, and its column summary as CSV.
final_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_birth_year_of_accused.csv"),
    index=False,
)
final_df.to_excel(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_birth_year_of_accused.xlsx"),
    index=False,
)

metadata_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_birth_year_of_accused_metadata.csv"),
    index=False,
)

### Report 2

In [13]:
file = files[1]
# Preview the top of the sheet to read the last header label and to locate
# the row where the real table header starts.
df = pd.read_excel(in_path_aug_2015_report + file,nrows=20)
col_list = df.columns.tolist()
# Last column label of the preview; treated below as the report's production date.
Report_Produced_Date = col_list.pop()
# First 12 characters of the file name, upper-cased.
FOIA_Request = file[:12].upper()
# +1 because of python indexing, +1 because of header in first df
skip = np.where(df.iloc[:,2]=="Number")[0][0]+1+1
df = pd.read_excel(in_path_aug_2015_report + file, skiprows=skip)
df = df.dropna(how='all')
## Drop all null columns
df = null_dropper(df)
# Carry each Number down onto the rows below it that have none, then store it
# as an integer.  The result is assigned back to the column: an in-place fill
# on the selected column is not guaranteed to write through to df, and
# fillna(method=...) is deprecated.
df['Number'] = df['Number'].ffill().astype(int)
# Keep only rows that have at least one of the three witness fields.
df = df.dropna(subset=["Gender","Age","Race Desc"],how="all",axis=0)
df = null_dropper(df)
df.columns = ["CRID","Witness_Gender","Witness_Age","Witness_Race"]
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except AttributeError:
    # The header label was not parsed as a datetime (no .date() method):
    # leave the date blank.  Any other error is a real problem and should surface.
    df["Report_Produced_Date"]=''
final_df = df
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_dataset(final_df,file)
metadata_df.reset_index(drop=True,inplace=True)

In [14]:
# Write the cleaned table as CSV and Excel, and its column summary as CSV.
final_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_complaining_witness_data.csv"),
    index=False,
)
final_df.to_excel(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_complaining_witness_data.xlsx"),
    index=False,
)

metadata_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_complaining_witness_data_metadata.csv"),
    index=False,
)

### Report 3

In [15]:
file = files[2]
# Preview the top of the sheet to read the last header label and to locate
# the row where the real table header starts.
df = pd.read_excel(in_path_aug_2015_report + file,nrows=20)
## Additional Data
col_list = df.columns.tolist()
# Last column label of the preview; treated below as the report's production date.
Report_Produced_Date = col_list.pop()
# First 12 characters of the file name, upper-cased.
FOIA_Request = file[:12].upper()
# +1 because of python indexing, +1 because of header in first df
skip = np.where(df.iloc[:,0]=="Invst Last Name")[0][0]+1+1
df = pd.read_excel(in_path_aug_2015_report + file, skiprows=skip)
df = df.dropna(how='all')
## Drop all null columns
df = null_dropper(df)
df.columns = ["Investigator_Last_Name","Investigator_First_Name","Current_Report","Current_Rank"]
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except AttributeError:
    # The header label was not parsed as a datetime (no .date() method):
    # leave the date blank.  Any other error is a real problem and should surface.
    df["Report_Produced_Date"]=''
final_df = df
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_dataset(final_df,file)
metadata_df.reset_index(drop=True,inplace=True)

In [16]:
# Write the cleaned table as CSV and Excel, and its column summary as CSV.
final_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_investigator_data.csv"),
    index=False,
)
final_df.to_excel(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_investigator_data.xlsx"),
    index=False,
)

metadata_df.to_csv(
    os.path.join(out_path_aug_2015_report, "foia_14-5509_-_investigator_data_metadata.csv"),
    index=False,
)