In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Removes columns that are entirely null, so you don't have to check each column by hand

In [2]:
def null_dropper(df):
    """Return a copy of ``df`` without the columns that hold no data.

    A column is removed when every one of its values is null (NaN/None).
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus its all-null columns, remaining columns in their
        original order.
    """
    # Ask pandas directly instead of scraping the text printed by df.info():
    # that report changes layout between pandas versions, leaves out the
    # per-column lines for wide frames, and cannot be split reliably when a
    # column name itself contains runs of spaces.
    return df.dropna(axis=1, how="all")

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    """Build a per-column metadata table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to describe.
    file : str
        Name of the source file; repeated on every row as ``Original_Dataset``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns
        ``Original_Dataset``, ``Column_Name``, ``Non_Null_Count``,
        ``Unique_Count`` and ``Object_Type``.
    """
    # Everything is computed from the frame itself rather than by parsing the
    # text printed by df.info(), whose layout differs between pandas versions
    # and which omits per-column lines for wide frames.
    metadata_df = pd.DataFrame({
        "Column_Name": df.columns,
        # Kept as strings: the earlier text-parsing implementation produced
        # string counts and callers may rely on that.
        "Non_Null_Count": df.count().astype(str).values,
        # dropna=False counts NaN as a distinct value, like len(x.unique()).
        "Unique_Count": df.nunique(dropna=False).values,
        "Object_Type": df.dtypes.astype(str).values,
    })
    metadata_df["Original_Dataset"] = file
    return metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    """Return True if ``inputString`` contains at least one digit character."""
    return any(char.isdigit() for char in inputString)


def city_state_zip_splitter(df):
    """Split the ``City_State_Zip`` column of ``df`` into separate columns.

    Each value is split on whitespace. When the last token contains a digit
    it is treated as the zip code and the value is reduced to three parts
    (city, state, zip); otherwise it is reduced to two parts (city, state).
    Surplus leading tokens are joined with single spaces so that multi-word
    city names stay together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``City_State_Zip`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``City``, ``State`` and ``Zip``, one row per input row in the
        same order, on a fresh 0..n-1 index. Parts that are absent are None;
        missing or blank input values yield a row of three Nones.
    """
    rows = []
    for value in df["City_State_Zip"]:
        # Missing cells (NaN/None) are not strings; blank strings have no
        # tokens. Both become an all-empty row instead of raising.
        tokens = value.split() if isinstance(value, str) else []
        if not tokens:
            rows.append([None, None, None])
            continue
        # Only the trailing token decides whether a zip is present, so digits
        # inside a city name do not shift the state into the zip column.
        width = 3 if hasNumbers(tokens[-1]) else 2
        if len(tokens) > width:
            cut = len(tokens) - width + 1
            tokens = [" ".join(tokens[:cut])] + tokens[cut:]
        # Pad every row to three parts so the frame always has three columns,
        # even when no value carries a zip code.
        rows.append(tokens + [None] * (3 - len(tokens)))
    return pd.DataFrame(rows, columns=["City", "State", "Zip"])

### Establishes General Path

In [6]:
# Root of the import tree. Set the CPD_IMPORT_PATH environment variable to run
# this notebook on another machine; otherwise the original location is used.
path = os.environ.get(
    "CPD_IMPORT_PATH",
    "/Users/thudson/Documents/Github/chicago-police-data/import",
)

In [13]:
# Input and output folders for this data release; both keep a trailing slash
# because file names are appended to them directly.
in_path = '{}{}'.format(path, '/input/complaints-cpd-2016-dec_copy_20170112/')
out_path = '{}{}'.format(path, '/output/complaints-cpd-2016-dec_copy_20170112/')

In [14]:
# Load the location-code dictionary, discard rows that are blank in every
# column, and store the codes as strings (via int, so no decimal part remains).
location_code = (
    pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

def padding(value):
    """Prefix ``value`` with a single "0" when it is shorter than two characters.

    Values of length two or more are returned unchanged.
    """
    return value if len(value) >= 2 else "0" + value
    
# Left-pad single-character codes with "0".
location_code['Location_Code'] = location_code['Location_Code'].map(padding)

## Dec 2016 Data

### Report 1

In [31]:
in_path_dec_2016_report = in_path
out_path_dec_2016_report = out_path

# os.listdir returns names in arbitrary order and includes hidden files such
# as .DS_Store, yet later cells pick the reports by position (files[0],
# files[1]). Keep only the CSV inputs and sort them so positions are stable.
files = sorted(
    name for name in os.listdir(in_path_dec_2016_report)
    if name.lower().endswith(".csv")
)
files

['MAINFRAME_COMPLAINTS_REVISED_30NOV2016.CSV',
 'NEW WITNESS FILE NOV 29 2016 - no emp number.csv']

In [32]:
def _saving_name(filename):
    """Turn an input file name into an output stem: spaces to "_", CSV suffix removed."""
    stem = filename.replace(" ","_")
    for suffix in (".csv", ".CSV"):
        stem = stem.replace(suffix,"")
    return stem

saving_files = list(map(_saving_name, files))
saving_files

['MAINFRAME_COMPLAINTS_REVISED_30NOV2016',
 'NEW_WITNESS_FILE_NOV_29_2016_-_no_emp_number']

**Open question:** Is Investigation Unit equivalent to Investigator_Assignment?

In [33]:
# Report 1: the first listed file (the mainframe complaints export in the
# recorded run).
file = files[0]
df = pd.read_csv(in_path_dec_2016_report + file)

# Replace the source headers with the project's column names. The assignment
# is positional, so the list must match the file's column order and count.
df.columns = [
    "CRID", "Incident_Date", "Incident_Time", "Complaint_Date",
    "Case_Status", "Closed_Date", "Investigator_Full_Name",
    "Investigator_Assignment", "Beat",
    "Address_of_Incident", "Accused", "Star", "Accused_Gender",
    "Accused_Birth_Year", "Accused_Appointed_Date", "Current_Unit",
    "Final_Complaint_Category", "Current_Rank", "Complaint_Description",
    "Final_Finding", "Final_Finding_Description",
    "Final_Action_Taken", "Final_Action_Description",
]

# DataFrame.append was removed in pandas 2.0. With a single input file the
# accumulated frame is simply a re-indexed copy of df.
final_df = df.reset_index(drop=True)
metadata_df = metadata_dataset(df, file).reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [34]:
# Write report 1 as CSV and Excel, plus its column metadata as CSV.
report_1_stem = out_path_dec_2016_report + saving_files[0]
final_df.to_csv(report_1_stem + ".csv", index=False)
final_df.to_excel(report_1_stem + ".xlsx", index=False)

metadata_df.to_csv(report_1_stem + "_metadata.csv", index=False)

### Report 2

In [35]:
# Report 2: the second listed file (the witness file in the recorded run).
file = files[1]
df = pd.read_csv(in_path_dec_2016_report + file)
# Show the source headers before they are replaced below.
print(df.head())
# Positional rename: the list must match the file's column order and count.
df.columns = [
    "CRID", "Officer_Witness_or_Witness", "Officer_Last_Name", "Officer_First_Name",
    "Star", "Officer_Gender", "Officer_Birth_Year", "Officer_Race",
]

# DataFrame.append was removed in pandas 2.0. With a single input file the
# accumulated frame is simply a re-indexed copy of df.
final_df = df.reset_index(drop=True)
metadata_df = metadata_dataset(df, file).reset_index(drop=True)

  CASE NUMBER OFFICER OR NON OFFICER OFFICER LAST NAME OFFICER FIRST NAME  \
0     C052094    COMPLAINING_WITNESS               NaN                NaN   
1     C093392        OFFICER_WITNESS         BURZINSKI             WALTER   
2     C093392        OFFICER_WITNESS            SHAFER            CHARLES   
3     C100256    COMPLAINING_WITNESS               NaN                NaN   
4     C100256    COMPLAINING_WITNESS               NaN                NaN   

   OFFICER STAR NUMBER  SEX YEAR OF BIRTH RACE  
0                  NaN    M          1988    X  
1              13037.0  NaN           NaN  NaN  
2               7288.0  NaN           NaN  NaN  
3                  NaN    F          1946    S  
4                  NaN    M          1958    S  


In [36]:
# Write report 2 as CSV and Excel, plus its column metadata as CSV.
report_2_stem = out_path_dec_2016_report + saving_files[1]
final_df.to_csv(report_2_stem + ".csv", index=False)
final_df.to_excel(report_2_stem + ".xlsx", index=False)

metadata_df.to_csv(report_2_stem + "_metadata.csv", index=False)