In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

# Functions that help with the magic

### Removes columns that are entirely null so you don't have to check each column by hand

In [2]:
def null_dropper(df):
    """Return a copy of ``df`` without the columns that contain only nulls.

    The check is done on the data itself rather than by parsing the text
    printed by ``df.info()``, whose layout differs between pandas versions
    and which cannot be split reliably when a column name contains
    consecutive spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding every column of ``df`` that has at least one
        non-null value, in the original order.
    """
    return df.dropna(axis=1, how="all")

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    """Build a one-row-per-column metadata table for ``df``.

    The values are computed from the frame itself instead of being parsed
    out of the text printed by ``df.info()``, so the result does not depend
    on the pandas version's ``info`` layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to describe.
    file : str
        Name of the file the dataset came from; repeated on every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        ``Original_Dataset`` (the ``file`` argument),
        ``Column_Name`` (column label),
        ``Non_Null_Count`` (number of non-null values, as an int),
        ``Unique_Count`` (number of distinct values, a null counting as a value),
        ``Object_Type`` (dtype name, e.g. ``"int64"``).
    """
    columns = ["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]
    records = []
    # Walk the columns by position so duplicated column labels are still
    # described one row each.
    for position, column_name in enumerate(df.columns):
        column = df.iloc[:, position]
        records.append([
            file,
            column_name,
            int(column.count()),
            # len(unique()) keeps a null as one distinct value.
            len(column.unique()),
            str(column.dtype),
        ])
    return pd.DataFrame(records, columns=columns)

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    """Split the ``City_State_Zip`` column of ``df`` into separate columns.

    Each value is split on single spaces from the right: a value that
    contains a digit is treated as ``<city> <state> <zip>``, any other value
    as ``<city> <state>``. Everything left of the last fields is kept
    together as the city, so multi-word city names survive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``City_State_Zip`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``City``, ``State`` and ``Zip`` and a fresh
        0..n-1 index. Fields that are absent from a value are ``None``.
        Non-string values (e.g. NaN) produce a row of ``None``.
    """
    columns = ["City","State","Zip"]
    rows = []
    for value in df["City_State_Zip"]:
        if not isinstance(value, str):
            # Missing values cannot be scanned for digits or split.
            rows.append([None] * len(columns))
            continue
        ## a digit signals that a zipcode is present
        field_count = 3 if hasNumbers(value) else 2
        # rsplit keeps the leading words joined as the city name.
        parts = value.rsplit(" ", field_count - 1)
        # Pad so every row has three fields, even when no row has a zip.
        rows.append(parts + [None] * (len(columns) - len(parts)))
    return pd.DataFrame(rows, columns=columns)

### Establishes General Path

In [5]:
# Root folder of the import workspace. Defaults to the original checkout
# location; set the CPD_IMPORT_PATH environment variable to run the notebook
# from a different machine without editing this cell.
path = os.environ.get(
    "CPD_IMPORT_PATH",
    "/Users/thudson/Documents/Github/chicago-police-data/import",
)

In [6]:
# Input and output folders for this dataset. The trailing slash is kept
# because file names are appended to these strings directly.
in_path = "{}/input/complaints-cpd-2016-oct_copy_20170112/".format(path)
out_path = "{}/output/complaints-cpd-2016-oct_copy_20170112/".format(path)

In [7]:
# Load the location-code dictionary, discard rows in which every cell is
# empty, and store the codes as strings of their integer value.
location_code = (
    pd.read_csv(path + '/doc/Location_Code_Dictionary.csv')
    .dropna(how='all')
    .assign(Location_Code=lambda codes: codes['Location_Code'].astype(int).astype(str))
)

def padding(value):
    """Prefix ``value`` with a single "0" when it is shorter than two characters.

    Values that already have two or more characters are returned unchanged.
    """
    needs_zero = len(value) < 2
    return "0" + value if needs_zero else value
    
# Left-pad single-character codes so every code has at least two characters.
location_code['Location_Code'] = location_code['Location_Code'].map(padding)

## October 2016 Data

### Report 1

In [10]:
# Folders used by this report (aliases of the dataset-level paths).
in_path_oct_2016_report = in_path
out_path_oct_2016_report = out_path

# Workbooks to import. Match the actual extension rather than a substring so
# names such as "x.xlsx.bak" are skipped, and sort the names because
# os.listdir returns them in arbitrary order while a later cell reads files[0].
files = sorted(
    file for file in os.listdir(in_path_oct_2016_report) if file.endswith('.xlsx')
)
files

['CR_AllRecords.xlsx']

In [11]:
# Output file stems: spaces become underscores and the ".xlsx" text is removed.
saving_files = list(
    map(lambda name: name.replace(" ","_").replace(".xlsx",""), files)
)
saving_files

['CR_AllRecords']

Is Investigation Unit equivalent to Investigator_Assignment?

In [17]:
# Read the first workbook found for this report.
file = files[0]
df = pd.read_excel(in_path_oct_2016_report + file)
# Show the raw headers before they are renamed below.
print(df.head())
# Columns are renamed by position, so the workbook must have exactly these
# twelve columns in this order (assignment raises ValueError otherwise).
df.columns = ["Officer_Name","Officer_Unit","Date_of_Appointment","CRID",
                  "Initial_Complaint_Category","Final_Complaint_Category","Incident_Date", 
                  "Complaint_Date","Closed_Date",
               "Final_Finding", "Final_Action_Taken","Final_Action_Description"]

# Only one workbook is loaded, so the results are taken directly instead of
# being appended to empty frames (DataFrame.append no longer exists in
# pandas 2.x). reset_index returns new frames with a clean 0..n-1 index.
final_df = df.reset_index(drop=True)
metadata_df = metadata_dataset(df,file).reset_index(drop=True)

                NAME UNIT DATE_OF_APPOINTMENT   CR_NO  \
0  ABBATE, ANTHONY G  011         05-DEC-1994  237004   
1  ABBATE, ANTHONY G  011         05-DEC-1994  258124   
2   ABBATE, CARMEL G  641         06-JAN-1969  190369   
3   ABBATE, CARMEL G  640         06-JAN-1969  210426   
4   ABBATE, CARMEL G  640         06-JAN-1969  219424   

                     INITIAL_COMPLAINT_CATEGORY  \
0                05A   ARRESTEE - DURING ARREST   
1  04E   PRISONERS PROPERTY - INVENTORY/RECEIPT   
2                           03G   MISCELLANEOUS   
3                    10V   INVENTORY PROCEDURES   
4                           09J   MISCELLANEOUS   

                       FINAL_COMPLAINT_CATEGORY INCIDENT_DATE COMPLAINT_DATE  \
0                05B   ARRESTEE - DURING ARREST   21-MAY-1997    21-MAY-1997   
1  04E   PRISONERS PROPERTY - INVENTORY/RECEIPT   19-NOV-1999    20-NOV-1999   
2                           03G   MISCELLANEOUS   11-FEB-1992    11-FEB-1992   
3                    04E   INV

In [18]:
# Preview the first rows of the per-column metadata built for this report.
metadata_df.head()

Unnamed: 0,Original_Dataset,Column_Name,Non_Null_Count,Unique_Count,Object_Type
0,CR_AllRecords.xlsx,Officer_Name,134683,18907,object
1,CR_AllRecords.xlsx,Officer_Unit,134528,170,object
2,CR_AllRecords.xlsx,Date_of_Appointment,134486,1455,object
3,CR_AllRecords.xlsx,CRID,134683,70449,int64
4,CR_AllRecords.xlsx,Initial_Complaint_Category,134683,128,object


In [19]:
# Write the renamed report as CSV and Excel, and its metadata as CSV, using
# the first saved file stem for all three names.
final_df.to_csv("{}{}.csv".format(out_path_oct_2016_report, saving_files[0]), index=False)
final_df.to_excel("{}{}.xlsx".format(out_path_oct_2016_report, saving_files[0]), index=False)

metadata_df.to_csv("{}{}_metadata.csv".format(out_path_oct_2016_report, saving_files[0]), index=False)