In [1]:
import re
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io
import xlrd
import datetime

# Functions that help with the magic

### Can be used to remove columns that are all nulls so you don't have to check 

In [2]:
def null_dropper(df):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    info_values = [x[0] for x in info_values if x[1].startswith('0 non-null')]
    df = df.drop(info_values,axis=1)
    return df

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    metadata_df = pd.DataFrame(info_values)
    metadata_df["File"] = file
    metadata_df.columns = ["Column_Name","Column_Info","Original_Dataset"]
    ## Column Info Split
    metadata_df['Non_Null_Count'], metadata_df['Object_Type'] = metadata_df['Column_Info'].str.split(' ', 1).str
    metadata_df["Object_Type"] = metadata_df["Object_Type"].str.replace("non-null ","")
    ## unique counts for each variable
    uniques_df = df.apply(lambda x: len(x.unique())).reset_index()
    uniques_df.columns = ["Column_Name","Unique_Count"]
    metadata_df["Unique_Count"] = uniques_df["Unique_Count"]
    metadata_df = metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]
    return metadata_df

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    new_states_list=[]
    for value in df["City_State_Zip"]:
        ## check if it contains a number (zipcode)
        if hasNumbers(value):
            split_state = value.split(" ")
            ## join city names until we have 3 values
            while len(split_state)>3:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        else:
            split_state = value.split(" ")
            ## join city names until we have 2 values
            while len(split_state)>2:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        new_states_list.append(split_state)
    city_state_zip =  pd.DataFrame(new_states_list)
    city_state_zip.columns = ["City","State","Zip"]
    return city_state_zip

### Establishes General Path

In [5]:
path = "/Users/thudson/Documents/Github/chicago-police-data/import"

In [6]:
in_path = path + '/input/complaints-merged-2015_copy_20170112'
out_path =path + '/output/complaints-merged-2015_copy_20170112'

In [44]:
location_code = pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
location_code.dropna(how='all', inplace=True)
location_code['Location_Code']=location_code['Location_Code'].astype(int).astype(str)

def padding(value):
    if len(value)<2:
        return "0"+value
    else:
        return value
    
location_code['Location_Code'] = location_code['Location_Code'].apply(padding)

## June 2016 Data

### Report 1

In [31]:
in_path_june_2016_report = in_path+'/june_2016/'
out_path_june_2016_report = out_path+'/june_2016/'

files = [file for file in os.listdir(in_path_june_2016_report) if 'unlocked' in file and '~$' not in file]
files

['P046957 - report 1 - all complaints in time frame-unlocked.xlsx',
 'P046957 - report 2 - identified accused xi-unlocked.xlsx',
 'P046957 - report 3 - police officer witness data xi-unlocked.xlsx',
 'P046957 - report 4 - victim data-unlocked.xlsx',
 'P046957 - report 5 - complainant (reporting party) data-unlocked.xlsx']

In [32]:
saving_files = [file.replace(" ","_").replace(".xlsx","") for file in files]
saving_files

['P046957_-_report_1_-_all_complaints_in_time_frame-unlocked',
 'P046957_-_report_2_-_identified_accused_xi-unlocked',
 'P046957_-_report_3_-_police_officer_witness_data_xi-unlocked',
 'P046957_-_report_4_-_victim_data-unlocked',
 'P046957_-_report_5_-_complainant_(reporting_party)_data-unlocked']

In [33]:
df = pd.read_excel(in_path_june_2016_report + files[0],nrows=20)

In [45]:
file = files[0]
df = pd.read_excel(in_path_june_2016_report + file,nrows=20,engine=None)
## Making Sure Every File contains date the file was created and the foia that created it
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# +1 because of python indexing, +1 because of header in first df
skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
df = pd.read_excel(in_path_june_2016_report + file, skiprows=skip)
df.dropna(how='all', inplace=True)
## remove end of record rows and page number row
df = df.dropna(subset=["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 6"
                 ,"Complaint Date","Closed Date","Unnamed: 11"],how="all",axis=0)

print(df.head())
## Need to move Investigator Name to Col 12
df['Number:'].fillna(method='ffill', inplace=True)
df['Number:'] = df['Number:'].astype(int)
## Investigator Rows have all others as null
df1 = df[ 
   df["Incident Date & Time"].isnull() & 
   df["Complaint Date"].isnull() &
   df["Closed Date"].isnull()]

df1 = null_dropper(df1)
df1 = df1[df1["Location Code:"]!='end of record']
df1 = df1[["Number:","Location Code:","Address of Incident:","Unnamed: 4","Unnamed: 5","Unnamed: 6"]]
df1.columns = ["Number:","Investigator_Full_Name","Investigator_Assignment","Investigator_Rank",
               "Investigator_Star","Investigator_Appt_Date"]

df2 = df.merge(df1,how="left",on="Number:")
## check dtypes to figure out which matter
df2 = df2.dropna(subset=["Incident Date & Time","Complaint Date","Closed Date"],how="all",axis=0)
## Replace ---- with empty strings
df2 = df2.replace('----', "").replace('-----', "")
## Drop all null columns
df2 = null_dropper(df2)

## Covert Address Columns into single column
df2["Address of Incident:"] = df2['Address of Incident:'].astype(str)
df2["Unnamed: 4"] = df2['Unnamed: 4'].astype(str)
df2["Unnamed: 5"] = df2['Unnamed: 5'].astype(str)
df2["Unnamed: 6"] = df2['Unnamed: 6'].astype(str)

df2["Unnamed: 4"] = df2["Unnamed: 4"].replace("nan","")
df2["Unnamed: 5"] = df2['Unnamed: 5'].replace("nan","")

df2["Address of Incident:"] = df2[["Address of Incident:","Unnamed: 4", "Unnamed: 5"]].apply(lambda x: ' '.join(x), axis=1)

df2 = df2[["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 6",
           "Incident Date & Time", "Complaint Date", "Closed Date","Investigator_Full_Name",
          "Investigator_Assignment","Investigator_Rank","Investigator_Star","Investigator_Appt_Date"]]

df2.columns = ["CRID","Beat","Location_Code","Address_of_Incident",
                  "City_State_Zip","Incident_Date","Complaint_Date", 
                  "Closed_Date","Investigator_Full_Name",
               "Investigator_Assignment","Investigator_Rank","Investigator_Star","Investigator_Appt_Date"]

## Splitting City State Zip into three columns
city_state_zip = city_state_zip_splitter(df2)
df2.reset_index(drop=True, inplace=True)
df2 = df2.merge(city_state_zip,how='left',right_index=True,left_index=True)
## Appending Location Type
df2 = df2.merge(location_code,how='left',on='Location_Code')
df2 = df2 [["CRID","Beat","Location_Code","Location_Value","Address_of_Incident",
                  "City","State","Zip","Incident_Date","Complaint_Date", 
                  "Closed_Date","Investigator_Full_Name",
               "Investigator_Assignment","Investigator_Rank","Investigator_Star","Investigator_Appt_Date"]]

## Adding File Metadata
df2["FOIA_Request_Number"]=FOIA_Request
try:
    df2["Report_Produced_Date"]=Report_Produced_Date.date()
except:
    df2["Report_Produced_Date"]=''    

final_df = df2
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_dataset(final_df,file)
metadata_df.reset_index(drop=True,inplace=True)

    Number:                                           Beat:  Location Code:  \
0  106213.0                                            1631              17   
1       NaN  Investigator with Current Assignment and Rank:  KLIMAS, ROBERT   
2       NaN                                             NaN   end of record   
3  107899.0                                            0611              17   
5       NaN                                             NaN   end of record   

  Address of Incident:       Unnamed: 4 Unnamed: 5           Unnamed: 6  \
0                 3700     N HARLEM AVE        NaN     CHICAGO IL 60634   
1                  121        COMMANDER          0  2008-08-04 00:00:00   
2                  NaN              NaN        NaN                  NaN   
3                 7843  S HERMITAGE AVE        NaN      CHICAGO IL 6062   
5                  NaN              NaN        NaN                  NaN   

  Incident Date & Time Complaint Date Closed Date  Unnamed: 10  Unnamed: 1

In [46]:
final_df.head()

Unnamed: 0,CRID,Beat,Location_Code,Location_Value,Address_of_Incident,City,State,Zip,Incident_Date,Complaint_Date,Closed_Date,Investigator_Full_Name,Investigator_Assignment,Investigator_Rank,Investigator_Star,Investigator_Appt_Date,FOIA_Request_Number,Report_Produced_Date
0,106213,1631,17,Public Way - Other,3700 N HARLEM AVE,CHICAGO,IL,60634.0,2015-07-19 21:00:00,2015-07-20,2015-09-21,"KLIMAS, ROBERT",121.0,COMMANDER,0.0,2008-08-04 00:00:00,FOIA P046957\nReport 1\nAll Complaints in CRMS...,
1,107899,611,17,Public Way - Other,7843 S HERMITAGE AVE,CHICAGO,IL,6062.0,2015-08-16 20:20:00,2016-01-25,NaT,,,,,,FOIA P046957\nReport 1\nAll Complaints in CRMS...,
2,107901,1814,17,Public Way - Other,1622 N NORTH PARK AVE,CHICAGO,IL,60614.0,2016-01-25 21:50:00,2016-01-26,NaT,,,,,,FOIA P046957\nReport 1\nAll Complaints in CRMS...,
3,108026,1722,4,Police Building,4650 N PULASKI RD,CHICAGO,IL,,2013-10-14 00:01:00,2016-04-25,NaT,,,,,,FOIA P046957\nReport 1\nAll Complaints in CRMS...,
4,108109,832,17,Public Way - Other,2340 W 69TH ST,CHICAGO,IL,,2016-02-26 11:00:00,2016-04-14,NaT,,,,,,FOIA P046957\nReport 1\nAll Complaints in CRMS...,


In [47]:
final_df.to_csv(out_path_june_2016_report+"P046957_-_report_1_-_all_complaints_in_time_frame-unlocked.csv",index=False)
final_df.to_excel(out_path_june_2016_report+"P046957_-_report_1_-_all_complaints_in_time_frame-unlocked.xlsx",index=False)

metadata_df.to_csv(out_path_june_2016_report+"P046957_-_report_1_-_all_complaints_in_time_frame-unlocked_metadata.csv",index=False)

### Report 2

In [13]:
in_path_june_2016_report2 = in_path+'/june_2016/'
out_path_june_2016_report2 = out_path+'/june_2016/'

files = [file for file in os.listdir(in_path_june_2016_report2) if 'unlocked' in file and '~$' not in file]
files

['P046957 - report 1 - all complaints in time frame-unlocked.xlsx',
 'P046957 - report 2 - identified accused xi-unlocked.xlsx',
 'P046957 - report 3 - police officer witness data xi-unlocked.xlsx',
 'P046957 - report 4 - victim data-unlocked.xlsx',
 'P046957 - report 5 - complainant (reporting party) data-unlocked.xlsx']

In [15]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
file = files[1]
df = pd.read_excel(in_path_june_2016_report2 + file,nrows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# +1 because of python indexing, +1 because of header in first df
skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
df = pd.read_excel(in_path_june_2016_report2 + file, skiprows=skip)
df.dropna(how='all', inplace=True)

## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]

## Need to fill in Number
df['Number:'].fillna(method='ffill', inplace=True)
df['Number:'] = df['Number:'].astype(int)

## Drops end of record 
df = df.dropna(subset=["Accused:","Gender:","Date of Appt:","Star:"],how="all",axis=0)

## drops the significant number of columns that are all nulls
df = null_dropper(df)
print(df.head())
df.columns = ["CRID","Accused","Accused_Birth_Year","Accused_Gender","Accused_Race_Code","Date_of_Appt","Current_Unit",
              "Current_Rank","Star","Complaint_Category","Orig_Finding","Orig_Recommended_Discipline","Final_Finding",
             "Final_Recommended_Discipline"]

## Excel reads NA as null for Orig and Final Finding, this returns it to NA status when discipline is not null
df["Orig_Finding"] = df['Orig_Finding'].astype(str)
df["Orig_Finding"] =np.where((df["Orig_Finding"]=='nan') & (~df["Orig_Recommended_Discipline"].isnull()),
                             "NA",
                             df["Orig_Finding"])

df["Final_Finding"] = df['Final_Finding'].astype(str)
df["Final_Finding"] = np.where((df["Final_Finding"]=='nan') & (~df["Final_Recommended_Discipline"].isnull()),
                             "NA",
                             df["Final_Finding"])

## replace the actual nulls with blanks
df["Orig_Finding"] = np.where((df["Orig_Finding"]=='nan'),
                             "",
                             df["Orig_Finding"])
df["Final_Finding"] =np.where((df["Final_Finding"]=='nan'),
                             "",
                             df["Final_Finding"])

## Adding File Metadata
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date[0].date()
except:
    df["Report_Produced_Date"]=''  

final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

   Number:         Accused: Birth Yr: Gender: Race Code: Date of Appt:  \
1   107901  GONZALES, ROBIN      1981       F          S    2008-04-28   
4   108026     BECKER, JOHN      1970       M        WHI    2000-01-24   
5   108026    MARKHAM, SEAN      1971       M          I    2000-06-19   
8  1038595      WELLS, OTIS      1967       M        BLK    2007-04-02   
9  1038595  MCCLAY, CHARLES      1983       M        BLK    2007-10-29   

   Current Unit: Current Rank:    Star: Complaint Category  \
1           18.0            PO   5137.0                NaN   
4           17.0            PO   4734.0                NaN   
5           17.0            PO  19054.0                NaN   
8            6.0            PO   5385.0      05D-NO ARREST   
9            3.0            PO   4735.0      05D-NO ARREST   

  Finding &\nRecommended Discipline  Unnamed: 11 Final Finding & \nDiscipline  \
1                               NaN          NaN                          NaN   
4                   

In [16]:
final_df.to_csv(out_path_june_2016_report2+saving_files[1]+".csv",index=False)
final_df.to_excel(out_path_june_2016_report2+saving_files[1]+".xlsx",index=False)

metadata_df.to_csv(out_path_june_2016_report2+saving_files[1]+"_metadata.csv",index=False)

### Report 3

In [17]:
in_path_june_2016_report3 = in_path+'/june_2016/'
out_path_june_2016_report3 = out_path+'/june_2016/'

files = [file for file in os.listdir(in_path_june_2016_report3) if 'unlocked' in file and '~$' not in file]
files

['P046957 - report 1 - all complaints in time frame-unlocked.xlsx',
 'P046957 - report 2 - identified accused xi-unlocked.xlsx',
 'P046957 - report 3 - police officer witness data xi-unlocked.xlsx',
 'P046957 - report 4 - victim data-unlocked.xlsx',
 'P046957 - report 5 - complainant (reporting party) data-unlocked.xlsx']

In [18]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
file = files[2]
df = pd.read_excel(in_path_june_2016_report3 + file,nrows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# +0 Because Number is miss-labeled
skip = np.where(df.iloc[:,0]=="Number:")[0][0]
df = pd.read_excel(in_path_june_2016_report3 + file, skiprows=skip)
df.dropna(how='all', inplace=True)

## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]
print(df.head())
## Filling Number Column
df['CRID'] = pd.to_numeric(df["Gender"],errors='coerce').fillna(method='ffill')
df['CRID'] = df['CRID'].astype(int)
## Drops end of record 
df = df.dropna(subset=["Unnamed: 0","Gender","Star"],how="all",axis=0)
## Drops CRID only Row
df = df[df["Gender"]!=df["CRID"].astype(str)]

## drops the significant number of columns that are all nulls
df = null_dropper(df)

df.columns = ["Officer_Witness","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star",
              "Officer_Witness_Birth_Year","Officer_Witness_Date_Appointed","CRID"]
df = df[["CRID","Officer_Witness","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star",
        "Officer_Witness_Birth_Year","Officer_Witness_Date_Appointed"]]


## Adding File Metadata
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date[0].date()
except:
    df["Report_Produced_Date"]=''  

final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

            Unnamed: 0   Gender           Race   Star  Birth Year  \
0              Number:  1053502            NaN    NaN         NaN   
1          HARRIS, KAL        M            BLK  14236      1974.0   
2                  NaN      NaN  end of record    NaN         NaN   
3              Number:  1053509            NaN    NaN         NaN   
4  BUKOWSKIBUS, GEORGE        M            WHI  11982      1966.0   

  Date Appointed  Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  Unnamed: 10  \
0            NaT         NaN         NaN         NaN         NaN          NaN   
1     2001-05-29         NaN         NaN         NaN         NaN          NaN   
2            NaT         NaN         NaN         NaN         NaN          NaN   
3            NaT         NaN         NaN         NaN         NaN          NaN   
4     1996-12-02         NaN         NaN         NaN         NaN          NaN   

   Unnamed: 11  
0          NaN  
1          NaN  
2          NaN  
3          NaN  
4          Na

In [19]:
final_df.to_csv(out_path_june_2016_report3+saving_files[2]+".csv",index=False)
final_df.to_excel(out_path_june_2016_report3+saving_files[2]+".xlsx",index=False)

metadata_df.to_csv(out_path_june_2016_report3+saving_files[2]+"_metadata.csv",index=False)

### Report 4

In [20]:
in_path_june_2016_report4 = in_path+'/june_2016/'
out_path_june_2016_report4 = out_path+'/june_2016/'

files = [file for file in os.listdir(in_path_june_2016_report4) if 'unlocked' in file and '~$' not in file]
files

['P046957 - report 1 - all complaints in time frame-unlocked.xlsx',
 'P046957 - report 2 - identified accused xi-unlocked.xlsx',
 'P046957 - report 3 - police officer witness data xi-unlocked.xlsx',
 'P046957 - report 4 - victim data-unlocked.xlsx',
 'P046957 - report 5 - complainant (reporting party) data-unlocked.xlsx']

In [21]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

file = files[3]
df = pd.read_excel(in_path_june_2016_report4 + file,nrows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# +1 because of python indexing
skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
df = pd.read_excel(in_path_june_2016_report4 + file, skiprows=skip)
df.dropna(how='all', inplace=True)
print(df.head())
## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]

## Filling Number Column
df['Number'].fillna(method='ffill', inplace=True)
df['Number'] = df['Number'].astype(int)

## Drops end of record 
df = df.dropna(subset=["Gender","Age","Race Desc"],how="all",axis=0)

## drops the significant number of columns that are all nulls
df = null_dropper(df)

df.columns = ["CRID","Victim_Gender","Victim_Age","Victim_Race"]

## drop end of record rows
df = df[df["Victim_Race"]!="end of record"]

## Adding File Metadata
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date[0].date()
except:
    df["Report_Produced_Date"]=''  
    
final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

      Number  Unnamed: 1 Gender        Age  Unnamed: 4      Race Desc  \
0  1039179.0         NaN    NaN        NaN         NaN            NaN   
1        NaN         NaN      F  25.832877         NaN          BLACK   
2        NaN         NaN      M  38.000000         NaN          BLACK   
3        NaN         NaN    NaN        NaN         NaN  end of record   
4  1053502.0         NaN    NaN        NaN         NaN            NaN   

   Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  Unnamed: 10  Unnamed: 11  \
0         NaN         NaN         NaN         NaN          NaN          NaN   
1         NaN         NaN         NaN         NaN          NaN          NaN   
2         NaN         NaN         NaN         NaN          NaN          NaN   
3         NaN         NaN         NaN         NaN          NaN          NaN   
4         NaN         NaN         NaN         NaN          NaN          NaN   

   Unnamed: 12  
0          NaN  
1          NaN  
2          NaN  
3          NaN  
4

In [22]:
final_df.head()

Unnamed: 0,CRID,Victim_Gender,Victim_Age,Victim_Race,FOIA_Request_Number,Report_Produced_Date
0,1039179,F,25.832877,BLACK,FOIA # P046957\nReport 4\nVictim Data\nIncide...,2016-06-16
1,1039179,M,38.0,BLACK,FOIA # P046957\nReport 4\nVictim Data\nIncide...,2016-06-16
2,1053502,F,,BLACK,FOIA # P046957\nReport 4\nVictim Data\nIncide...,2016-06-16
3,1053505,F,18.378082,BLACK,FOIA # P046957\nReport 4\nVictim Data\nIncide...,2016-06-16
4,1053509,F,42.747945,WHITE HISPANIC,FOIA # P046957\nReport 4\nVictim Data\nIncide...,2016-06-16


In [23]:
final_df.to_csv(out_path_june_2016_report4+saving_files[3]+".csv",index=False)
final_df.to_excel(out_path_june_2016_report4+saving_files[3]+".xlsx",index=False)

metadata_df.to_csv(out_path_june_2016_report4+saving_files[3]+"_metadata.csv",index=False)

### Report 5

In [24]:
in_path_june_2016_report5 = in_path+'/june_2016/'
out_path_june_2016_report5 = out_path+'/june_2016/'

files = [file for file in os.listdir(in_path_june_2016_report5) if 'unlocked' in file and '~$' not in file]
files

['P046957 - report 1 - all complaints in time frame-unlocked.xlsx',
 'P046957 - report 2 - identified accused xi-unlocked.xlsx',
 'P046957 - report 3 - police officer witness data xi-unlocked.xlsx',
 'P046957 - report 4 - victim data-unlocked.xlsx',
 'P046957 - report 5 - complainant (reporting party) data-unlocked.xlsx']

In [25]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

file = files[4]
df = pd.read_excel(in_path_june_2016_report5 + file,nrows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# +1 because of python indexing
skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
df = pd.read_excel(in_path_june_2016_report5 + file, skiprows=skip)
df.dropna(how='all', inplace=True)
print(df.head())
## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]

## Filling Number Column
df['Number'].fillna(method='ffill', inplace=True)
df['Number'] = df['Number'].astype(int)

## Drops end of record 
df = df.dropna(subset=["Gender","Age","Race Desc"],how="all",axis=0)

## drops the significant number of columns that are all nulls
df = null_dropper(df)

df.columns = ["CRID","Witness_Gender","Witness_Age","Witness_Race"]

## drop end of record rows
df = df[df["Witness_Race"]!="end of record"]

## Adding File Metadata
df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date[0].date()
except:
    df["Report_Produced_Date"]=''  
    
final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

      Number  Unnamed: 1 Gender        Age  Unnamed: 4      Race Desc  \
0  1038595.0         NaN    NaN        NaN         NaN            NaN   
1        NaN         NaN      F  63.805479         NaN          WHITE   
2        NaN         NaN    NaN        NaN         NaN  end of record   
3  1039179.0         NaN    NaN        NaN         NaN            NaN   
4        NaN         NaN      M  50.449315         NaN          BLACK   

   Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  Unnamed: 10  Unnamed: 11  
0         NaN         NaN         NaN         NaN          NaN          NaN  
1         NaN         NaN         NaN         NaN          NaN          NaN  
2         NaN         NaN         NaN         NaN          NaN          NaN  
3         NaN         NaN         NaN         NaN          NaN          NaN  
4         NaN         NaN         NaN         NaN          NaN          NaN  


In [26]:
final_df.head()

Unnamed: 0,CRID,Witness_Gender,Witness_Age,Witness_Race,FOIA_Request_Number,Report_Produced_Date
0,1038595,F,63.805479,WHITE,FOIA # P046957\nReport 5\nComplainant Data\nI...,2016-06-16
1,1039179,M,50.449315,BLACK,FOIA # P046957\nReport 5\nComplainant Data\nI...,2016-06-16
2,1039179,M,51.391781,BLACK,FOIA # P046957\nReport 5\nComplainant Data\nI...,2016-06-16
3,1039179,F,34.641096,BLACK,FOIA # P046957\nReport 5\nComplainant Data\nI...,2016-06-16
4,1053492,F,51.364384,BLACK,FOIA # P046957\nReport 5\nComplainant Data\nI...,2016-06-16


In [27]:
final_df.to_csv(out_path_june_2016_report5+saving_files[4]+".csv",index=False)
final_df.to_excel(out_path_june_2016_report5+saving_files[4]+".xlsx",index=False)

metadata_df.to_csv(out_path_june_2016_report5+saving_files[4]+"_metadata.csv",index=False)