In [21]:
import re
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io
import datetime

# Functions that help with the magic

### Can be used to remove columns that are all nulls so you don't have to check 

In [2]:
def null_dropper(df):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    info_values = [x[0] for x in info_values if x[1].startswith('0 non-null')]
    df = df.drop(info_values,axis=1)
    return df

### Creates metadata as we go

In [3]:
def metadata_dataset(df,file):
    buf = io.StringIO()
    df.info(buf=buf)
    s = buf.getvalue()
    info_values = [re.split("\\s\\s+",x) for x in s.split("\n")]
    info_values = [x for x in info_values if len(x)>1]
    metadata_df = pd.DataFrame(info_values)
    metadata_df["File"] = file
    metadata_df.columns = ["Column_Name","Column_Info","Original_Dataset"]
    ## Column Info Split
    metadata_df['Non_Null_Count'], metadata_df['Object_Type'] = metadata_df['Column_Info'].str.split(' ', 1).str
    metadata_df["Object_Type"] = metadata_df["Object_Type"].str.replace("non-null ","")
    ## unique counts for each variable
    uniques_df = df.apply(lambda x: len(x.unique())).reset_index()
    uniques_df.columns = ["Column_Name","Unique_Count"]
    metadata_df["Unique_Count"] = uniques_df["Unique_Count"]
    metadata_df = metadata_df[["Original_Dataset","Column_Name","Non_Null_Count","Unique_Count","Object_Type"]]
    return metadata_df

### Converts single column named City_State_Zip into separate columns

In [4]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def city_state_zip_splitter(df):
    new_states_list=[]
    for value in df["City_State_Zip"]:
        ## check if it contains a number (zipcode)
        if hasNumbers(value):
            split_state = value.split(" ")
            ## join city names until we have 3 values
            while len(split_state)>3:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        else:
            split_state = value.split(" ")
            ## join city names until we have 2 values
            while len(split_state)>2:
                split_state = [split_state[0]+' '+split_state[1]]+split_state[2:]
        new_states_list.append(split_state)
    city_state_zip =  pd.DataFrame(new_states_list)
    city_state_zip.columns = ["City","State","Zip"]
    return city_state_zip

### Establishes General Path

In [5]:
path = "/Users/thudson/Documents/Github/chicago-police-data/import"

In [12]:
in_path = path + '/input/complaints-cpd-2016-nov_copy_20170112/'
out_path =path + '/output/complaints-cpd-2016-nov_copy_20170112/'

In [39]:
location_code = pd.read_csv(path+'/doc/Location_Code_Dictionary.csv')
location_code.dropna(how='all', inplace=True)
location_code['Location_Code']=location_code['Location_Code'].astype(int).astype(str)

def padding(value):
    if len(value)<2:
        return "0"+value
    else:
        return value
    
location_code['Location_Code'] = location_code['Location_Code'].apply(padding)

## Nov 2016 Data

### Report 1

In [40]:
in_path_nov_2016_report = in_path
out_path_nov_2016_report = out_path

files = os.listdir(in_path_nov_2016_report)
files = [file for file in files if '.xls' in file and '1.' in file]
files

['p046957 - report 1.1 - all complaints in time frame.xls',
 'p046957 - report 1.2 - all complaints in time frame.xls',
 'p046957 - report 1.3 - all complaints in time frame.xls',
 'p046957 - report 1.4 - all complaints in time frame.xls',
 'p046957 - report 1.5 - all complaints in time frame.xls',
 'p046957 - report 1.6 - all complaints in time frame.xls']

In [45]:
saving_files = [file.replace(" ","_").replace(".xls","") for file in files]
saving_files = saving_files[0].replace('.1',"")

In [41]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
for file in files: 
    print(file)
    df = pd.read_excel(in_path_nov_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because of python indexing, +1 because of header in first df
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(in_path_nov_2016_report + file, skiprows=skip)
    df.dropna(how='all', inplace=True)
    ## remove end of record rows and page number row
    df = df.dropna(subset=["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 6"
                     ,"Incident Date & Time","Complaint Date","Closed Date"],how="all",axis=0)

    ## Need to move Investigator Name to Col 12
    df['Number:'].fillna(method='ffill', inplace=True)
    df['Number:'] = df['Number:'].astype(int)

    ## Investigator Rows have all others as null
    df1 = df[
       df["Incident Date & Time"].isnull() & 
       df["Complaint Date"].isnull() &
       df["Closed Date"].isnull()].loc[:,("Number:","Location Code:",
                                          "Address of Incident:","Unnamed: 4",
                                          "Unnamed: 5","Unnamed: 6")]

    df1.columns=["Number:","Investigator:","Assignment","Rank","Star","Appt_Date"]

    ## Merge Back Rows 
    df2 = df.merge(df1,how="left",on="Number:")

    ## Drop Original all null Investigator Rows
    df2 = df2.dropna(subset=["Incident Date & Time",
                     "Complaint Date","Closed Date"],how="all",axis=0)

    ## Replace ---- with empty strings
    df2 = df2.replace('----', "").replace('-----', "")
    ## Drop all null columns
    df2 = null_dropper(df2)

    ## Covert Address Columns into single column
    df2["Address of Incident:"] = df2['Address of Incident:'].astype(str)
    df2["Unnamed: 4"] = df2['Unnamed: 4'].astype(str)
    df2["Unnamed: 5"] = df2['Unnamed: 5'].astype(str)
    df2["Unnamed: 6"] = df2['Unnamed: 6'].astype(str)

    df2["Address of Incident:"] = df2[["Address of Incident:","Unnamed: 4", "Unnamed: 5"]].apply(lambda x: ' '.join(x), axis=1)
    
    df2["Address of Incident:"] = df2['Address of Incident:'].str.replace("nan","")
    df2["Address of Incident:"] = df2['Address of Incident:'].str.strip()
    
    df2 = df2[["Number:","Beat:","Location Code:","Address of Incident:","Unnamed: 6",
               "Incident Date & Time","Complaint Date","Closed Date","Investigator:",
              "Assignment","Rank","Star","Appt_Date"]]

    df2.columns = ["CRID","Beat","Location_Code","Address_of_Incident",
                      "City_State_Zip","Incident_Date","Complaint_Date", 
                      "Closed_Date","Investigator_Full_Name",
                  "Investigator_Assignment","Investigator_Rank",
                   "Investigator_Star","Investigator_Appt_Date"]

    ## Splitting City State Zip into three columns
    city_state_zip = city_state_zip_splitter(df2)
    df2.reset_index(drop=True, inplace=True)
    df2 = df2.merge(city_state_zip,how='left',right_index=True,left_index=True)
    ## Appending Location Type
    df2 = df2.merge(location_code,how='left',on='Location_Code')
    
    df2 = df2 [["CRID","Beat","Location_Code","Location_Value","Address_of_Incident",
                      "City","State","Zip","Incident_Date","Complaint_Date", 
                      "Closed_Date","Investigator_Full_Name",
                  "Investigator_Assignment","Investigator_Rank",
                   "Investigator_Star","Investigator_Appt_Date"]]
    
    ## Adding File Metadata
    df2["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except:
        df["Report_Produced_Date"]=''  
    ## Appending to Final File + Metadata
    final_df = final_df.append(df2)
    final_df.reset_index(drop=True,inplace=True)
    metadata_df = metadata_df.append(metadata_dataset(df2,file))
    metadata_df.reset_index(drop=True,inplace=True)

p046957 - report 1.1 - all complaints in time frame.xls
p046957 - report 1.2 - all complaints in time frame.xls
p046957 - report 1.3 - all complaints in time frame.xls
p046957 - report 1.4 - all complaints in time frame.xls
p046957 - report 1.5 - all complaints in time frame.xls
p046957 - report 1.6 - all complaints in time frame.xls


In [42]:
final_df.head()

Unnamed: 0,CRID,Beat,Location_Code,Location_Value,Address_of_Incident,City,State,Zip,Incident_Date,Complaint_Date,Closed_Date,Investigator_Full_Name,Investigator_Assignment,Investigator_Rank,Investigator_Star,Investigator_Appt_Date,FOIA_Request_Number
0,258996,1524,4,Police Building,5327 W CHICAGO,CHICAGO,IL,,2000-01-01 01:20:00,2000-01-01,2001-01-26,"SCHWIEGER, STEVEN",13,LIEUTENANT OF POLICE,,1986-06-16 00:00:00,FOIA P046957\nReport 1.1\nAll Complaints in CR...
1,258997,1115,17,Public Way - Other,4316 W JACKSON,CHICAGO,IL,,2000-01-01 01:30:00,2000-01-01,2000-10-14,"MULLIGAN JR, MICHAEL",8,SERGEANT OF POLICE,,1970-12-14 00:00:00,FOIA P046957\nReport 1.1\nAll Complaints in CR...
2,258998,1834,17,Public Way - Other,500 W ILLINOIS,CHICAGO,IL,,2000-01-01 00:28:00,2000-01-01,2001-01-18,"MC MAHON, MAUREEN",608,LIEUTENANT OF POLICE,,1985-07-01 00:00:00,FOIA P046957\nReport 1.1\nAll Complaints in CR...
3,258999,0,17,Public Way - Other,,CHICAGO,IL,,2000-01-01 03:30:00,2000-01-01,2000-03-23,"DEAN, BRUCE",113,SUPERVISING INV IPRA,,1995-09-16 00:00:00,FOIA P046957\nReport 1.1\nAll Complaints in CR...
4,259000,1524,4,Police Building,5327 W CHICAGO AVE,CHICAGO,IL,,2000-01-01 05:00:00,2000-01-01,2001-01-17,"LABERN, LINDA",20,SERGEANT OF POLICE,,1973-07-16 00:00:00,FOIA P046957\nReport 1.1\nAll Complaints in CR...


In [46]:
final_df.to_csv(out_path_nov_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_nov_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_nov_2016_report+saving_files+"_metadata.csv",index=False)


### Report 2

In [51]:
in_path_nov_2016_report = in_path
out_path_nov_2016_report = out_path

files = os.listdir(in_path_nov_2016_report)
files = [file for file in files if '.xls' in file and '2.' in file]
files

['p046957 - report 2.1 - identified accused.xls',
 'p046957 - report 2.2 - identified accused.xls',
 'p046957 - report 2.3 - identified accused.xls',
 'p046957 - report 2.4 - identified accused.xls',
 'p046957 - report 2.5 - identified accused.xls']

In [56]:
saving_files = [file.replace(" ","_").replace(".xls","") for file in files]
saving_files = saving_files[0].replace('.1',"")

In [55]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
for file in files:
    df = pd.read_excel(in_path_nov_2016_report + file,nrows=20)
    ## Making Sure Every File contains date the file was created and the foia that created it
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

    # +1 because of python indexing,
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(in_path_nov_2016_report + file, skiprows=skip)
    df.dropna(how='all', inplace=True)
    
    ## Remove leading and trailing whitespace from columns 
    df.columns = [col.strip() for col in df.columns.tolist()]

    ## Need to fill in Number
    df['Number:'].fillna(method='ffill', inplace=True)
    df['Number:'] = df['Number:'].astype(int)

    ## Drops end of record 
    df = df.dropna(subset=["Accused:","Gender:","Date of Appt:","Star:"],how="all",axis=0)
    
    ## drops the significant number of columns that are all nulls
    df = null_dropper(df)
    ##print(df.head())
    df.columns = ["CRID","Accused","Accused_Birth_Year","Accused_Gender","Accused_Race_Code","Date_of_Appt","Current_Unit","Current_Rank",
                  "Star","Complaint_Category","Orig_Finding","Orig_Recommended_Discipline","Final_Finding",
                 "Final_Recommended_Discipline"]

    ## Excel reads NA as null for Orig and Final Finding, this returns it to NA status when discipline is not null
    df["Orig_Finding"] = df['Orig_Finding'].astype(str)
    df["Orig_Finding"] =np.where((df["Orig_Finding"]=='nan') & (~df["Orig_Recommended_Discipline"].isnull()),
                                 "NA",
                                 df["Orig_Finding"])

    df["Final_Finding"] = df['Final_Finding'].astype(str)
    df["Final_Finding"] = np.where((df["Final_Finding"]=='nan') & (~df["Final_Recommended_Discipline"].isnull()),
                                 "NA",
                                 df["Final_Finding"])
    
    ## replace the actual nulls with blanks
    df["Orig_Finding"] = np.where((df["Orig_Finding"]=='nan'),
                                 "",
                                 df["Orig_Finding"])
    df["Final_Finding"] =np.where((df["Final_Finding"]=='nan'),
                                 "",
                                 df["Final_Finding"])

    df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date.date()
    except:
        df["Report_Produced_Date"]=''
    final_df = final_df.append(df)
    final_df.reset_index(drop=True,inplace=True)
    metadata_df = metadata_df.append(metadata_dataset(df,file))
    metadata_df.reset_index(drop=True,inplace=True)

In [57]:
final_df.head()

Unnamed: 0,CRID,Accused,Accused_Birth_Year,Accused_Gender,Accused_Race_Code,Date_of_Appt,Current_Unit,Current_Rank,Star,Complaint_Category,Orig_Finding,Orig_Recommended_Discipline,Final_Finding,Final_Recommended_Discipline,FOIA_Request_Number,Report_Produced_Date
0,258996,"BARRON, WILLIAM",1949.0,M,WHI,1978-02-27,18,SGT,,01A-USE OF PROFANITY,NS,600.0,NS,600.0,FOIA P046957\nReport 2.1\nAll Complaints in CR...,
1,258997,"C0NNOLLY, KIMBERLY",1965.0,F,BLK,1990-07-30,55,,11026.0,01A-USE OF PROFANITY,UN,600.0,UN,600.0,FOIA P046957\nReport 2.1\nAll Complaints in CR...,
2,258997,"KEENE, JOHN",1968.0,M,WHI,1999-03-08,153,PO,,01A-USE OF PROFANITY,UN,600.0,UN,600.0,FOIA P046957\nReport 2.1\nAll Complaints in CR...,
3,258998,"SLAVIN, SCOTT",1965.0,M,WHI,1991-11-18,145,SGT,807.0,10J-NEGLECT OF DUTY/CONDUCT UNBECOMING - ON DUTY,EX,600.0,EX,600.0,FOIA P046957\nReport 2.1\nAll Complaints in CR...,
4,259001,"MARTINEZ, ANTONIO",1971.0,M,S,1996-11-04,701,PO,,10U-INADEQUATE/FAILURE TO PROVIDE SERVICE,UN,600.0,UN,600.0,FOIA P046957\nReport 2.1\nAll Complaints in CR...,


In [58]:
saving_files

'p046957_-_report_2_-_identified_accused'

In [59]:
final_df.to_csv(out_path_nov_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_nov_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_nov_2016_report+saving_files+"_metadata.csv",index=False)

### Report 3

In [62]:
in_path_nov_2016_report = in_path
out_path_nov_2016_report = out_path

files = os.listdir(in_path_nov_2016_report)
files = [file for file in files if '.xls' in file and 'report 3' in file]
files

['p046957 - report 3 - police officer witness data xi.xls']

In [63]:
saving_files = [file.replace(" ","_").replace(".xls","") for file in files]
saving_files = saving_files[0]

In [72]:
saving_files

'p046957_-_report_3_-_police_officer_witness_data_xi'

In [68]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
file=files[0]
df = pd.read_excel(in_path_nov_2016_report + file,nrows=20)
## Making Sure Every File contains date the file was created and the foia that created it
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

# +1 because of python indexing
skip = np.where(df.iloc[:,0]=="Number:")[0][0]
df = pd.read_excel(in_path_nov_2016_report + file, skiprows=skip)
df.dropna(how='all', inplace=True)

## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]

## Filling Number Column
df['CRID'] = pd.to_numeric(df["Gender"],errors='coerce').fillna(method='ffill')
df['CRID'] = df['CRID'].astype(int)
## Drops end of record 
df = df.dropna(subset=["Unnamed: 0","Gender","Star"],how="all",axis=0)
## Drops CRID only Row
df = df[df["Gender"]!=df["CRID"].astype(str)]

## drops the significant number of columns that are all nulls
df = null_dropper(df)

df.columns = ["Officer_Witness","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star",
              "Officer_Witness_Birth_Year","Officer_Witness_Date_Appointed","CRID"]

df = df[["CRID","Officer_Witness","Officer_Witness_Gender","Officer_Witness_Race","Officer_Witness_Star",
        "Officer_Witness_Birth_Year","Officer_Witness_Date_Appointed"]]

df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except:
    df["Report_Produced_Date"]=''
final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

           Unnamed: 0 Gender Race   Star  Birth Year Date Appointed  \
1   DOLCIMASCOLO, NED      M  WHI    NaN      1945.0     1972-10-23   
4      BRYANT, YVONNE      F  BLK    NaN      1956.0     1998-08-31   
5      CARROLL, RONDY      M  BLK    NaN      1973.0     1997-08-04   
8     CASTILLO, DIEGO      M  WWH   7499      1963.0     1994-07-05   
11    RODIRGUEZ, GINA      F  WWH  20045      1964.0     1990-03-26   

    Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  Unnamed: 10    CRID  
1          NaN         NaN         NaN         NaN          NaN  259069  
4          NaN         NaN         NaN         NaN          NaN  259088  
5          NaN         NaN         NaN         NaN          NaN  259088  
8          NaN         NaN         NaN         NaN          NaN  259100  
11         NaN         NaN         NaN         NaN          NaN  259108  


In [73]:
final_df.to_csv(out_path_nov_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_nov_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_nov_2016_report+saving_files+"_metadata.csv",index=False)

### Report 4

In [74]:
in_path_nov_2016_report = in_path
out_path_nov_2016_report = out_path

files = os.listdir(in_path_nov_2016_report)
files = [file for file in files if '.xls' in file and 'report 4' in file]
files

['p046957 - report 4 - victim data.xls']

In [81]:
saving_files = [file.replace(" ","_").replace(".xls","") for file in files]
saving_files = saving_files[0]
saving_files

'p046957_-_report_4_-_victim_data'

In [80]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

file = files[0]
df = pd.read_excel(in_path_nov_2016_report + file,nrows=20)
## Making Sure Every File contains date the file was created and the foia that created it
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]

# +1 because of python indexing
skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
df = pd.read_excel(in_path_nov_2016_report + file, skiprows=skip)
df.dropna(how='all', inplace=True)

print(df.head())

## Remove leading and trailing whitespace from columns 
df.columns = [col.strip() for col in df.columns.tolist()]

## Filling Number Column
df['Number'].fillna(method='ffill', inplace=True)
df['Number'] = df['Number'].astype(int)

## Drops end of record 
df = df.dropna(subset=["Gender","Age","Race Desc"],how="all",axis=0)

## drops the significant number of columns that are all nulls
df = null_dropper(df)

df.columns = ["CRID","Victim_Gender","Victim_Age","Victim_Race"]

## drop end of record rows
df = df[df["Victim_Race"]!="end of record"]

df["FOIA_Request_Number"]=FOIA_Request
try:
    df["Report_Produced_Date"]=Report_Produced_Date.date()
except:
    df["Report_Produced_Date"]=''
    
final_df = final_df.append(df)
final_df.reset_index(drop=True,inplace=True)
metadata_df = metadata_df.append(metadata_dataset(df,file))
metadata_df.reset_index(drop=True,inplace=True)

      Number  Unnamed: 1 Gender        Age  Unnamed: 4       Race Desc  \
1  1000009.0         NaN    NaN        NaN         NaN             NaN   
2        NaN         NaN      F  37.106849         NaN  WHITE HISPANIC   
3        NaN         NaN    NaN        NaN         NaN   end of record   
4  1000015.0         NaN    NaN        NaN         NaN             NaN   
5        NaN         NaN      F  35.410959         NaN           BLACK   

   Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  Unnamed: 10  Unnamed: 11  
1         NaN         NaN         NaN         NaN          NaN          NaN  
2         NaN         NaN         NaN         NaN          NaN          NaN  
3         NaN         NaN         NaN         NaN          NaN          NaN  
4         NaN         NaN         NaN         NaN          NaN          NaN  
5         NaN         NaN         NaN         NaN          NaN          NaN  


In [19]:
final_df.to_csv(out_path_nov_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_nov_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_nov_2016_report+saving_files+"_metadata.csv",index=False)

### Report 5

In [83]:
in_path_nov_2016_report = in_path
out_path_nov_2016_report = out_path

files = os.listdir(in_path_nov_2016_report)
files = [file for file in files if '.xls' in file and 'report 5' in file]
files

['p046957 - report 5.1 - complainant (reporting party) data.xls',
 'p046957 - report 5.2 - complainant (reporting party) data.xls',
 'p046957 - report 5.3 - complainant (reporting party) data.xls']

In [84]:
saving_files = [file.replace(" ","_").replace(".xls","") for file in files]
saving_files = saving_files[0].replace('.1',"")
saving_files

'p046957_-_report_5_-_complainant_(reporting_party)_data'

In [89]:
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

for file in files:
    df = pd.read_excel(in_path_nov_2016_report + file,nrows=20)
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if isinstance(x, datetime.datetime)==False]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because of python indexing
    skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
    df = pd.read_excel(in_path_nov_2016_report + file, skiprows=skip)
    df.dropna(how='all', inplace=True)
    ##print(df.head())
    ## Remove leading and trailing whitespace from columns 
    df.columns = [col.strip() for col in df.columns.tolist()]

    ## Filling Number Column
    df['Number'].fillna(method='ffill', inplace=True)
    df['Number'] = df['Number'].astype(int)

    ## Drops end of record 
    df = df.dropna(subset=["Gender","Age","Race Desc"],how="all",axis=0)

    ## drops the significant number of columns that are all nulls
    df = null_dropper(df)

    df.columns = ["CRID","Witness_Gender","Witness_Age","Witness_Race"]

    ## drop end of record rows
    df = df[df["Witness_Race"]!="end of record"]

    ## Adding File Metadata
    df["FOIA_Request_Number"]=FOIA_Request
    try:
        df["Report_Produced_Date"]=Report_Produced_Date[0].date()
    except:
        df["Report_Produced_Date"]=''  

    final_df = final_df.append(df)
    final_df.reset_index(drop=True,inplace=True)
    metadata_df = metadata_df.append(metadata_dataset(df,file))
    metadata_df.reset_index(drop=True,inplace=True)

In [87]:
final_df.head()

Unnamed: 0,CRID,Witness_Gender,Witness_Age,Witness_Race,FOIA_Request_Number,Report_Produced_Date
0,1000000,F,48.767123,WHITE,FOIA # P046957\nReport 5.1\nComplainant Data\...,2016-10-29
1,1000001,M,31.641096,BLACK,FOIA # P046957\nReport 5.1\nComplainant Data\...,2016-10-29
2,1000002,M,42.621918,WHITE HISPANIC,FOIA # P046957\nReport 5.1\nComplainant Data\...,2016-10-29
3,1000004,M,60.254795,HISPANIC,FOIA # P046957\nReport 5.1\nComplainant Data\...,2016-10-29
4,1000004,M,52.750685,WHITE,FOIA # P046957\nReport 5.1\nComplainant Data\...,2016-10-29


In [88]:
final_df.to_csv(out_path_nov_2016_report+saving_files+".csv",index=False)
final_df.to_excel(out_path_nov_2016_report+saving_files+".xlsx",index=False)

metadata_df.to_csv(out_path_nov_2016_report+saving_files+"_metadata.csv",index=False)