In [1]:
import re
import datetime
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

In [2]:
# Put the parent directory of this notebook on the import path so that the
# shared `utils` package living there can be imported.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
# Star import of the project helpers; the names this notebook goes on to use
# that are not defined here (dropbox_handler, null_dropper, metadata_dataset)
# presumably come from this module.
from utils.utils import *

### Establish General Paths

In [3]:
# Local root directory for the cleaned outputs (out_path is built from it below).
# Placeholder value: replace it with a real directory before running the save cells.
local_path = "/your/path/here"

In [4]:
# Remote root of the import data; input datasets live under <path>/input and
# documentation files under <path>/doc.
path = '/Data/chicago-police-data/import'
# Handle used for every remote listing/download in this notebook
# (dropbox_handler is presumably provided by the utils star import).
db = dropbox_handler()
# Show the available input datasets.
db.list_files(path+'/input')

['complaints-merged-2015_copy_20170112',
 'complaints-cpd-2016-dec_copy_20170112',
 'complaints-cpd-2016-jun_copy_20170112',
 'complaints-cpd-2016-nov_copy_20170112',
 'complaints-cpd-2016-oct_copy_20170112',
 'complaints-ipra-2016-apr_copy_20170112',
 'shootings-cpd-feb2016_copy_20170112',
 'shootings-ipra-may2016_copy_20170112',
 'TRRs']

In [6]:
# Folder name of the dataset handled by this notebook; it is the same under the
# remote input tree and the local output tree.
dataset_dir = 'shootings-cpd-feb2016_copy_20170112'
in_path = path + '/input/' + dataset_dir + '/'
out_path = local_path + '/output/' + dataset_dir + '/'

In [7]:
# Load the location-code lookup table, discard completely empty rows and store
# the code as a string of digits (cast through int so no decimal part remains).
location_code = db.download_file(path + '/doc/', 'Location_Code_Dictionary.csv')
location_code = location_code.dropna(how='all')
location_code['Location_Code'] = location_code['Location_Code'].astype(int).astype(str)

In [8]:
# Preview the first rows of the location-code lookup table.
location_code.head()

Unnamed: 0,Location_Code,Location_Value
0,1,Food Sales/Restaurant
1,2,Tavern/Liquor Store
2,3,Other Business Establishment
3,4,Police Building
4,5,Lockup Facility


## Feb 2016 Data

### Report 1

In [9]:
# Report-specific aliases of the dataset input/output folders.
in_path_feb_2016_report, out_path_feb_2016_report = in_path, out_path

# Keep only the Excel workbooks found in the input folder.
files = [name for name in db.list_files(in_path_feb_2016_report) if '.xls' in name]
files

['18A - 1 - Incident(1).xls',
 '18A - 3 -  Involved Member(1).xls',
 '18A - 4 -  cpd witness(1).xls',
 '18A - 5 -  cpd Reporting Party(1).xls',
 '18a 18b 20a victim-detainee demographics(1).xls',
 '18b - 1 - Incidents(1).xls',
 '18b - 3 - Involved Member(1).xls',
 '18b - 4 - cpd witness(1)Updated Info.xls',
 '18b - 5 - cpd reporting party(1).xls',
 '20A - 1 - Incident(1).xls',
 '20A - 3 -  Involved Member(1).xls',
 '20A - 4 - cpd Witness(1).xls',
 '20A - 5 - cpd Reporting Party(1).xls',
 'Copy of 18a - 2 - Incident Address(1) Block Level (2)(1)updated.xls',
 'Copy of 18b - 2 - incident address(1) block level(1)updated.xls',
 'Copy of 20A - 2 - Incident address(1)block level X(2)updated report.xls',
 'crms - 05j complaint and investigator(1).xls',
 'crms - 05j cpd witness, reporting party, victim(1).xls',
 'crms - 05j Officer(1).xls',
 'stars for 18a - involved members(1).xls',
 'stars for 18b - involved members(1).xls',
 'stars for 20a - involved members(1).xls']

In [10]:
# Report 1: parse every "- 1 -" incident workbook and stack the results.
incident_files = [file for file in files if '- 1 -' in file]

# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
incident_frames = []   # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook
for file in incident_files:
    # Preview read: only used to recover file-level metadata and to locate the
    # real header row.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the date the report was produced and the
    # FOIA request that produced it. Both are read from the preview's column
    # labels: the date arrives as a datetime object, the FOIA label as text.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # "Log No" is found at preview position k. A single +1 accounts for the
    # sheet row already consumed as the preview header, so the re-read starts
    # with the "Log No" row as its header (assuming `skip` means rows-to-skip).
    skip = np.where(df.iloc[:, 0] == "Log No")[0][0] + 1
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all')
    # Remove end-of-record rows and page-number rows: none of these fields is set.
    df = df.dropna(subset=["Log No", "Assignment", "Initial Category", "Assigned Team"],
                   how="all", axis=0)

    df.columns = ["CRID", "Assignment", "Initial_Category", "Assigned_Team", "Team_Assigned_Date",
                  "Investigator_Assigned", "Investigator_Assigned_Date", "Supervisor_Assigned",
                  "IPRA_Closed_Date"]

    # NOTE(review): values that do not match this format silently become NaT
    # (errors='coerce') -- confirm the format against the source workbook.
    df["Team_Assigned_Date"] = pd.to_datetime(df["Team_Assigned_Date"],
                                              format="%Y%m%d %H:%M", errors='coerce')

    df["CRID"] = df["CRID"].astype(int)

    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    incident_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if incident_frames:
    final_df = pd.concat(incident_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [11]:
# Write the stacked Report 1 data (CSV and Excel) and its metadata next to it.
saving_files = "1_-_Incident(1)"
output_stem = out_path_feb_2016_report + saving_files

final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Report 2

In [12]:
# Report 2: parse every "- 2 -" incident-address workbook and stack the results.
incident_files = [file for file in files if '- 2 -' in file]


def row_switcher(row):
    """Right-align a three-part address split inside the four-slot layout.

    `row` is one row of the comma-split address with labels 0-3
    (street, apartment, city, "state zip"). When slot 3 is empty the address
    had no apartment part, so city and "state zip" are moved one slot to the
    right and the apartment slot is blanked.
    """
    if pd.isnull(row[3]):
        row[3] = row[2]
        row[2] = row[1]
        row[1] = None
    return row


# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
address_frames = []    # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook
for file in incident_files:
    # Preview read: only used for file-level metadata and header location.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the report date and the FOIA request label,
    # both taken from the preview's column labels.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because the first sheet row was consumed as the preview header.
    skip = df[df.iloc[:, 0].str.contains("Log No") == True].index.values[0] + 1
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    # Drop all-null columns.
    df = null_dropper(df)
    if df.shape[1] == 4:
        df.columns = ["CRID", "Incident_Date", "Incident_Address", "District"]
    else:
        # Five-column layout: the address is split over two columns that are
        # joined back together. Any other column count raises here.
        df.columns = ["CRID", "Incident_Date", "Incident_Number", "Incident_Address", "District"]
        df["Incident_Address"] = df[["Incident_Number", "Incident_Address"]].apply(
            lambda x: ' '.join(x), axis=1)
        df = df[["CRID", "Incident_Date", "Incident_Address", "District"]]

    # Split the date range into start and end date.
    df1 = df["Incident_Date"].str.split(" - ", expand=True)
    df = df.merge(df1, how='left', right_index=True, left_index=True)

    df.columns = ["CRID", "Incident_Date", "Incident_Address", "District",
                  "Incident_Start_Date", "Incident_End_Date"]
    df["Incident_Start_Date"] = pd.to_datetime(df["Incident_Start_Date"],
                                               format="%d-%b-%Y %H:%M", errors='coerce')
    df["Incident_End_Date"] = pd.to_datetime(df["Incident_End_Date"],
                                             format="%d-%b-%Y %H:%M", errors='coerce')

    # Split the address into street, apartment, city and "state zip".
    df2 = df["Incident_Address"].str.split(",", expand=True)
    if df2.shape[1] == 4:
        # Some rows have an apartment part, some do not: right-align the latter.
        df2 = df2.apply(row_switcher, axis=1)
    else:
        # No row has an apartment part: insert an empty apartment slot at position 1.
        df2[3] = None
        df2 = df2[[0, 3, 1, 2]]
        df2.columns = [0, 1, 2, 3]
    # The last slot holds "state zip"; split it on the space.
    max_val = df2.shape[1] - 1
    df3 = df2[max_val].str.strip().str.split(" ", expand=True)
    df2 = df2.merge(df3, left_index=True, right_index=True)
    df2.columns = ["Incident_Address", "Incident_Apt", "Incident_City", "State_Zip",
                   "Incident_State", "Incident_Zip"]
    df2 = df2[["Incident_Address", "Incident_Apt", "Incident_City", "Incident_State", "Incident_Zip"]]

    # "Incident_Address" exists on both sides, so the merge suffixes it;
    # keep the split street part ("_y") and restore the plain name.
    df = df.merge(df2, left_index=True, right_index=True)
    df = df[["CRID", "Incident_Date", "Incident_Address_y", "Incident_Apt",
             "Incident_City", "Incident_State", "Incident_Zip", "District",
             "Incident_Start_Date", "Incident_End_Date"]].copy()
    df.columns = ["CRID", "Incident_Date", "Incident_Address", "Incident_Apt",
                  "Incident_City", "Incident_State", "Incident_Zip", "District",
                  "Incident_Start_Date", "Incident_End_Date"]

    df["CRID"] = df["CRID"].astype(int)
    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    address_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if address_frames:
    final_df = pd.concat(address_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [13]:
# Write the stacked Report 2 data (CSV and Excel) and its metadata next to it.
saving_file = "Incident_Address(1)_Block_Level"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)


### Report 3

In [14]:
# Report 3: parse every "- 3 -" involved-member workbook and stack the results.
incident_files = [file for file in files if '- 3 -' in file]

# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
member_frames = []     # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook
for file in incident_files:
    # Preview read: only used for file-level metadata and header location.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the report date and the FOIA request label,
    # both taken from the preview's column labels.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because the first sheet row was consumed as the preview header.
    skip = df[df.iloc[:, 0].str.contains("Log No") == True].index.values[0] + 1
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    # Drop all-null columns.
    df = null_dropper(df)
    df.columns = ["CRID", "Involved_Officer"]

    df = df.dropna(how='all')

    df["CRID"] = df["CRID"].astype(int)
    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    member_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if member_frames:
    final_df = pd.concat(member_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [15]:
# Write the stacked Report 3 data (CSV and Excel) and its metadata next to it.
saving_file = "Involved_Member(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Report 4

In [16]:
# Report 4: parse every "- 4 -" CPD-witness workbook and stack the results.
incident_files = [file for file in files if '- 4 -' in file]

# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
witness_frames = []    # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook
for file in incident_files:
    # Preview read: only used for file-level metadata and header location.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the report date and the FOIA request label,
    # both taken from the preview's column labels.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because the first sheet row was consumed as the preview header.
    skip = df[df.iloc[:, 0].str.contains("Log No") == True].index.values[0] + 1
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    # Drop all-null columns.
    df = null_dropper(df)
    df.columns = ["CRID", "Officer_Witness"]

    df = df.dropna(how='all')

    df["CRID"] = df["CRID"].astype(int)
    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    witness_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if witness_frames:
    final_df = pd.concat(witness_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [17]:
# Write the stacked Report 4 data (CSV and Excel) and its metadata next to it.
saving_file = "CPD_Witness(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Report 5

In [18]:
# Report 5: parse every "- 5 -" CPD-reporting-party workbook and stack the results.
incident_files = [file for file in files if '- 5 -' in file]

# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
reporting_frames = []  # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook
for file in incident_files:
    # Preview read: only used for file-level metadata and header location.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the report date and the FOIA request label,
    # both taken from the preview's column labels.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # +1 because the first sheet row was consumed as the preview header.
    skip = df[df.iloc[:, 0].str.contains("Log No") == True].index.values[0] + 1
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    # Drop all-null columns.
    df = null_dropper(df)
    df.columns = ["CRID", "Officer_Reporting_Party"]

    df = df.dropna(how='all')

    df["CRID"] = df["CRID"].astype(int)
    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    reporting_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if reporting_frames:
    final_df = pd.concat(reporting_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [19]:
# Write the stacked Report 5 data (CSV and Excel) and its metadata next to it.
saving_file = "CPD_Reporting_Party(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Other Files in the Works

In [20]:
# Markers of the numbered reports already handled above.
done_list = ['- 1 -', '- 2 -', '- 3 -', '- 4 -', '- 5 -']
# Split the workbook list into those already parsed and those still to do,
# keeping the original order in both lists.
done_files = []
other_files = []
for name in files:
    if any(marker in name for marker in done_list):
        done_files.append(name)
    else:
        other_files.append(name)

In [21]:
# Display the workbooks not covered by the numbered reports above.
other_files

['18a 18b 20a victim-detainee demographics(1).xls',
 'crms - 05j complaint and investigator(1).xls',
 'crms - 05j cpd witness, reporting party, victim(1).xls',
 'crms - 05j Officer(1).xls',
 'stars for 18a - involved members(1).xls',
 'stars for 18b - involved members(1).xls',
 'stars for 20a - involved members(1).xls']

### Victim Detainee

In [22]:
# Victim/detainee demographics workbook (first of the remaining files).
file = other_files[0]
# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

# Preview read: only used for file-level metadata.
df = db.download_file(in_path_feb_2016_report, file, rows=20)
# Make sure the file carries the report date and the FOIA request label,
# both taken from the preview's column labels.
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
# Fixed offset found by inspecting the file: it has no recognisable header row.
skip = 7
df = db.download_file(in_path_feb_2016_report, file, skip=skip)
df = df.dropna(how='all').reset_index(drop=True)
# Drop all-null columns.
df = null_dropper(df)
df.columns = ["CRID", "Detainee_Age", "Detainee_Gender", "Detainee_Race"]
# Carry each CRID down over the rows it leaves blank. Assigning the result
# back (instead of a chained inplace fillna on the column) keeps working under
# pandas copy-on-write, and .ffill() avoids the deprecated method= argument.
df['CRID'] = df['CRID'].ffill().astype(int)

# File-level metadata; the date is left blank when the preview header held none.
df["FOIA_Request_Number"] = FOIA_Request
df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

# Single workbook: the final frames are just re-indexed copies
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
final_df = pd.concat([df], ignore_index=True)
# assumes metadata_dataset returns a DataFrame -- TODO confirm
metadata_df = pd.concat([metadata_dataset(df, file)], ignore_index=True)

In [23]:
# Write the victim/detainee data (CSV and Excel) and its metadata next to it.
saving_file = "victim-detainee_demographics(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Stars

In [24]:
# Stars: parse every "stars for ... involved members" workbook and stack the results.
star_files = [file for file in other_files if 'stars' in file]
# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()
star_frames = []       # one cleaned frame per workbook
metadata_frames = []   # one metadata_dataset() result per workbook

for file in star_files:
    # Preview read: only used for file-level metadata and header location.
    df = db.download_file(in_path_feb_2016_report, file, rows=20)
    # Make sure every file carries the report date and the FOIA request label,
    # both taken from the preview's column labels.
    col_list = df.columns.tolist()
    Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
    col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
    FOIA_Request = [x for x in col_list if 'FOIA' in x][0]
    # These files have a distinct header: "PERS_LAST_NME" appears in the second
    # or, failing that, the third preview column. Search both explicitly rather
    # than using a bare except as the fallback mechanism.
    skip = None
    for header_col in (1, 2):
        has_header = df.iloc[:, header_col].astype(str).str.contains("PERS_LAST_NME")
        if has_header.any():
            # +1 for the sheet row consumed as the preview header, +1 more as in
            # the original code.
            # NOTE(review): if `skip` means rows-to-skip, the extra +1 makes the
            # row after "PERS_LAST_NME" the header of the re-read, and the rename
            # below then overwrites it -- confirm that row holds no data.
            skip = has_header[has_header].index.values[0] + 1 + 1
            break
    if skip is None:
        raise ValueError("PERS_LAST_NME header not found in " + file)
    df = db.download_file(in_path_feb_2016_report, file, skip=skip)
    df = df.dropna(how='all').reset_index(drop=True)
    # Drop all-null columns.
    df = null_dropper(df)
    df.columns = ["Officer_Last_Name", "Officer_First_Name", "Star", "Officer_Description"]

    # File-level metadata; the date is left blank when the preview header held none.
    df["FOIA_Request_Number"] = FOIA_Request
    df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

    star_frames.append(df)
    # assumes metadata_dataset returns a DataFrame -- TODO confirm
    metadata_frames.append(metadata_dataset(df, file))

# Concatenate once instead of growing the frames inside the loop
# (DataFrame.append was removed in pandas 2.0).
if star_frames:
    final_df = pd.concat(star_frames, ignore_index=True)
    metadata_df = pd.concat(metadata_frames, ignore_index=True)

In [25]:
# Write the stacked stars data (CSV and Excel) and its metadata next to it.
saving_file = "stars-for_involved_members(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Complaint & Investigator

In [26]:
# CRMS complaint-and-investigator workbook: a form-style sheet in which each
# complaint spans several rows; it is unpacked into one row per complaint.
file = other_files[1]
# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

# Preview read: only used for file-level metadata.
df = db.download_file(in_path_feb_2016_report, file, rows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
# In this workbook the FOIA label is read from the first preview row, not the header.
FOIA_Request = [x for x in df.iloc[0, :].astype(str).tolist() if 'FOIA' in x][0]
# Fixed offset found by inspecting the file: it has no recognisable header row.
skip = 12
df = db.download_file(in_path_feb_2016_report, file, skip=skip)
df = df.dropna(how='all').reset_index(drop=True)
# Drop all-null columns.
df = null_dropper(df)

# The first column mixes the complaint number with field labels. Blank the
# labels explicitly, then carry the complaint number down over them.
# (replace(label, None) is avoided: before pandas 1.4 it pad-fills, from 1.4
# on it inserts None, so an explicit mask is the version-independent form.)
label_rows = df['Unnamed: 0'].isin(["Beat:", "Incident Date/Time:", "Investigator:"])
df['Unnamed: 0'] = df['Unnamed: 0'].mask(label_rows).ffill().astype(int)

df = df.dropna(subset=['Unnamed: 1', 'Unnamed: 2', "Unnamed: 3", "Unnamed: 4"], how='all')

# Location part: beat, address pieces and location description.
df1 = df.loc[:, ['Unnamed: 0', 'Unnamed: 1', "Unnamed: 3", "Unnamed: 4", 'Unnamed: 5',
                 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 10']]
df1 = df1.dropna(subset=['Unnamed: 1', "Unnamed: 10"], how='all')
df1.columns = ['CRID', 'Beat', 'Block', 'Location', 'Street', 'City', 'State', 'Location_Value']

# Date part: rows where both date cells are filled, minus the row that only
# holds the "Investigator End Date:" label.
df2 = df.loc[:, ['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 6']]
df2 = df2.dropna(subset=['Unnamed: 0', 'Unnamed: 2', "Unnamed: 6"], how='any')
df2.columns = ['CRID', 'Incident_Date', 'Complaint_Date']
df2 = df2[df2['Complaint_Date'] != 'Investigator End Date:']

# Investigator part. The subset includes the always-populated complaint-number
# column, so this dropna removes nothing; it is kept because the every-third-row
# selection below depends on the row count staying as it is.
df3 = df.loc[:, ['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 4', 'Unnamed: 8']]
df3 = df3.dropna(subset=['Unnamed: 0', 'Unnamed: 2', "Unnamed: 4", 'Unnamed: 8'], how='all')
df3.columns = ['CRID', 'Investigator', 'Assigned_Date', 'Investigator_End_Date']

# Every third row (starting with the third) contains the investigator data.
# NOTE(review): this relies on each complaint occupying exactly three rows here.
df3 = df3.iloc[2::3, :]

df4 = df1.merge(df2, how='left', right_on='CRID', left_on='CRID')
df4 = df4.merge(df3, how='left', right_on='CRID', left_on='CRID')

# Build a single address string from its three pieces.
df4['Block'] = df4['Block'].astype(str)
df4["Incident_Address"] = df4[["Block", "Location", "Street"]].apply(lambda x: ' '.join(x), axis=1)
# "code - description" is split on the first " - " only.
df5 = df4['Location_Value'].str.split(' - ', n=1, expand=True)
df5.columns = ['Location_Code', 'Location_Value']
# "Location_Value" exists on both sides, so the merge suffixes it; keep the
# split description ("_y") and restore the plain name.
df = df4.merge(df5, how='left', right_index=True, left_index=True)

df = df[['CRID', 'Beat', 'Incident_Address', 'City', 'State', 'Location_Code', 'Location_Value_y',
         'Incident_Date', 'Complaint_Date', 'Investigator', 'Assigned_Date',
         'Investigator_End_Date']].copy()

df.columns = ['CRID', 'Beat', 'Incident_Address', 'City', 'State', 'Location_Code', 'Location_Value',
              'Incident_Date', 'Complaint_Date', 'Investigator', 'Assigned_Date',
              'Investigator_End_Date']

# File-level metadata; the date is left blank when the preview header held none.
df["FOIA_Request_Number"] = FOIA_Request
df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

# Single workbook: the final frames are just re-indexed copies
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
final_df = pd.concat([df], ignore_index=True)
# assumes metadata_dataset returns a DataFrame -- TODO confirm
metadata_df = pd.concat([metadata_dataset(df, file)], ignore_index=True)

In [27]:
# Write the complaint/investigator data (CSV and Excel) and its metadata next to it.
saving_file = "crms_-_05j_complaint_and_investigator(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Witness, Reporting Party, and Victim

In [28]:
# CRMS witness / reporting-party / victim workbook.
file = other_files[2]
# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

# Preview read: only used for file-level metadata.
df = db.download_file(in_path_feb_2016_report, file, rows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]
# In this workbook the FOIA label is read from the first preview row, not the header.
FOIA_Request = [x for x in df.iloc[0, :].astype(str).tolist() if 'FOIA' in x][0]
# Fixed offset found by inspecting the file: it has no recognisable header row.
skip = 8
df = db.download_file(in_path_feb_2016_report, file, skip=skip)
df = df.dropna(how='all').reset_index(drop=True)
# Drop all-null columns.
df = null_dropper(df)

# The first column mixes complaint numbers (all-digit cells) with other values.
# Split it with one mask: digits go to CRID, everything else to Values.
# Both columns are derived from this cell's own data instead of comparing
# against a `df1` left over from an earlier cell, which never matched and let
# the complaint numbers through into Values.
crid_rows = df['Unnamed: 0'].astype(str).str.isnumeric()
df['CRID'] = df['Unnamed: 0'].where(crid_rows)
df['Values'] = df['Unnamed: 0'].where(~crid_rows)

# Carry each complaint number down over the rows that belong to it.
# Assigning the result back (instead of a chained inplace fillna) keeps working
# under pandas copy-on-write.
df['CRID'] = df['CRID'].ffill().astype(int)
# Rows holding nothing but a complaint number carry no information of their own.
# ('Ethicity' is the spelling used by the source column header.)
df = df.dropna(subset=['Role', 'CPD or Not', 'Gender', 'Ethicity', 'Year of Birth',
                       'Star', 'Position', 'Values'],
               how='all')

df = df[['CRID', 'Values', 'Role', 'CPD or Not', 'Gender', 'Ethicity',
         'Year of Birth', 'Star', 'Position']].copy()
df.columns = ['CRID', 'Values', 'Role', 'CPD_or_Not', 'Gender', 'Ethnicity',
              'Year of Birth', 'Star', 'Position']

# File-level metadata; the date is left blank when the preview header held none.
df["FOIA_Request_Number"] = FOIA_Request
df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

# Single workbook: the final frames are just re-indexed copies
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
final_df = pd.concat([df], ignore_index=True)
# assumes metadata_dataset returns a DataFrame -- TODO confirm
metadata_df = pd.concat([metadata_dataset(df, file)], ignore_index=True)

In [29]:
# Write the witness/reporting-party/victim data (CSV and Excel) and its metadata next to it.
saving_file = "crms_-_05j_cpd_witness,_reporting_party,_victim(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)

### Officer

In [30]:
# CRMS officer workbook: a form-style sheet in which each accused officer spans
# several rows; it is unpacked into one row per complaint/officer pair.
file = other_files[3]
# Reset up front so a failure part-way through cannot leave a previous
# report's frames behind for the save cell.
final_df = pd.DataFrame()
metadata_df = pd.DataFrame()

# Preview read: only used for file-level metadata.
df = db.download_file(in_path_feb_2016_report, file, rows=20)
col_list = df.columns.tolist()
Report_Produced_Date = [x for x in col_list if isinstance(x, datetime.datetime)]
col_list = [x for x in col_list if not isinstance(x, datetime.datetime)]

# In this workbook the FOIA label is read from the first preview row, not the header.
FOIA_Request = [x for x in df.iloc[0, :].astype(str).tolist() if 'FOIA' in x][0]
# Fixed offset found by inspecting the file: it has no recognisable header row.
skip = 11
df = db.download_file(in_path_feb_2016_report, file, skip=skip)
df = df.dropna(how='all').reset_index(drop=True)

# Carry each complaint number down over the rows that belong to it.
# Assigning the result back (instead of a chained inplace fillna) keeps working
# under pandas copy-on-write.
df['Unnamed: 0'] = df['Unnamed: 0'].ffill().astype(int)

# Blank the field-label cells so that columns holding nothing but labels become
# all-null and are removed by null_dropper below. An explicit mask is used
# because replace(label, None) pad-fills before pandas 1.4 and inserts None
# from 1.4 on.
label_cells = {
    'Unnamed: 1': ["Accused:"],
    'Unnamed: 2': ["Accused Unit:"],
    'Unnamed: 3': ["Recom Finding:", "Recom Action:"],
    'Unnamed: 9': ["Rank:"],
    'Unnamed: 14': ["Star:"],
    'Unnamed: 16': ["Final Finding:", "Final Action:"],
    'Unnamed: 18': ["Gender:"],
    'Unnamed: 22': ["Date of Appointment:"],
    'Unnamed: 24': ["Ethnicity:"],
    'Unnamed: 31': ["Year of Birth:"],
}
for column, labels in label_cells.items():
    df[column] = df[column].mask(df[column].isin(labels))

# Drop all-null columns.
df = null_dropper(df)

# Keep only rows that carry at least one value field.
df = df.dropna(subset=['Unnamed: 4', 'Unnamed: 6', 'Unnamed: 8', 'Unnamed: 11', "Unnamed: 15",
                       'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 26', 'Unnamed: 28',
                       'Unnamed: 32'], how='all')

# Carry the officer name down over the follow-up rows of the same officer.
df['Unnamed: 4'] = df['Unnamed: 4'].ffill()

# Demographics row of each officer.
df1 = df.loc[:, ['Unnamed: 0', 'Unnamed: 4', "Unnamed: 15", "Unnamed: 21", 'Unnamed: 26',
                 'Unnamed: 32']]
df1 = df1.dropna(subset=["Unnamed: 15", 'Unnamed: 21', 'Unnamed: 26', 'Unnamed: 32'], how='all')
df1.columns = ['CRID', 'Officer', 'Star', 'Gender', 'Race', 'Age']

# Unit / position / appointment row of each officer.
df2 = df.loc[:, ['Unnamed: 0', 'Unnamed: 4', 'Unnamed: 6', 'Unnamed: 11', 'Unnamed: 28']]
df2 = df2.dropna(subset=['Unnamed: 6', 'Unnamed: 11', "Unnamed: 28"], how='all')
df2.columns = ['CRID', 'Officer', 'Unit', 'Position', 'Appointment_Date']

# Finding and action rows of each officer.
df3 = df.loc[:, ['Unnamed: 0', 'Unnamed: 4', 'Unnamed: 8', 'Unnamed: 20']]
df3 = df3.dropna(subset=['Unnamed: 8', "Unnamed: 20"], how='all')
df3.columns = ['CRID', 'Officer', 'Recommended_Finding', 'Final_Finding']
# Finding and action sit directly under each other: even positions are the
# findings, odd positions the actions.
# NOTE(review): this relies on every officer having exactly one of each row.
df4 = df3.iloc[::2, :]
df5 = df3.iloc[1::2, :].copy()
df5.columns = ['CRID', 'Officer', 'Recommended_Action', 'Final_Action']
df4 = df4.merge(df5, how='left', on=['CRID', 'Officer'])

# Reassemble one row per complaint/officer pair.
dff = df1.merge(df2, how='left', on=['CRID', 'Officer'])
dff = dff.merge(df4, how='left', on=['CRID', 'Officer'])

df = dff

# File-level metadata; the date is left blank when the preview header held none.
df["FOIA_Request_Number"] = FOIA_Request
df["Report_Produced_Date"] = Report_Produced_Date[0].date() if Report_Produced_Date else ''

# Single workbook: the final frames are just re-indexed copies
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
final_df = pd.concat([df], ignore_index=True)
# assumes metadata_dataset returns a DataFrame -- TODO confirm
metadata_df = pd.concat([metadata_dataset(df, file)], ignore_index=True)

In [31]:
# Write the officer data (CSV and Excel) and its metadata next to it.
saving_file = "crms_-_05j_Officer(1)"
output_stem = out_path_feb_2016_report + saving_file
final_df.to_csv(output_stem + ".csv", index=False)
final_df.to_excel(output_stem + ".xlsx", index=False)

metadata_df.to_csv(output_stem + "_metadata.csv", index=False)