In [1]:
import re
import numpy as np
import pandas as pd 
import os
import sys
import itertools
import io

#### Helper that drops columns containing only null values, so they don't have to be checked for by hand

In [2]:
def null_dropper(df):
    """Return a copy of ``df`` without the columns that contain only nulls.

    The previous implementation parsed the text printed by ``df.info()``,
    which depends on the pandas version's output layout and on column labels
    being strings without runs of whitespace. Asking pandas directly is
    independent of both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns with at least one non-null value.
    """
    return df.dropna(axis=1, how='all')

In [3]:
# Folder holding the raw Excel exports, and the folder the cleaned files are written to.
# Both must end with a path separator: file names are appended with `+` elsewhere in the notebook.
path = '/Users/thudson/Documents/Invisible Institute/p046957/'
out_path = '/Users/thudson/Documents/Invisible Institute/Formatted_Files/'
# os.listdir() returns entries in arbitrary order; sort them so the files are
# always read (and therefore concatenated) in the same order on every run.
files = sorted(os.listdir(path))
file_types = ['report 1','report 2','report 3','report 4','report 5']

# file_groups[k] lists every file whose name contains file_types[k].
file_groups = [[file for file in files if file_type in file] for file_type in file_types]

In [4]:
# Report 1: complaint-level rows plus the investigator sub-rows that follow them.
# Frames are collected in lists and concatenated once; DataFrame.append() in a
# loop is quadratic and no longer exists in pandas 2.x.
complaint_frames = []
investigator_frames = []
for i in file_groups[0]:
    # Peek at the top of the sheet to locate the header row ("Number:" in the first column).
    df = pd.read_excel(path+i, nrows=20)
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(path+i, skiprows=skip)
    df = df.dropna(how='all')
    # Carry the complaint number down onto the rows where it is blank.
    # Assigning the result back (instead of a chained inplace fillna) also
    # works under pandas copy-on-write.
    df['Number:'] = df['Number:'].ffill().astype(int)
    # Investigator rows are flagged by a marker string in the 'Beat:' column.
    is_investigator_row = df['Beat:'] == 'Investigator/Assignment/Rank/Star/Appt Date:'
    invest_df = df[is_investigator_row].copy()
    df = df[~is_investigator_row].copy()
    # Dash placeholders stand for missing values.
    df = df.replace('----', float('nan')).replace('-----', float('nan'))
    df = null_dropper(df)
    df.columns = ['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street', 'Apartment',
                  'City_State_Zipcode', 'Incident_Datetime', 'Complaint_Date', 'Closed_Date']
    invest_df = invest_df.dropna(how='all', axis=1)
    invest_df = null_dropper(invest_df)
    invest_df.columns = ['Complaint_Number','Investigator_Name', 'Investigator_Current_Assignment'
                     , 'Investigator_Rank', 'Investigator_Star', 'Investigator_Appointed_Date']
    complaint_frames.append(df)
    investigator_frames.append(invest_df)

final_df = pd.concat(complaint_frames) if complaint_frames else pd.DataFrame()
final_invest_df = pd.concat(investigator_frames) if investigator_frames else pd.DataFrame()

print(len(pd.unique(final_df["Complaint_Number"])))
subset = ['Incident_Datetime','Complaint_Date','Closed_Date']
# De-duplicate, drop rows that carry none of the three dates, then order by complaint.
final_df = (
    final_df.drop_duplicates()
    .dropna(subset=subset, how='all')
    .sort_values(by='Complaint_Number')
    .reset_index(drop=True)
)
final_df.info()

109339
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109339 entries, 0 to 109338
Data columns (total 10 columns):
Complaint_Number      109339 non-null int64
Beat                  108485 non-null object
Location_Code         109330 non-null object
Address               83457 non-null object
Street                84818 non-null object
Apartment             1098 non-null object
City_State_Zipcode    87211 non-null object
Incident_Datetime     109339 non-null datetime64[ns]
Complaint_Date        109339 non-null datetime64[ns]
Closed_Date           107390 non-null datetime64[ns]
dtypes: datetime64[ns](3), int64(1), object(6)
memory usage: 8.3+ MB


In [5]:
## Full address = house number joined to the street name with a single space
final_df["Full_Address"] = final_df["Address"].str.cat(others=final_df["Street"], sep=" ")

## City/state/zip: stringify every value, then blank out the stray
## '00:00:00' time strings and the 'nan' text left by stringified missing values
csz_text = final_df["City_State_Zipcode"].apply(str)
final_df["City_State_Zipcode"] = np.where(csz_text.isin(['00:00:00', 'nan']), '', csz_text)

In [6]:
# Build the investigator table from the rows accumulated over *all* report 1
# files. `invest_df` on its own is only the loop variable of the loading cell
# and holds just the rows of the last file that was read.
invest_df = final_invest_df.sort_values(by='Complaint_Number')
invest_df = invest_df.reset_index(drop=True)
invest_df.head()

Unnamed: 0,Complaint_Number,Investigator_Name,Investigator_Current_Assignment,Investigator_Rank,Investigator_Star,Investigator_Appointed_Date
0,106213,"KLIMAS, ROBERT",121,COMMANDER,0.0,2008-08-04 00:00:00
1,1038595,"DAUN, SHERRY",113,SUPERVISING INV IPRA,,2008-12-01 00:00:00
2,1039179,"JONES, VINCENT",113,INVESTIGATOR 2 IPRA,,
3,1051000,"DANIELSON, LOUIS",16,SERGEANT OF POLICE,1406.0,1990-03-26 00:00:00
4,1051001,"ALLEN, DERRICK",121,POLICE AGENT,18366.0,1993-05-17 00:00:00


In [7]:
# Complaint numbers that occur more than once in the investigator table ...
is_repeat = invest_df.duplicated("Complaint_Number")
duplicate_complaint_list = invest_df.loc[is_repeat, "Complaint_Number"].tolist()
# ... and every investigator row whose first column (the complaint number) is one of them.
duplicate_investigations = invest_df[invest_df.iloc[:, 0].isin(duplicate_complaint_list)]
duplicate_investigations

Unnamed: 0,Complaint_Number,Investigator_Name,Investigator_Current_Assignment,Investigator_Rank,Investigator_Star,Investigator_Appointed_Date
656,1052229,"ALEJO, LUIS",121,POLICE OFFICER,10381.0,1995-02-06 00:00:00
657,1052229,"RAMIREZ, ELIZABETH",192,POLICE OFFICER,3716.0,2001-12-17 00:00:00
3448,1057000,"PELLEGRINI, JOHN",620,SERGEANT OF POLICE,1932.0,1990-08-27 00:00:00
3449,1057000,"STEHLIK, JOSEPH",121,SERGEANT OF POLICE,,1985-07-01 00:00:00
7797,1064370,"JOHNSON, DWAYNE",543,SERGEANT OF POLICE,1505.0,1986-10-13 00:00:00
7798,1064370,"LAZZARO, MICHAEL",1,SERGEANT OF POLICE,2154.0,1982-03-01 00:00:00
9396,1067199,"PETROWSKI, STEVEN",121,SERGEANT OF POLICE,1947.0,2000-10-10 00:00:00
9397,1067199,"ALEJO, LUIS",121,POLICE OFFICER,10381.0,1995-02-06 00:00:00
9908,1067974,"FIEDLER, JAMES",121,SERGEANT OF POLICE,1989.0,1994-12-05 00:00:00
9909,1067974,"KUBIK, JAMES",9,SERGEANT OF POLICE,1397.0,1990-03-26 00:00:00


In [8]:
# Write the three intermediate tables, each without its index column.
final_df = final_df.sort_values(by='Complaint_Number')
final_df.to_csv(path_or_buf=out_path+"complaint_file.csv", index=False)
duplicate_investigations.to_csv(path_or_buf=out_path+"duplicate_investigations.csv", index=False)
invest_df.to_csv(path_or_buf=out_path+"investigation_file.csv", index=False)

# Report 1 = complaints outer-joined to their investigators, ordered by complaint.
report1 = (
    final_df.merge(invest_df, on='Complaint_Number', how='outer')
    .sort_values(by='Complaint_Number')
    .rename(columns={'Investigator_Appointed_Date': 'Investigator_Date_Appointed'})
)
report1.head()

Unnamed: 0,Complaint_Number,Beat,Location_Code,Address,Street,Apartment,City_State_Zipcode,Incident_Datetime,Complaint_Date,Closed_Date,Full_Address,Investigator_Name,Investigator_Current_Assignment,Investigator_Rank,Investigator_Star,Investigator_Date_Appointed
0,106213,1631,17,3700.0,N HARLEM AVE,,CHICAGO IL 60634,2015-07-19 21:00:00,2015-07-20,2015-09-21,3700 N HARLEM AVE,"KLIMAS, ROBERT",121.0,COMMANDER,0.0,2008-08-04 00:00:00
1,107899,611,17,7843.0,S HERMITAGE AVE,,CHICAGO IL 6062,2015-08-16 20:20:00,2016-01-25,NaT,7843 S HERMITAGE AVE,,,,,
2,107901,1814,17,1622.0,N NORTH PARK AVE,,CHICAGO IL 60614,2016-01-25 21:50:00,2016-01-26,NaT,1622 N NORTH PARK AVE,,,,,
3,108026,1722,04,4650.0,N PULASKI RD,,CHICAGO IL,2013-10-14 00:01:00,2016-04-25,NaT,4650 N PULASKI RD,,,,,
4,108059,0,XX,,,,CHICAGO IL,2016-05-12 00:00:00,2016-05-12,NaT,,,,,,


In [9]:
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
report1.info()
# Number of distinct complaints in report 1.
print(len(pd.unique(report1["Complaint_Number"])))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109347 entries, 0 to 109346
Data columns (total 16 columns):
Complaint_Number                   109347 non-null int64
Beat                               108493 non-null object
Location_Code                      109338 non-null object
Address                            83464 non-null object
Street                             84825 non-null object
Apartment                          1098 non-null object
City_State_Zipcode                 109347 non-null object
Incident_Datetime                  109347 non-null datetime64[ns]
Complaint_Date                     109347 non-null datetime64[ns]
Closed_Date                        107396 non-null datetime64[ns]
Full_Address                       83102 non-null object
Investigator_Name                  18266 non-null object
Investigator_Current_Assignment    18266 non-null object
Investigator_Rank                  18266 non-null object
Investigator_Star                  12067 non-null object
Inves

In [10]:
# Save report 1 as CSV and as an Excel workbook, without the index column.
report1.to_csv(path_or_buf=out_path + "report1.csv", index=False)
report1.to_excel(excel_writer=out_path + "report1.xlsx", index=False)

### Geocoding Data

In [None]:
# Keep only the location-related columns needed for geocoding.
geocode_columns = ["Complaint_Number","Beat","Location_Code","Full_Address",'City_State_Zipcode']
output_df = final_df[geocode_columns]
# Distinct complaints, then (rows, columns) of the extract.
print(len(pd.unique(output_df["Complaint_Number"])))
print(output_df.shape)

In [None]:
# Remove exact duplicate rows, then rows in which all four location fields are null.
# NOTE(review): if City_State_Zipcode was already converted to strings ('' for missing),
# it is never null here and this filter removes nothing - confirm that is intended.
df1 = output_df.drop_duplicates()
df2 = df1.dropna(subset=['Beat','Location_Code','Full_Address','City_State_Zipcode'], how='all')

print(df1.shape[0])
print(len(pd.unique(df2["Complaint_Number"])))
print(df2.shape[0])

In [None]:
# Keep only the rows that actually have a street address to geocode.
df3 = df2.dropna(subset=['Full_Address'], how='all')
df3.shape

In [None]:
# Preview the first rows of the geocoding input.
df3.head()

In [None]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit, else False."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [None]:
# Split each "CITY STATE [ZIP]" string into its parts. Values containing a digit
# are treated as having a zip code (up to 3 parts), the others as city + state
# (up to 2 parts). Splitting from the right keeps multi-word city names together
# in the first part.
new_states_list = []
for value in df3["City_State_Zipcode"]:
    max_parts = 3 if hasNumbers(value) else 2
    split_state = value.rsplit(" ", max_parts - 1)
    new_states_list.append(split_state)
new_states_list[0]

In [None]:
# One row per address; rows with fewer than three parts are padded with nulls.
city_state_zip = pd.DataFrame(new_states_list, columns=["City","State","Zip"])
city_state_zip.head()

In [None]:
# Renumber the rows from 0 so they line up positionally with city_state_zip,
# then attach the City / State / Zip columns by index.
df3 = df3.reset_index(drop=True).merge(
    city_state_zip, how='left', left_index=True, right_index=True
)
df3.head()

In [None]:
# Preview the first rows of the current final_df.
final_df.head()

In [None]:
# Write the geocoding input file, without the index column.
df3.to_csv(path_or_buf=out_path + "geocode_file2.csv", index=False)

In [None]:
# Upload the geocoding CSV to a database table through the civis client.
# NOTE(review): `civisio` and `run_script` are imported but not used in this cell.
# NOTE(review): `civis.io` is used below without an explicit import - presumably
# `import civis` exposes it; confirm against the installed civis version.
import civis
import civisio
from civis.sql import run_script
client = civis.APIClient()

# Target location of the uploaded table.
database = 'redshift-verizon'
schema = 'Inv_Inst'
table = 'cass_geocode_Nov2016'
# Make sure the schema exists before importing into it.
sql = 'create schema if not exists {};'.format(schema)
civis.sql.run_query(sql, database)

# Import the CSV into <schema>.<table>, distributed and sorted on Complaint_Number.
civis.io.import_csv(filename = out_path + 'geocode_file2.csv', database = database, table = schema + '.' + table, delimiter = ',', 
                        distkey = 'Complaint_Number', sortkey1 = 'Complaint_Number')

### Report 2

In [11]:
# Report 2: accused officers per complaint. Rows without an accused name are
# kept separately in final_other_df.
# Frames are collected in lists and concatenated once; DataFrame.append() in a
# loop is quadratic and no longer exists in pandas 2.x.
accused_frames = []
other_frames = []
for i in file_groups[1]:
    # Peek at the top of the sheet to locate the header row ("Number:" in the first column).
    df = pd.read_excel(path+i, nrows=20)
    skip = np.where(df.iloc[:,0]=="Number:")[0][0]+1
    df = pd.read_excel(path+i, skiprows=skip)
    df = df.dropna(how='all')
    # Carry the complaint number down onto the rows where it is blank
    # (assigned back rather than filled through a chained inplace call).
    df['Number:'] = df['Number:'].ffill().astype(int)
    df = null_dropper(df)
    # Normalise the headers: no colons, underscores instead of spaces.
    df.columns = [col.strip().replace(':', '').replace(' ', '_') for col in df.columns]
    df = df.rename(columns={'Number': 'Complaint_Number',
                            'Accused': 'Name',
                            'Finding_&_Recommended_Discipline': 'Finding',
                            'Unnamed_13': 'Recommended_Discipline',
                            'Final_Finding_&_Discipline': 'Final_Finding',
                            'Unnamed_15': 'Discipline'})
    has_name = df['Name'].notnull()
    other_df = df[~has_name].copy()
    df = df[has_name].copy()
    df['Birth_Yr'] = df['Birth_Yr'].astype(int)
    accused_frames.append(df)
    other_frames.append(other_df)

final_df = pd.concat(accused_frames) if accused_frames else pd.DataFrame()
final_other_df = pd.concat(other_frames) if other_frames else pd.DataFrame()
final_df.head(10)

Unnamed: 0,Complaint_Number,Name,Birth_Yr,Gender,Race_Code,Date_of_Appt,Current_Unit,Current_Rank,Star,Complaint_Category,Finding,Recommended_Discipline,Final_Finding,Discipline
1,258996,"BARRON, WILLIAM",1949,M,WHI,1978-02-27,18,SGT,,01A-USE OF PROFANITY,NS,600.0,NS,600.0
3,258997,"C0NNOLLY, KIMBERLY",1965,F,BLK,1990-07-30,55,,11026.0,01A-USE OF PROFANITY,UN,600.0,UN,600.0
4,258997,"KEENE, JOHN",1968,M,WHI,1999-03-08,153,PO,,01A-USE OF PROFANITY,UN,600.0,UN,600.0
6,258998,"SLAVIN, SCOTT",1965,M,WHI,1991-11-18,145,SGT,807.0,10J-NEGLECT OF DUTY/CONDUCT UNBECOMING - ON DUTY,EX,600.0,EX,600.0
8,259001,"MARTINEZ, ANTONIO",1971,M,S,1996-11-04,701,PO,,10U-INADEQUATE/FAILURE TO PROVIDE SERVICE,UN,600.0,UN,600.0
9,259001,"JONES, MICHAEL",1965,M,BLK,1995-12-04,166,,13425.0,10U-INADEQUATE/FAILURE TO PROVIDE SERVICE,UN,600.0,UN,600.0
11,259002,"BROWN, CORNELIUS",1969,M,BLK,1994-08-01,3,SGT,2235.0,05A-ARRESTEE - DURING ARREST,NS,600.0,NS,600.0
12,259002,"MOLESKY, KENNETH",1976,M,WHI,1999-06-21,116,PO,6538.0,05A-ARRESTEE - DURING ARREST,NS,600.0,NS,600.0
14,259005,"TERRONES, SOFIA",1969,F,S,1993-05-17,19,PO,10453.0,07A-MISCONDUCT DURING ISSUANCE OF CITATION,NS,600.0,NS,600.0
16,259008,"BRILL, JAMES",1957,M,WHI,1999-03-08,1,PO,6491.0,09G-ABUSE OF AUTHORITY,UN,600.0,UN,600.0


In [12]:
# Order by complaint, then prefix every column except the key with 'Accused_'.
final_df = final_df.sort_values(by='Complaint_Number')
report2 = final_df.rename(columns={'Date_of_Appt': 'Date_Appointed'})
cols = list(map(lambda name: 'Accused_' + name.strip().replace(' ', '_'), report2.columns[1:]))
report2.columns = ['Complaint_Number'] + cols
report2.head()

Unnamed: 0,Complaint_Number,Accused_Name,Accused_Birth_Yr,Accused_Gender,Accused_Race_Code,Accused_Date_Appointed,Accused_Current_Unit,Accused_Current_Rank,Accused_Star,Accused_Complaint_Category,Accused_Finding,Accused_Recommended_Discipline,Accused_Final_Finding,Accused_Discipline
1,107901,"GONZALES, ROBIN",1981,F,S,2008-04-28,18,PO,5137.0,,,,,
3,108026,"BECKER, JOHN",1970,M,WHI,2000-01-24,17,PO,4734.0,,,,,
4,108026,"MARKHAM, SEAN",1971,M,I,2000-06-19,17,PO,19054.0,,,,,
1,258996,"BARRON, WILLIAM",1949,M,WHI,1978-02-27,18,SGT,,01A-USE OF PROFANITY,NS,600.0,NS,600.0
4,258997,"KEENE, JOHN",1968,M,WHI,1999-03-08,153,PO,,01A-USE OF PROFANITY,UN,600.0,UN,600.0


In [13]:
# Save report 2 as CSV and as an Excel workbook, without the index column.
report2.to_csv(path_or_buf=out_path + "report2.csv", index=False)
report2.to_excel(excel_writer=out_path + "report2.xlsx", index=False)

### Report 3

In [14]:
# Report 3: police-officer witnesses per complaint.
# Frames are collected in a list and concatenated once; DataFrame.append() in a
# loop is quadratic and no longer exists in pandas 2.x.
witness_frames = []
final_other_df = pd.DataFrame()
for i in file_groups[2]:
    # Peek at the top of the sheet to locate the header row ("Gender" in the second column).
    df = pd.read_excel(path+i, nrows=20)
    skip = np.where(df.iloc[:,1]=="Gender")[0][0]+1
    df = pd.read_excel(path+i, skiprows=skip)
    df = df.dropna(how='all')
    # On rows labelled 'Number:' the complaint number sits in the 'Gender' column.
    number_rows = df['Unnamed: 0'] == 'Number:'
    df['Complaint_Number'] = df.loc[number_rows, 'Gender']
    cols = ['Complaint_Number'] + [col for col in df.columns if col != 'Complaint_Number']
    df = df[cols].copy()
    # Carry each complaint number down to the rows that follow it (assigned back
    # rather than filled through a chained inplace call).
    # NOTE(review): unlike the other reports this key is not cast to int - confirm
    # whether it should be, so that the merge keys share a dtype.
    df['Complaint_Number'] = df['Complaint_Number'].ffill()
    df = df[~number_rows & (df['Race'] != 'end of record')]
    df = null_dropper(df)
    df = df.rename(columns={'Unnamed: 0': 'Name'})
    witness_frames.append(df)

final_df = pd.concat(witness_frames) if witness_frames else pd.DataFrame()
final_df.head(10)

Unnamed: 0,Complaint_Number,Name,Gender,Race,Star,Birth Year,Date Appointed
1,259069,"DOLCIMASCOLO, NED",M,WHI,,1945.0,1972-10-23
4,259088,"BRYANT, YVONNE",F,BLK,,1956.0,1998-08-31
5,259088,"CARROLL, RONDY",M,BLK,,1973.0,1997-08-04
8,259100,"CASTILLO, DIEGO",M,WWH,7499.0,1963.0,1994-07-05
11,259108,"RODIRGUEZ, GINA",F,WWH,20045.0,1964.0,1990-03-26
12,259108,"VELAZQUEZ, HIPOLITO",M,API,21292.0,1970.0,1992-09-28
15,259110,"BRIGGS, DANNY",M,BLK,,,NaT
16,259110,"OWENS, MILTON",M,BLK,,1959.0,1987-04-13
19,259126,"JONES, MICHAEL",M,BLK,,,NaT
22,259152,"RODRIGUEZ, JUAN",M,WWH,,1947.0,1977-07-18


In [15]:
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
final_df.info()
# Number of distinct complaints that have at least one witness row.
print(len(pd.unique(final_df["Complaint_Number"])))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27922 entries, 1 to 53132
Data columns (total 7 columns):
Complaint_Number    27922 non-null object
Name                27914 non-null object
Gender              27922 non-null object
Race                27922 non-null object
Star                20032 non-null object
Birth Year          27150 non-null float64
Date Appointed      27149 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 1.7+ MB
None
12606


In [16]:
# Order by complaint, then prefix every column except the key with 'PO_Witness_'.
final_df = final_df.sort_values(by='Complaint_Number')
report3 = final_df.copy()
cols = list(map(lambda name: 'PO_Witness_' + name.strip().replace(' ', '_'), report3.columns[1:]))
report3.columns = ['Complaint_Number'] + cols
report3.head()

Unnamed: 0,Complaint_Number,PO_Witness_Name,PO_Witness_Gender,PO_Witness_Race,PO_Witness_Star,PO_Witness_Birth_Year,PO_Witness_Date_Appointed
29224,1000009,"MURPHY, KARYN",F,WHI,19.0,1962.0,1986-08-11
29223,1000009,"AKERSON, WILLIE",M,BLK,4031.0,1965.0,1994-05-31
29225,1000009,"WARDA, ASHOR",M,WHI,18586.0,1980.0,2005-09-26
29228,1000015,"ANDERSON, MAURICE",M,BLK,11348.0,1965.0,1997-07-07
29231,1000020,"BROWN, DANIEL",M,WHI,1708.0,1968.0,2001-02-05


In [17]:
# Save report 3 as CSV and as an Excel workbook, without the index column.
report3.to_csv(path_or_buf=out_path + "report3.csv", index=False)
report3.to_excel(excel_writer=out_path + "report3.xlsx", index=False)

### Report 4

In [18]:
# Report 4: one row per person, keyed by complaint number.
# Frames are collected in a list and concatenated once; DataFrame.append() in a
# loop is quadratic and no longer exists in pandas 2.x.
report4_frames = []
final_other_df = pd.DataFrame()
for i in file_groups[3]:
    # Peek at the top of the sheet to locate the header row ("Number" in the first column).
    df = pd.read_excel(path+i, nrows=20)
    skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
    df = pd.read_excel(path+i, skiprows=skip)
    df = df.dropna(how='all')
    # Carry the complaint number down onto the rows where it is blank
    # (assigned back rather than filled through a chained inplace call).
    df['Number'] = df['Number'].ffill().astype(int)
    df = null_dropper(df)
    # Drop rows that hold nothing besides the complaint number.
    subset = df.columns.tolist()[1:]
    df = df.dropna(subset=subset, how='all')
    df = df[df['Race Desc'] != 'end of record']
    df = df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'})
    report4_frames.append(df)

final_df = pd.concat(report4_frames) if report4_frames else pd.DataFrame()
final_df.head(10)

Unnamed: 0,Complaint_Number,Gender,Age,Race_Desc
2,1000009,F,37.106849,WHITE HISPANIC
5,1000015,F,35.410959,BLACK
6,1000015,F,24.416438,BLACK
9,1000020,M,27.693151,BLACK
12,1000021,M,26.454795,BLACK
15,1000027,F,,BLACK
18,1000034,M,26.89589,BLACK
19,1000034,M,27.523288,BLACK
22,1000043,M,29.290411,BLACK
23,1000043,M,35.553425,BLACK


In [19]:
# Order by complaint, then prefix every column except the key with 'Victim_'.
final_df = final_df.sort_values(by='Complaint_Number')
report4 = final_df.copy()
cols = list(map(lambda name: 'Victim_' + name.strip().replace(' ', '_'), report4.columns[1:]))
report4.columns = ['Complaint_Number'] + cols
report4.head()

Unnamed: 0,Complaint_Number,Victim_Gender,Victim_Age,Victim_Race_Desc
2,1000009,F,37.106849,WHITE HISPANIC
5,1000015,F,35.410959,BLACK
6,1000015,F,24.416438,BLACK
9,1000020,M,27.693151,BLACK
12,1000021,M,26.454795,BLACK


In [20]:
# Save report 4 as CSV and as an Excel workbook, without the index column.
report4.to_csv(path_or_buf=out_path + "report4.csv", index=False)
report4.to_excel(excel_writer=out_path + "report4.xlsx", index=False)

### Report 5

In [21]:
# Report 5: one row per person, keyed by complaint number.
# Frames are collected in a list and concatenated once; DataFrame.append() in a
# loop is quadratic and no longer exists in pandas 2.x.
report5_frames = []
final_other_df = pd.DataFrame()
for i in file_groups[4]:
    # Peek at the top of the sheet to locate the header row ("Number" in the first column).
    df = pd.read_excel(path+i, nrows=20)
    skip = np.where(df.iloc[:,0]=="Number")[0][0]+1
    df = pd.read_excel(path+i, skiprows=skip)
    df = df.dropna(how='all')
    # Carry the complaint number down onto the rows where it is blank
    # (assigned back rather than filled through a chained inplace call).
    df['Number'] = df['Number'].ffill().astype(int)
    df = null_dropper(df)
    # Drop rows that hold nothing besides the complaint number.
    subset = df.columns.tolist()[1:]
    df = df.dropna(subset=subset, how='all')
    df = df[df['Race Desc'] != 'end of record']
    df = df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'})
    report5_frames.append(df)

final_df = pd.concat(report5_frames) if report5_frames else pd.DataFrame()
final_df.head(10)

Unnamed: 0,Complaint_Number,Gender,Age,Race_Desc
1,1000000,F,48.767123,WHITE
3,1000001,M,31.641096,BLACK
5,1000002,M,42.621918,WHITE HISPANIC
7,1000004,M,60.254795,HISPANIC
8,1000004,M,52.750685,WHITE
10,1000005,F,34.734247,BLACK
12,1000006,M,52.90137,BLACK
14,1000007,F,55.550685,WHITE
16,1000009,M,38.660274,WHITE HISPANIC
18,1000010,M,42.575342,BLACK


In [22]:
# Order by complaint, then prefix every column except the key with 'Complainant_'.
final_df = final_df.sort_values(by='Complaint_Number')
report5 = final_df.copy()
cols = list(map(lambda name: 'Complainant_' + name.strip().replace(' ', '_'), report5.columns[1:]))
report5.columns = ['Complaint_Number'] + cols
report5.head()

Unnamed: 0,Complaint_Number,Complainant_Gender,Complainant_Age,Complainant_Race_Desc
1,1000000,F,48.767123,WHITE
3,1000001,M,31.641096,BLACK
5,1000002,M,42.621918,WHITE HISPANIC
7,1000004,M,60.254795,HISPANIC
8,1000004,M,52.750685,WHITE


In [23]:
# Save report 5 as CSV and as an Excel workbook, without the index column.
report5.to_csv(path_or_buf=out_path + "report5.csv", index=False)
report5.to_excel(excel_writer=out_path + "report5.xlsx", index=False)

### Merging Datasets

In [24]:
# Outer-join the five reports on the complaint number, one after another.
merged = report1
for other_report in (report2, report3, report4, report5):
    merged = pd.merge(merged, other_report, on='Complaint_Number', how='outer')

In [25]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205087 entries, 0 to 205086
Data columns (total 41 columns):
Complaint_Number                   205087 non-null object
Beat                               176056 non-null object
Location_Code                      177155 non-null object
Address                            126970 non-null object
Street                             129000 non-null object
Apartment                          1908 non-null object
City_State_Zipcode                 177165 non-null object
Incident_Datetime                  177165 non-null datetime64[ns]
Complaint_Date                     177165 non-null datetime64[ns]
Closed_Date                        173599 non-null datetime64[ns]
Full_Address                       126504 non-null object
Investigator_Name                  30089 non-null object
Investigator_Current_Assignment    30089 non-null object
Investigator_Rank                  30089 non-null object
Investigator_Star                  20211 non-null object
I

In [26]:
# Save the merged data set as CSV and as an Excel workbook, without the index column.
merged.to_csv(path_or_buf=out_path + "merged.csv", index=False)
merged.to_excel(excel_writer=out_path + "merged.xlsx", index=False)