In [1]:
import glob
import re
import pandas as pd
import time
import numpy as np

In [2]:
def _leading_digit(value):
    # First character of ``value`` if that character is a digit, else ''.
    # Slicing with [:1] keeps empty strings from raising IndexError.
    text = value if isinstance(value, str) else ''
    return text[0] if text[:1].isdigit() else ''


def _first_char_if_all_digits(value):
    # First character of ``value`` only when the WHOLE value is digits, else ''.
    text = value if isinstance(value, str) else ''
    return text[0] if text.isdigit() else ''


def _clean_district(value):
    # Drop the zero padding from two-character codes such as '07', then keep
    # the result only if it is purely numeric.
    text = value if isinstance(value, str) else ''
    if len(text) == 2 and text[0] == '0':
        text = text.strip('0')
    return text if text.isdigit() else ''


def _to_number(column):
    # Remove thousands separators; anything still unparseable becomes NaN.
    as_text = column.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(as_text, errors='coerce')


def extract(df):
    """Reduce the raw columns to the compact codes used downstream.

    Steps, in order:
      1. Sort rows by 'Incident Number'.
      2. Convert the columns at positions 5 and 8..23 to whitespace-stripped
         strings (missing values may become the literal string 'nan',
         depending on the pandas version).
      3. Drop rows whose 'Incident Type' is missing.
      4. Reduce 'Incident Type' to its leading digit, 'District' to an
         un-padded numeric code, and 'Property Use' to its first digit;
         values that do not qualify become ''.
      5. Parse 'Population' and 'Population_Density' as numbers.

    The input frame is not modified; a new DataFrame is returned.
    """
    # sort_values without inplace returns a new frame, so the caller's data
    # is left untouched by everything below.
    df = df.sort_values(by=['Incident Number'])

    # Strip each target column individually.  Assigning a whole stringified
    # frame to a positional iloc slice would write the wrong columns on
    # pandas versions where iloc assignment does not align on labels.
    text_columns = list(df.columns[5:6]) + list(df.columns[8:24])
    for column in text_columns:
        df[column] = df[column].astype(str).str.strip()

    # Copy after filtering so the column assignments below act on an
    # independent frame rather than a potential view.
    df = df.dropna(subset=['Incident Type']).copy()

    df['Incident Type'] = df['Incident Type'].map(_leading_digit)
    df['District'] = df['District'].map(_clean_district)
    # NOTE(review): unlike 'Incident Type', a 'Property Use' value is kept
    # only when the whole value is digits -- confirm this is intended.
    df['Property Use'] = df['Property Use'].map(_first_char_if_all_digits)
    df['Population'] = _to_number(df['Population'])
    df['Population_Density'] = _to_number(df['Population_Density'])

    return df

In [3]:
def _keep_alpha_tokens(value):
    # Keep only the purely alphabetic whitespace-separated tokens of a string;
    # non-string values (e.g. NaN) are passed through unchanged.
    if not isinstance(value, str):
        return value
    return ' '.join(token for token in value.split() if token.isalpha())


def normalize(df):
    """Build the address column, drop unused columns and tidy 'Weather'.

    * 'Main Address' is the space-joined concatenation of 'Street Number',
      'Street Prefix', 'Street Type' and 'Street Suffix'.
      NOTE(review): no 'Street Name' column takes part in the address --
      confirm whether that is intended.
    * The street, cross-street and bookkeeping columns listed in
      ``drop_col`` are removed.
    * In 'Weather', '-' and '+' characters are removed, tokens that are not
      purely alphabetic are discarded, and the codes FEW/BKN/SCT/OVC are
      replaced by 'CLD'.
    * Cells that are empty/whitespace-only or equal to the string 'nan'
      become NaN.

    The input frame is not modified; a new DataFrame is returned.
    """
    # combine address in one column
    address_parts = ['Street Number', 'Street Prefix', 'Street Type', 'Street Suffix']
    main_address = df[address_parts[0]]
    for part in address_parts[1:]:
        main_address = main_address + ' ' + df[part]

    # drop unused columns
    drop_col = ['Street Number', 'Street Prefix', 'Street Type', 'Street Suffix', 'xStreet Prefix',
                'xStreet Name', 'xStreet Type', 'xStreet Suffix', 'Site', 'Date', 'Hour', 'Source',
                'Index', 'Zip Code', 'National Rank']
    # assign/drop both return new frames, so the caller's frame is untouched.
    df = df.assign(**{'Main Address': main_address}).drop(columns=drop_col)

    # clean Weather column
    # Literal (regex=False) replacement: the previous '\-' / '\+' patterns
    # only matched when str.replace treated them as regular expressions,
    # which is no longer the default in recent pandas.
    weather = df['Weather'].str.replace('-', '', regex=False).str.replace('+', '', regex=False)
    weather = weather.map(_keep_alpha_tokens)
    # Replaced one code at a time, in this order, as substring replacements.
    for code in ('FEW', 'BKN', 'SCT', 'OVC'):
        weather = weather.str.replace(code, 'CLD', regex=False)
    df['Weather'] = weather

    # fill empty blank or string "nan" with NaN
    return df.replace(r'^\s*$', np.nan, regex=True).replace('nan', np.nan)

In [4]:
def _fill_from_zip(df, target):
    # Fill missing ``target`` values in place using the value seen on other
    # rows that share the same 'Zip'.  When a zip maps to several values the
    # last one encountered (after dropping fully duplicated rows) wins.
    has_zip = df['Zip'].notnull()
    known = df[df[target].notnull() & has_zip].drop_duplicates()
    lookup = dict(zip(known['Zip'], known[target]))

    missing = df[target].isnull() & has_zip
    if missing.any():
        # astype(object) gives a plain element-wise map even when 'Zip' is
        # categorical; .values assigns by position within the masked rows.
        df.loc[missing, target] = df.loc[missing, 'Zip'].astype(object).map(lookup).values


def fill_na(df):
    """Drop rows without an incident type and fill the remaining gaps.

    * Rows whose 'Incident Type' is missing are dropped.
    * Missing 'Neighborhood' / 'City Section' values are filled from other
      rows with the same 'Zip' where possible.
    * Remaining gaps get defaults: 'Unknown' for 'District', 'City Section',
      'Neighborhood' and 'Property Use'; 'None' for 'Property Description';
      '00000' for 'Zip'; the column mean for 'Population' and
      'Population_Density'; 0 for 'Precip'.

    The input frame is not modified; a new DataFrame is returned.
    """
    # Copy so the assignments below act on an independent frame rather than
    # on a potential view of the caller's data.
    df = df.dropna(subset=['Incident Type']).copy()

    # fill some values of Neighborhood and City Section from rows sharing a Zip
    for target in ('Neighborhood', 'City Section'):
        _fill_from_zip(df, target)

    # fill missing values
    text_defaults = (
        ('District', 'Unknown'),
        ('City Section', 'Unknown'),
        ('Neighborhood', 'Unknown'),
        ('Property Use', 'Unknown'),
        ('Property Description', 'None'),
    )
    for column, default in text_defaults:
        df[column] = df[column].fillna(default)

    # A categorical column rejects fill values that are not already one of
    # its categories, so register the placeholder first.
    zip_codes = df['Zip']
    if zip_codes.dtype.name == 'category' and '00000' not in zip_codes.cat.categories:
        zip_codes = zip_codes.cat.add_categories(['00000'])
    df['Zip'] = zip_codes.fillna('00000')

    for column in ('Population', 'Population_Density'):
        df[column] = df[column].fillna(df[column].mean())
    df['Precip'] = df['Precip'].fillna(0)
    return df
    

In [5]:
def main(input_path='E://springboard//capstone_project_1//raw_data/fire_weather_pop.csv',
         output_path='E://springboard//capstone_project_1//clean_data/clean_data.csv'):
    """Run the cleaning pipeline: read the raw CSV, clean it, write the result.

    Parameters
    ----------
    input_path : str
        CSV file to read.  It is read as ISO-8859-1 with its first column as
        the index and 'Zip' loaded as a categorical column.
    output_path : str
        Destination CSV for the cleaned frame.

    Both paths default to the locations originally hard-coded here, so
    calling ``main()`` with no arguments behaves as before.
    """
    data = pd.read_csv(input_path, encoding='ISO-8859-1',
                       index_col=[0], dtype={'Zip': 'category'})
    # Each stage consumes the frame returned by the previous one.
    data = extract(data)
    data = normalize(data)
    data = fill_na(data)
    data.to_csv(output_path)


In [6]:
# Run the full cleaning pipeline when this code is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    main()

  if self.run_code(code, result):
