# Importing all libraries necessary for data cleaning 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 

%matplotlib inline

### Loading data file to pandas dataframe and then getting a summary of the data 

In [2]:
# Load the raw permit export. `low_memory=False` makes pandas read the file
# in a single pass instead of chunk-wise type inference.
df = pd.read_csv(filepath_or_buffer='Building_Permits.csv', header=0,
                 encoding='latin1', low_memory=False)

# NOTE: a bare `df.fillna(value=np.nan)` used to sit here. It returned a new
# frame that was never assigned, so it had no effect and has been removed.

# Per-column dtype / non-null summary, then the (rows, columns) shape.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 43 columns):
Permit Number                             198900 non-null object
Permit Type                               198900 non-null int64
Permit Type Definition                    198900 non-null object
Permit Creation Date                      198900 non-null object
Block                                     198900 non-null object
Lot                                       198900 non-null object
Street Number                             198900 non-null int64
Street Number Suffix                      2216 non-null object
Street Name                               198900 non-null object
Street Suffix                             196132 non-null object
Unit                                      29479 non-null float64
Unit Suffix                               1961 non-null object
Description                               198610 non-null object
Current Status                            198900 n

(198900, 43)

##### After loading the data into a dataframe and calling its `info` method, we can see that multiple columns have missing entries. There are a total of 43 columns and 198,900 entries. The next step is to convert these columns to the correct data type. 

### Functions used for cleaning 

In [3]:
def to_category(columns, dataframe):
    """Cast each listed column of ``dataframe`` to the pandas ``category`` dtype.

    The frame is modified in place (each column is reassigned); nothing is
    returned.

    Parameters
    ----------
    columns : iterable
        Labels of the columns to convert.
    dataframe : pandas.DataFrame
        Frame whose columns are replaced by their categorical version.
    """
    for label in columns:
        as_category = dataframe[label].astype('category')
        dataframe[label] = as_category

def to_integer(columns, dataframe):
    """Cast each listed column of ``dataframe`` to the ``int64`` dtype.

    The frame is modified in place (each column is reassigned); nothing is
    returned.

    Parameters
    ----------
    columns : iterable
        Labels of the columns to convert.
    dataframe : pandas.DataFrame
        Frame whose columns are replaced by their integer version.
    """
    for label in columns:
        as_int = dataframe[label].astype('int64')
        dataframe[label] = as_int

#### Rename column names

In [4]:
# Normalise every header: spaces become underscores, then lower-case.
snake_case_names = df.columns.str.replace(' ', '_').str.lower()
df.columns = snake_case_names

# Two headers still contain hyphens after normalisation; give them tidy names.
hyphenated_renames = {
    'neighborhoods_-_analysis_boundaries': 'neighborhoods',
    'voluntary_soft-story_retrofit': 'voluntary_soft_story_retrofit',
}
df = df.rename(columns=hyphenated_renames)
df.columns

Index(['permit_number', 'permit_type', 'permit_type_definition',
       'permit_creation_date', 'block', 'lot', 'street_number',
       'street_number_suffix', 'street_name', 'street_suffix', 'unit',
       'unit_suffix', 'description', 'current_status', 'current_status_date',
       'filed_date', 'issued_date', 'completed_date',
       'first_construction_document_date', 'structural_notification',
       'number_of_existing_stories', 'number_of_proposed_stories',
       'voluntary_soft_story_retrofit', 'fire_only_permit',
       'permit_expiration_date', 'estimated_cost', 'revised_cost',
       'existing_use', 'existing_units', 'proposed_use', 'proposed_units',
       'plansets', 'tidf_compliance', 'existing_construction_type',
       'existing_construction_type_description', 'proposed_construction_type',
       'proposed_construction_type_description', 'site_permit',
       'supervisor_district', 'neighborhoods', 'zipcode', 'location',
       'record_id'],
      dtype='object')

#### Columns that should be datetime: Creation Date, Current Status Date, Filed Date, Issued Date, Completed Date, First Construction Document Date, Permit Expiration Date
#### Columns that should be categorical: Permit Type, Permit Type Definition, Street Name (Possibly; otherwise no change needed), Current Status, Fire Only Permit, Existing Use, Proposed Use, Existing Construction Type, Existing Construction Type Description, Proposed Construction Type, Proposed Construction Type Description, Supervisor District, Neighborhoods - Analysis Boundaries, Zipcode, TIDF Compliance 

### Filling in data for missing category columns 

In [5]:
# Columns whose missing entries are replaced by the placeholder 'unknown'.
unknown_fill_columns = [
    'permit_type', 'permit_type_definition', 'street_name',
    'current_status', 'fire_only_permit', 'existing_use',
    'proposed_use', 'existing_construction_type',
    'existing_construction_type_description', 'proposed_construction_type',
    'proposed_construction_type_description', 'supervisor_district',
    'neighborhoods', 'zipcode', 'tidf_compliance', 'site_permit', 'unit',
    'description',
]

# Per-column fill values: every listed column maps to the same placeholder.
cols_category = dict.fromkeys(unknown_fill_columns, 'unknown')

df = df.fillna(value=cols_category)

###### Adding an address column that combines all relevant address info from dataset

In [6]:
def _as_label(value):
    """Render a cell as text, dropping the '.0' of whole-number floats.

    `unit` is read as float64, so a plain `astype('str')` would put values
    such as '5.0' into the address. Zip codes are presumably floats as well
    (they are compared against float literals later) - TODO confirm.
    Non-float values (e.g. the 'unknown' placeholder) are passed to `str`
    unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# String concatenation below needs text columns.
cols = ['street_number', 'street_name']
for col in cols:
    df[col] = df[col].astype('str')

# Address layout: "<street number> <street name> <zipcode> <unit> <block>".
df['address'] = (df.street_number + " " + df.street_name + " "
                 + df.zipcode.map(_as_label) + " "
                 + df.unit.map(_as_label) + " " + df.block)

###### Fixing location data syntax and adding separate columns for lat and long 

In [7]:
# Strip spaces and the surrounding parentheses from the "(a, b)" text.
cleaned_location = df['location'].str.replace(" ","").str.strip('(').str.strip(')')
df['location'] = cleaned_location

# Split once on the comma: first component -> latitude, second -> longitude.
coordinate_parts = cleaned_location.str.split(',')
df['latitude'] = pd.to_numeric(coordinate_parts.str[0])
df['longitude'] = pd.to_numeric(coordinate_parts.str[1])

###### Fixing category columns 

In [8]:
# Columns to store with the pandas `category` dtype.
category_cols = [
    'permit_type_definition',
    'current_status',
    'fire_only_permit',
    'existing_use',
    'proposed_use',
    'existing_construction_type_description',
    'proposed_construction_type_description',
    'neighborhoods',
    'tidf_compliance',
    'site_permit',
    'permit_type',
    'existing_construction_type',
    'proposed_construction_type',
    'supervisor_district',
    'zipcode',
]

to_category(category_cols, df)

In [9]:
# Number of permits per status value.
status_counts = df['current_status'].value_counts()
status_counts

complete       97077
issued         83559
filed          12043
withdrawn       1754
cancelled       1536
expired         1370
approved         733
reinstated       563
suspend          193
revoked           50
plancheck         16
incomplete         2
disapproved        2
appeal             2
Name: current_status, dtype: int64

###### Fix columns with dates 

In [10]:
# Columns holding dates as text; each is parsed into a datetime column.
cols = [
    'permit_creation_date',
    'current_status_date',
    'filed_date',
    'issued_date',
    'completed_date',
    'first_construction_document_date',
    'permit_expiration_date',
]

# Replace every listed column by its parsed version in one step.
parsed_dates = {name: pd.to_datetime(df[name]) for name in cols}
df = df.assign(**parsed_dates)

In [11]:
# Re-check dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 46 columns):
permit_number                             198900 non-null object
permit_type                               198900 non-null category
permit_type_definition                    198900 non-null category
permit_creation_date                      198900 non-null datetime64[ns]
block                                     198900 non-null object
lot                                       198900 non-null object
street_number                             198900 non-null object
street_number_suffix                      2216 non-null object
street_name                               198900 non-null object
street_suffix                             196132 non-null object
unit                                      198900 non-null object
unit_suffix                               1961 non-null object
description                               198900 non-null object
current_status                      

The datatypes for every column have been fixed. We can now look into the data and drop rows and columns as necessary. 

## Dropping selected columns from dataset 
###### See how many null values there are 

In [12]:
# Count missing values per column, sorted from fewest to most.
# (A bare `df.fillna(value=np.nan)` used to precede this line; its result was
# discarded, so it had no effect and has been removed.)
df.isnull().sum().sort_values()

permit_number                                  0
tidf_compliance                                0
existing_construction_type                     0
fire_only_permit                               0
existing_construction_type_description         0
proposed_construction_type                     0
proposed_construction_type_description         0
site_permit                                    0
supervisor_district                            0
neighborhoods                                  0
zipcode                                        0
filed_date                                     0
current_status_date                            0
current_status                                 0
description                                    0
unit                                           0
record_id                                      0
street_name                                    0
address                                        0
street_number                                  0
lot                 

###### Columns that can be dropped are voluntary_soft_story_retrofit, unit_suffix, street_number_suffix, structural_notification, street_suffix, permit_number, and record_id

In [13]:
# Columns removed from the dataset. Each label is listed exactly once; the
# previous list repeated four of them.
columns_to_drop = [
    'voluntary_soft_story_retrofit',
    'unit_suffix',
    'street_number_suffix',
    'structural_notification',
    'street_suffix',
    'permit_number',
    'record_id',
]

df = df.drop(columns=columns_to_drop)

In [14]:
# Elapsed time between filing and issuance, expressed in days (float).
filed_to_issued = df['issued_date'] - df['filed_date']
one_day = np.timedelta64(1, 'D')
df['time_range'] = filed_to_issued / one_day
df['time_range'].describe()

count    183960.000000
mean         26.054697
std          91.061716
min           0.000000
25%           0.000000
50%           0.000000
75%           6.000000
max        1740.000000
Name: time_range, dtype: float64

In [15]:
# Median, upper quartile, then a fine-grained look at the upper tail.
tail_quantiles = [.5, .75, .80, .90, .91, .92, .93, .94, .95, .96, .97, .98, .99]
df['time_range'].quantile(tail_quantiles)

0.50      0.00
0.75      6.00
0.80     13.00
0.90     56.00
0.91     68.00
0.92     82.00
0.93    100.00
0.94    126.00
0.95    159.00
0.96    204.00
0.97    251.00
0.98    326.00
0.99    471.82
Name: time_range, dtype: float64

In [16]:
# (lower bound in days, label). Applied in ascending order, so the label of
# the largest bound a row satisfies is the one that sticks.
day_buckets = [
    (1, "1 - 59 days"),
    (60, "60 - 119 days"),
    (120, "120 - 179 days"),
    (180, "180+ days"),
]

df['time_frame'] = "unknown"
df.loc[df['time_range'] == 0, 'time_frame'] = "instant approval"
for lower_bound, label in day_buckets:
    df.loc[df['time_range'] >= lower_bound, 'time_frame'] = label

# Withdrawn / cancelled permits override any duration-based label.
for status in ('withdrawn', 'cancelled'):
    df.loc[df['current_status'] == status, 'time_frame'] = "denied"

In [17]:
# Rows without a time_range (no issued date) were previously labelled
# "non-instant approval" by default even though nothing was issued. They now
# stay "unknown", matching the default used for `time_frame`.
df['decision'] = "unknown"
df.loc[df['time_range'].notnull(), 'decision'] = "non-instant approval"
df.loc[df['time_range'] == 0, 'decision'] = "instant approval"

# Cancelled / withdrawn permits count as denied regardless of timing.
is_denied = (df['current_status'] == 'cancelled') | (df['current_status'] == 'withdrawn')
df.loc[is_denied, 'decision'] = "denied"

In [18]:
# Zip codes per region. Values are floats to match the literals the column
# was previously compared against.
# 94111 used to be listed under both "north" and "east"; the later "east"
# assignment always overwrote the first, so it is kept under "east" only.
REGION_ZIPCODES = {
    "north": [94129.0, 94123.0, 94109.0, 94133.0, 94130.0],
    "south": [94132.0, 94112.0, 94134.0, 94124.0, 94127.0],
    "west": [94116.0, 94122.0, 94121.0, 94118.0],
    "central": [94131.0, 94114.0, 94117.0, 94115.0],
    "east": [94110.0, 94107.0, 94103.0, 94102.0, 94108.0,
             94105.0, 94104.0, 94111.0, 94158.0],
}

# Compare on a plain object view so membership tests do not depend on the
# column's categorical dtype.
zipcode_values = df['zipcode'].astype(object)

# Any zip code not listed above (including the 'unknown' placeholder) keeps
# the default region.
df['region'] = "unknown"
for region_name, region_zipcodes in REGION_ZIPCODES.items():
    df.loc[zipcode_values.isin(region_zipcodes), 'region'] = region_name

In [19]:
# proposed_use value -> coarse category; anything not listed stays 'other'.
proposed_use_labels = {
    "1 family dwelling": 'house',
    "2 family dwelling": 'house',
    "apartments": 'apartment',
    "residential hotel": 'apartment',
    "unknown": 'unknown',
    "office": 'office',
    "retail sales": 'retail sales',
}

df['proposed_use_category'] = 'other'
for use_value, use_label in proposed_use_labels.items():
    df.loc[df['proposed_use'] == use_value, 'proposed_use_category'] = use_label

In [20]:
# existing_use value -> coarse category; anything not listed stays 'other'.
existing_use_labels = {
    "1 family dwelling": 'house',
    "2 family dwelling": 'house',
    "apartments": 'apartment',
    "residential hotel": 'apartment',
    "unknown": 'unknown',
    "office": 'office',
    "retail sales": 'retail sales',
}

df['existing_use_category'] = 'other'
for use_value, use_label in existing_use_labels.items():
    df.loc[df['existing_use'] == use_value, 'existing_use_category'] = use_label

In [21]:
# Persist the cleaned frame; datetime columns are written as YYYY/MM/DD.
# NOTE(review): the index is written as well (no `index=False`) - confirm
# that downstream readers expect that extra leading column.
df.to_csv(
    path_or_buf='Building_Permits_Cleaned.csv',
    date_format='%Y/%m/%d',
)

###### Dataset has been cleaned