# Importing all libraries necessary for data cleaning 

In [1]:
from datetime import datetime 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sb 

### Loading data file to pandas dataframe and then getting a summary of the data 

In [2]:
#with open(file='Building_Permits.csv', mode='rb') as f:
#    result = chardet.detect(f.read())
df_raw = pd.read_csv(filepath_or_buffer = 'Building_Permits.csv', header = 0, encoding='latin1', low_memory=False)
df_raw.info()
df_raw.shape
df_raw.fillna(value=np.nan)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 43 columns):
Permit Number                             198900 non-null object
Permit Type                               198900 non-null int64
Permit Type Definition                    198900 non-null object
Permit Creation Date                      198900 non-null object
Block                                     198900 non-null object
Lot                                       198900 non-null object
Street Number                             198900 non-null int64
Street Number Suffix                      2216 non-null object
Street Name                               198900 non-null object
Street Suffix                             196132 non-null object
Unit                                      29479 non-null float64
Unit Suffix                               1961 non-null object
Description                               198610 non-null object
Current Status                            198900 n

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
0,2.02E+11,4,sign - erect,5/6/15,326,23,140,,Ellis,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.785719256680785, -122.40852313194863)",1.380000e+12
1,2.02E+11,4,sign - erect,4/19/16,306,7,440,,Geary,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.78733980600732, -122.41063199757738)",1.420000e+12
2,2.02E+11,3,additions alterations or repairs,5/27/16,595,203,1647,,Pacific,Av,...,1.0,constr type 1,1.0,constr type 1,,3.0,Russian Hill,94109.0,"(37.7946573324287, -122.42232562979227)",1.420000e+12
3,2.02E+11,8,otc alterations permit,11/7/16,156,11,1230,,Pacific,Av,...,5.0,wood frame (5),5.0,wood frame (5),,3.0,Nob Hill,94109.0,"(37.79595867909168, -122.41557405519474)",1.440000e+12
4,2.02E+11,6,demolitions,11/28/16,342,1,950,,Market,St,...,3.0,constr type 3,,,,6.0,Tenderloin,94102.0,"(37.78315261897309, -122.40950883997789)",1.450000e+11
5,2.02E+11,8,otc alterations permit,6/14/17,4105,9,800,,Indiana,St,...,1.0,constr type 1,1.0,constr type 1,,10.0,Potrero Hill,94107.0,"(37.75922331346539, -122.39170402628598)",1.470000e+12
6,2.02E+11,8,otc alterations permit,6/30/17,1739,20,1291,,11th,Av,...,5.0,wood frame (5),5.0,wood frame (5),,5.0,Inner Sunset,94122.0,"(37.764145640138565, -122.46875112470363)",1.470000e+12
7,M803667,8,otc alterations permit,6/30/17,4789,14,1465,,Revere,Av,...,,,,,,10.0,Bayview Hunters Point,94124.0,"(37.73005099023611, -122.38784938916618)",1.470000e+12
8,M804227,8,otc alterations permit,7/5/17,1212,54,2094,,Fell,St,...,,,,,,5.0,Lone Mountain/USF,94117.0,"(37.772393498502595, -122.45231466824669)",1.470000e+12
9,M804767,8,otc alterations permit,7/6/17,1259,16,89,,Alpine,Tr,...,,,,,,8.0,Haight Ashbury,94117.0,"(37.7691724293766, -122.43734859051908)",1.470000e+11


##### After loading the data into a dataframe and using the info method for dataframes, there are multiple columns with messing entries. There are a total of 43 columns and 198,900 entries. The next step is to convert these columns to the correct data type. 

### Functions used for cleaning 

#### Converting column to catergory type

In [3]:
def to_category(columns, dataframe):
    """Convert a list of columns, from a dataframe, to a category datatype"""
    for column in columns: 
        dataframe[column] = dataframe[column].astype('category')

#### Converting column to int type

In [4]:
def to_integer(columns, dataframe):
    """Convert columns from a dataframe to an int64 datatype"""
    for column in columns: 
        dataframe[column] = dataframe[column].astype('int64')

#### Rename column names

In [5]:
df_raw.columns = df_raw.columns.str.replace(' ', '_').str.lower()
df_raw.rename(columns={'neighborhoods_-_analysis_boundaries': 'neighborhoods',
                       'voluntary_soft-story_retrofit': 'voluntary_soft_story_retrofit'},
              inplace=True)
df_raw.columns

Index(['permit_number', 'permit_type', 'permit_type_definition',
       'permit_creation_date', 'block', 'lot', 'street_number',
       'street_number_suffix', 'street_name', 'street_suffix', 'unit',
       'unit_suffix', 'description', 'current_status', 'current_status_date',
       'filed_date', 'issued_date', 'completed_date',
       'first_construction_document_date', 'structural_notification',
       'number_of_existing_stories', 'number_of_proposed_stories',
       'voluntary_soft_story_retrofit', 'fire_only_permit',
       'permit_expiration_date', 'estimated_cost', 'revised_cost',
       'existing_use', 'existing_units', 'proposed_use', 'proposed_units',
       'plansets', 'tidf_compliance', 'existing_construction_type',
       'existing_construction_type_description', 'proposed_construction_type',
       'proposed_construction_type_description', 'site_permit',
       'supervisor_district', 'neighborhoods', 'zipcode', 'location',
       'record_id'],
      dtype='object')

### Inspect column data and then convert it appropriate datatype 

In [6]:
for column in df_raw.columns:
    print(df_raw[column])
    

0         2.02E+11
1         2.02E+11
2         2.02E+11
3         2.02E+11
4         2.02E+11
5         2.02E+11
6         2.02E+11
7          M803667
8          M804227
9          M804767
10         M805287
11         M805907
12         M806447
13        2.02E+11
14         M813729
15         M813907
16         M813967
17         M814148
18        2.02E+11
19         M814368
20         M814967
21        2.02E+11
22        2.02E+11
23         M816927
24        2.02E+11
25        2.02E+11
26         M820728
27         M821207
28         M821268
29         M821847
            ...   
198870     M893328
198871     M893347
198872    2.02E+11
198873    2.02E+11
198874    2.02E+11
198875    2.02E+11
198876     M893367
198877    2.02E+11
198878    2.02E+11
198879    2.02E+11
198880     M893387
198881     M893407
198882    2.02E+11
198883     M893427
198884     M893447
198885    2.02E+11
198886    2.02E+11
198887    2.02E+11
198888    2.02E+11
198889    2.02E+11
198890    2.02E+11
198891    2.

### Columns that need datatype to be converted
#### Columns that should be datetime: Creation Date, Current Status Date, Filed Date, Issued Date, Completed Date, First Construction Document Date, Permit Expiration Date
#### Columns that should be strings: None
#### Columns that should be numeric: None
#### Columns that should be categorical: Permit Type, Permit Type Definition, Street Name (Possibly; otherwise no change needed), Current Status, Fire Only Permit, Existing Use, Proposed Use, Existing Construction Type, Existing Construction Type Description, Proposed Construction Type, Proposed Construction Type Description, Supervisor District, Neighborhoods - Analysis Boundaries, Zipcode, TIDF Compliance 

### Converting columns to category type 

In [7]:
cols = ['permit_type', 'permit_type_definition', 'street_name', 'current_status', 'fire_only_permit', 'existing_use', 
        'proposed_use', 'existing_construction_type', 'existing_construction_type_description', 
        'proposed_construction_type', 'proposed_construction_type_description', 'supervisor_district', 
        'neighborhoods', 'zipcode', 'tidf_compliance', 'site_permit']

to_category(columns = cols, dataframe = df_raw)
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 43 columns):
permit_number                             198900 non-null object
permit_type                               198900 non-null category
permit_type_definition                    198900 non-null category
permit_creation_date                      198900 non-null object
block                                     198900 non-null object
lot                                       198900 non-null object
street_number                             198900 non-null int64
street_number_suffix                      2216 non-null object
street_name                               198900 non-null category
street_suffix                             196132 non-null object
unit                                      29479 non-null float64
unit_suffix                               1961 non-null object
description                               198610 non-null object
current_status                            1

In [8]:
cols = ['permit_creation_date', 'current_status_date','filed_date', 'issued_date', 'completed_date', 
       'first_construction_document_date', 'permit_expiration_date']
df_raw[cols] = df_raw[cols].apply(pd.to_datetime)

In [9]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 43 columns):
permit_number                             198900 non-null object
permit_type                               198900 non-null category
permit_type_definition                    198900 non-null category
permit_creation_date                      198900 non-null datetime64[ns]
block                                     198900 non-null object
lot                                       198900 non-null object
street_number                             198900 non-null int64
street_number_suffix                      2216 non-null object
street_name                               198900 non-null category
street_suffix                             196132 non-null object
unit                                      29479 non-null float64
unit_suffix                               1961 non-null object
description                               198610 non-null object
current_status                     

#### All columns have been changed to the appropriate data type. Next step is to look at the missing values 

In [10]:
df_raw.isnull().sum().sort_values()

permit_number                                  0
filed_date                                     0
current_status_date                            0
current_status                                 0
street_name                                    0
street_number                                  0
record_id                                      0
block                                          0
permit_creation_date                           0
permit_type_definition                         0
permit_type                                    0
lot                                            0
description                                  290
location                                    1700
zipcode                                     1716
supervisor_district                         1717
neighborhoods                               1725
street_suffix                               2768
revised_cost                                6066
issued_date                                14940
first_construction_d

##### From the summary of null values in all of the columns, the following can be potentially be ignored for further investigation : tidf compliance, unit suffix, street number suffix, unit, completed date, permit expiration date, street suffix

##### The rest of columns must be investigated 

#### Dropping columns with missing values and columns that are not needed 

In [11]:
df_raw = df_raw.drop(columns=['permit_number','block','lot','street_number','street_number_suffix','street_suffix', 
                             'unit', 'unit_suffix', 'voluntary_soft_story_retrofit', 'permit_expiration_date', 
                             'tidf_compliance', 'record_id', 'structural_notification'], axis=1 )

In [13]:
df_raw = df_raw.drop(columns=['first_construction_document_date'], axis=1)

In [16]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198900 entries, 0 to 198899
Data columns (total 29 columns):
permit_type                               198900 non-null category
permit_type_definition                    198900 non-null category
permit_creation_date                      198900 non-null datetime64[ns]
street_name                               198900 non-null category
description                               198610 non-null object
current_status                            198900 non-null category
current_status_date                       198900 non-null datetime64[ns]
filed_date                                198900 non-null datetime64[ns]
issued_date                               183960 non-null datetime64[ns]
completed_date                            97191 non-null datetime64[ns]
number_of_existing_stories                156116 non-null float64
number_of_proposed_stories                156032 non-null float64
fire_only_permit                          18827 non-null cat

In [22]:
df_raw[df_raw.duplicated()]

Unnamed: 0,permit_type,permit_type_definition,permit_creation_date,street_name,description,current_status,current_status_date,filed_date,issued_date,completed_date,...,plansets,existing_construction_type,existing_construction_type_description,proposed_construction_type,proposed_construction_type_description,site_permit,supervisor_district,neighborhoods,zipcode,location
525,8,otc alterations permit,2013-01-02,18th,to obtain final inspection approved under pa# ...,complete,2013-04-18,2013-01-02,2013-01-02,2013-04-18,...,0.0,5.0,wood frame (5),5.0,wood frame (5),,8.0,Castro/Upper Market,94114.0,"(37.76120881848921, -122.43342817668643)"
532,8,otc alterations permit,2013-01-02,Cole,reroofing,issued,2013-01-02,2013-01-02,2013-01-02,NaT,...,0.0,5.0,wood frame (5),5.0,wood frame (5),,5.0,Haight Ashbury,94117.0,"(37.769823543172045, -122.45008165265183)"
539,8,otc alterations permit,2013-01-02,Ocean,replace fire damaged materials in kind to incl...,issued,2013-01-02,2013-01-02,2013-01-02,NaT,...,2.0,5.0,wood frame (5),5.0,wood frame (5),,7.0,West of Twin Peaks,94112.0,"(37.72480987268102, -122.45931548632585)"
556,8,otc alterations permit,2013-01-02,10th,repair back of the building siding and paintin...,complete,2013-03-23,2013-01-02,2013-01-02,2013-03-23,...,0.0,5.0,wood frame (5),5.0,wood frame (5),,1.0,Inner Richmond,94118.0,"(37.776567168754084, -122.46792521121702)"
567,8,otc alterations permit,2013-01-02,Prosper,voluntary safety strengthening upgrade to exis...,complete,2013-08-09,2013-01-02,2013-01-02,2013-08-09,...,2.0,5.0,wood frame (5),5.0,wood frame (5),,8.0,Castro/Upper Market,94114.0,"(37.76331368403074, -122.43177542013316)"
572,8,otc alterations permit,2013-01-02,Loyola,upgrade existing furnace room by installing 5/...,cancelled,2016-11-14,2013-01-02,2013-01-02,NaT,...,0.0,5.0,wood frame (5),5.0,wood frame (5),,1.0,Lone Mountain/USF,94117.0,"(37.776229196787504, -122.44877581259863)"
620,8,otc alterations permit,2013-01-03,Lower,street space,issued,2013-01-03,2013-01-03,2013-01-03,NaT,...,,,,,,,8.0,Castro/Upper Market,94114.0,"(37.76304675597241, -122.44172133662688)"
637,8,otc alterations permit,2013-01-03,Minna,street space,issued,2013-01-03,2013-01-03,2013-01-03,NaT,...,,,,,,,6.0,South of Market,94103.0,"(37.77779688692988, -122.41137469845343)"
639,8,otc alterations permit,2013-01-03,Filbert,replace (e) exterior service stair in kind.,complete,2013-03-28,2013-01-03,2013-02-19,2013-03-28,...,2.0,5.0,wood frame (5),5.0,wood frame (5),,2.0,Marina,94123.0,"(37.797469404717916, -122.43776016525479)"
646,8,otc alterations permit,2013-01-03,Gough,to comply with physical inspection report #cc-...,complete,2013-01-04,2013-01-03,2013-01-03,2013-01-04,...,0.0,5.0,wood frame (5),5.0,wood frame (5),,2.0,Marina,94123.0,"(37.79960736037713, -122.427881244637)"
