In [1]:
# Dependencies and Setup
import pandas as pd

# Source data location: directory that holds the input CSV (the cleaned CSV is
# written here as well). The trailing slash is required because the file paths
# in the cells below are built by plain string concatenation / f-strings.
source_file_loc = 'source_data/'


## 1.) Read in source dataset and filter rows/columns

In [2]:
# 1a.) Load the raw housing dataset from the source directory into a dataframe
data_file = 'opa_properties_public_2016_2020.csv'

# low_memory=False has pandas parse the file in one go rather than in chunks,
# which avoids mixed-dtype inference on columns that hold inconsistent values.
housing_df_raw = pd.read_csv(f'{source_file_loc}{data_file}', low_memory=False)

In [3]:
# Preview the first five rows of the raw data
housing_df_raw.head(n=5)

Unnamed: 0,objectid,assessment_date,basements,beginning_point,book_and_page,building_code,building_code_description,category_code,category_code_description,census_tract,...,unfinished,unit,utility,view_type,year_built,year_built_estimate,zip_code,zoning,lat,lng
0,30283185,,H,70' N FROM NS OF,586035,O50,ROW 3 STY MASONRY,1,Single Family,11.0,...,,,,I,1875,Y,19147.0,RM1,-75.164191,39.943765
1,30788703,,,83.573' W 41ST,382467,CA0,APTS 5-50 UNITS MASONRY,2,Multi Family,110.0,...,,,,I,1915,,19104.0,RM1,-75.207255,39.975039
2,30788704,,,101.573' W 41ST,382467,CA0,APTS 5-50 UNITS MASONRY,2,Multi Family,110.0,...,,,,I,1915,Y,19104.0,RM1,-75.207315,39.97506
3,30788705,,,20.667' W 42ND ST,382467,CA0,APTS 5-50 UNITS MASONRY,2,Multi Family,110.0,...,,,,I,1915,,19104.0,RSA3,-75.209677,39.976058
4,30788706,,,93' W 42ND ST,382467,CA0,APTS 5-50 UNITS MASONRY,2,Multi Family,110.0,...,,,,I,1915,Y,19104.0,RSA3,-75.20991,39.976144


In [4]:
# Row count of the raw dataset before any filtering
housing_df_raw.shape[0]

143152

In [5]:
# 1b.) Remove rows out of scope for analysis
#
# All three conditions are evaluated against the raw dataframe and applied in a
# single pass, so re-running this cell always starts from housing_df_raw.

# Residential building categories kept for the analysis
category_codes = ['Single Family','Multi Family','Mixed Use']

# Sales below this price are assumed to be "gifts" and not relevant for a pricing analysis
min_sale_price = 1000

# Keep only residential building categories (isin already returns a boolean mask)
is_residential = housing_df_raw['category_code_description'].isin(category_codes)

# Drop records designated as "Unfinished" (unfinished == "U").
# Rows where the unfinished field is missing compare as not equal to "U" and are kept.
is_finished = housing_df_raw['unfinished'] != 'U'

# Keep sales at or above the minimum price.
# Rows with a missing sale_price fail this comparison and are dropped as well.
has_market_sale = housing_df_raw['sale_price'] >= min_sale_price

housing_df_row_filter = housing_df_raw.loc[is_residential & is_finished & has_market_sale, :]

In [6]:
# Row count remaining after the row-level filters
housing_df_row_filter.shape[0]


99479

In [7]:
# 1c.) Keep only relevant columns for analysis
# Columns to retain, in the order they should appear in the output dataframe
columns = [
    # building classification and location
    'basements', 'building_code_description', 'category_code_description',
    'census_tract',
    # physical characteristics and exemptions
    'central_air', 'depth', 'exempt_building', 'exempt_land',
    'exterior_condition', 'fireplaces', 'frontage', 'fuel',
    'garage_spaces', 'garage_type', 'geographic_ward', 'interior_condition',
    'location',
    # valuation
    'market_value', 'market_value_date',
    # room counts and quality
    'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
    'number_stories', 'quality_grade',
    # sale information
    'sale_date', 'sale_price',
    # street address parts
    'street_designation', 'street_direction', 'street_name',
    # taxable amounts, lot and living area
    'taxable_building', 'taxable_land', 'topography', 'total_area',
    'total_livable_area', 'type_heater', 'unit', 'view_type',
    # construction year, postal code and zoning
    'year_built', 'year_built_estimate', 'zip_code', 'zoning',
]

# Select every row, restricted to the columns listed above
housing_df_column_filter = housing_df_row_filter.loc[:, columns]
housing_df_column_filter.head(n=5)

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
29,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144.0,RSA5
30,,RES CONDO 3 STY MAS+OTH,Single Family,337.0,Y,0.0,45000,0,4.0,0.0,...,,0.0,947.0,,B307,I,1970,Y,19152.0,RM2
31,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140.0,RM1
32,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141.0,RSA3
37,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124.0,RSA5


In [8]:
# Row count after column selection (selecting columns should not change it)
housing_df_column_filter.shape[0]

99479

In [9]:
# Final dataframe used downstream.
# This is a second name for the column-filtered dataframe, not a copy of it.
housing_df_final = housing_df_column_filter
housing_df_final.head(n=5)

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
29,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144.0,RSA5
30,,RES CONDO 3 STY MAS+OTH,Single Family,337.0,Y,0.0,45000,0,4.0,0.0,...,,0.0,947.0,,B307,I,1970,Y,19152.0,RM2
31,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140.0,RM1
32,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141.0,RSA3
37,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124.0,RSA5


In [10]:
# Write the cleaned dataframe to CSV so other notebooks can load it for visualizations.
# The output file is placed in the same directory as the source data.
cleaned_data_loc = f'{source_file_loc}housing_data_cleaned.csv'

# Comma-separated output; the dataframe index is not written to the file
housing_df_final.to_csv(path_or_buf=cleaned_data_loc, index=False, sep=',')