In [1]:
import re
import pandas as pd

In [2]:
zipcode = '97035'
filename = zipcode + '_homes_data.csv'
df = pd.read_csv(filename)
df.rename(columns={'Unnamed: 0': 'Address'}, inplace=True)

In [3]:
df

Unnamed: 0,Address,link,price,beds,baths,sqft,parking_features,parking_total,garage_type,garage_spaces,...,fuel_desc,sewer,cooling_yn,has_hoa,senior_community_yn,stories,style,yr_renovated,county,new_construction_yn
0,4965 SW Park Bluff Pl,https://www.redfin.com/OR/Lake-Oswego/4965-Par...,"$2,099,000",5,4.5,4568,"Driveway, On Street",2.0,Attached,2.0,...,Gas,Public Sewer,Yes,Yes,No,—,Single Family Residential,—,Clackamas County,Yes
1,3569 Sunwood Ct,https://www.redfin.com/OR/Lake-Oswego/3569-Sun...,"$700,000",3,1.5,1350,"Driveway, Off Street",2.0,"Attached, Oversized",2.0,...,Gas,Public Sewer,No,,No,1.0,Single Family Residential,—,Clackamas County,No
2,5225 Firwood Rd,https://www.redfin.com/OR/Lake-Oswego/5225-Fir...,"$1,550,000",3,2.5,3240,"Driveway, RV Access/Parking",2.0,"Attached, Extra Deep, Oversized",2.0,...,Gas,Septic Tank,Yes,,No,1.0,Single Family Residential,—,Clackamas County,No
3,38 Cervantes Cir,https://www.redfin.com/OR/Lake-Oswego/38-Cerva...,"$293,000",2,1.5,912,Off Street,1.0,Other,1.0,...,Electricity,"All Landscaping, Athletic Court, Basketball Co...",No,Yes,No,—,Condo/Co-op,—,Multnomah County,No
4,3119 Douglas Cir,https://www.redfin.com/OR/Lake-Oswego/3119-Dou...,"$2,675,000",4,6,6524,RV Access/Parking,14.0,"Attached, Detached, Oversized",14.0,...,"Electricity, Gas",Public Sewer,Yes,,No,2.0,Single Family Residential,—,Clackamas County,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,5254 Westfield Ct,https://www.redfin.com/OR/Lake-Oswego/5254-Wes...,"$1,300,000",4,2.5,3346,Driveway,3.0,Attached,3.0,...,Gas,Public Sewer,Yes,Yes,No,2.0,Single Family Residential,—,Clackamas County,No
172,4638 Lower Dr,https://www.redfin.com/OR/Lake-Oswego/4638-Low...,"$307,500",2,1.5,1260,,,,,...,Electricity,Public Sewer,No,Yes,No,—,Condo/Co-op,—,Clackamas County,No
173,4324 Woodside Cir,https://www.redfin.com/OR/Lake-Oswego/4324-Woo...,"$579,000",2,2,1210,"Driveway, On Street",2.0,Detached,2.0,...,"Electricity, Gas",Public Sewer,Yes,Yes,No,1.0,Single Family Residential,—,Clackamas County,No
174,110 Greenridge Ct,https://www.redfin.com/OR/Lake-Oswego/110-Gree...,"$527,000",3,2.5,2868,"Off Street, Other",2.0,Attached,2.0,...,Gas,Public Sewer,No,Yes,No,2.0,Single Family Residential,—,Clackamas County,No


## Convert price to float, then remove any rows where values are null or not numeric where expected

In [4]:
df['price'] = df['price'].str.replace('[$,]', '', regex=True)
df = df[pd.to_numeric(df['price'], errors='coerce').notnull()]
df = df[pd.to_numeric(df['beds'], errors='coerce').notnull()]
df = df[pd.to_numeric(df['baths'], errors='coerce').notnull()]
df['sqft'] = df['sqft'].str.replace('[,]', '', regex=True)
df = df[pd.to_numeric(df['sqft'], errors='coerce').notnull()]
df = df[pd.to_numeric(df['yr_built'], errors='coerce').notnull()]

## Remove unexpected categorical values from style

In [5]:
styles = ['Single Family Residential', 'Condo/Co-op', 'Townhouse']
df = df[df['style'].isin(styles)]

## Explore count of unique and null values to determine which variables to use

In [6]:
column_stats = {}
for col in df.columns:
    column_stats[col] = {}
    column_stats[col]['value_count'] = df[col].value_counts()
    column_stats[col]['null_count'] = df[col].isna().sum()

for col in column_stats:
    print(col)
    print()
    print('VALUE COUNT')
    print('****************************')
    print(column_stats[col]['value_count'])
    print()
    print('NULL COUNT')
    print('****************************')
    print(column_stats[col]['null_count'])
    print('\n\n\n')

Address

VALUE COUNT
****************************
4965 SW Park Bluff Pl      1
2815 SW Orchard Hill Pl    1
2121 Wembley Park Rd       1
23 Tanglewood Dr           1
5754 Charles Cir           1
                          ..
4694 Winthrop Ct           1
18521 Don Lee Way          1
4 Touchstone #117          1
4322 Collins Way           1
4487 Westbay Rd            1
Name: Address, Length: 163, dtype: int64

NULL COUNT
****************************
0




link

VALUE COUNT
****************************
https://www.redfin.com/OR/Lake-Oswego/4965-Park-Bluff-Pl-97035/home/183781964      1
https://www.redfin.com/OR/Lake-Oswego/2815-Orchard-Hill-Pl-97035/home/26485233     1
https://www.redfin.com/OR/Lake-Oswego/2121-Wembley-Park-Rd-97034/home/26055635     1
https://www.redfin.com/OR/Lake-Oswego/23-Tanglewood-Dr-97035/home/25940174         1
https://www.redfin.com/OR/Lake-Oswego/5754-Charles-Cir-97035/home/26004817         1
                                                                       

## Dropping rows that seem too messy

In [7]:
dropped_columns = ['parking_features',
                   'parking_total',
                   'garage_type',
                   'garage_spaces',
                   'hot_water_desc',
                   'fireplace_yn',
                   'fireplace_total',
                   'basement',
                   'roof',
                   'lot_size',
                   'property_type',
                   'main_level_area',
                   'fuel_desc',
                   'sewer',
                   'stories',
                   'yr_renovated',
                   'lot_size_sqft']
df = df.drop(columns=dropped_columns)

## Feature Engineering

### I will clean view, cooling, senior community, new construction, and HOA features before converting them to dummy variables

In [8]:
features = ['view_yn', 'cooling_yn', 'senior_community_yn', 'new_construction_yn', 'has_hoa']

for feature in features:
    df[feature] = df[feature].fillna('No')
    df = df[df[feature].isin(['Yes', 'No'])]

In [9]:
df

Unnamed: 0,Address,link,price,beds,baths,sqft,view_yn,yr_built,cooling_yn,has_hoa,senior_community_yn,style,county,new_construction_yn
0,4965 SW Park Bluff Pl,https://www.redfin.com/OR/Lake-Oswego/4965-Par...,2099000,5,4.5,4568,Yes,2022,Yes,Yes,No,Single Family Residential,Clackamas County,Yes
1,3569 Sunwood Ct,https://www.redfin.com/OR/Lake-Oswego/3569-Sun...,700000,3,1.5,1350,No,1966,No,No,No,Single Family Residential,Clackamas County,No
2,5225 Firwood Rd,https://www.redfin.com/OR/Lake-Oswego/5225-Fir...,1550000,3,2.5,3240,Yes,1936,Yes,No,No,Single Family Residential,Clackamas County,No
4,3119 Douglas Cir,https://www.redfin.com/OR/Lake-Oswego/3119-Dou...,2675000,4,6,6524,Yes,1940,Yes,No,No,Single Family Residential,Clackamas County,No
5,6338 Washington Ct,https://www.redfin.com/OR/Lake-Oswego/6338-Was...,800000,2,1.5,1588,Yes,1958,Yes,No,No,Single Family Residential,Clackamas County,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,5254 Westfield Ct,https://www.redfin.com/OR/Lake-Oswego/5254-Wes...,1300000,4,2.5,3346,Yes,1988,Yes,Yes,No,Single Family Residential,Clackamas County,No
172,4638 Lower Dr,https://www.redfin.com/OR/Lake-Oswego/4638-Low...,307500,2,1.5,1260,No,1975,No,Yes,No,Condo/Co-op,Clackamas County,No
173,4324 Woodside Cir,https://www.redfin.com/OR/Lake-Oswego/4324-Woo...,579000,2,2,1210,Yes,1979,Yes,Yes,No,Single Family Residential,Clackamas County,No
174,110 Greenridge Ct,https://www.redfin.com/OR/Lake-Oswego/110-Gree...,527000,3,2.5,2868,No,1973,No,Yes,No,Single Family Residential,Clackamas County,No


In [10]:
df.to_csv(f'{zipcode}_homes_data_cleaned.csv', index=False)