# Real Estate Stock Price Prediction - Data cleaning and preparation steps

In [14]:
# Import all necessary libraries for the project

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

In [15]:
# Load the raw real-estate listings CSV into a dataframe.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; a backslash-separated path is only understood on Windows and
# is treated as a single literal filename elsewhere.
re_df1 = pd.read_csv("../source_data/florida_real_estate_dataset.csv")

In [16]:
# Preview the first five rows of the raw dataframe
re_df1.head(n=5)

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,30750.0,for_sale,649000.0,5.0,4.0,0.33,1444806.0,Apopka,Florida,32703.0,3690.0,10/22/2020
1,36577.0,for_sale,395000.0,3.0,2.0,0.21,632930.0,Apopka,Florida,32703.0,2046.0,10/31/2013
2,16352.0,for_sale,418000.0,4.0,3.0,0.24,24767.0,Apopka,Florida,32703.0,2247.0,7/29/2015
3,30655.0,for_sale,57000.0,2.0,1.0,0.06,923773.0,Apopka,Florida,32703.0,568.0,6/14/2016
4,16352.0,for_sale,210000.0,3.0,2.0,0.29,713543.0,Apopka,Florida,32703.0,1188.0,5/16/2003


In [17]:
# Report the raw dataframe's dimensions as (row count, column count)
(len(re_df1.index), len(re_df1.columns))

(79146, 12)

In [21]:
# Build a narrower dataframe by discarding the columns the analysis does not use
re_df2 = re_df1.drop(
    columns=[
        'brokered_by',
        'status',
        'street',
        'state',
        'prev_sold_date',
        'acre_lot',
    ]
)

In [22]:
# Preview the first five rows after the column drop
re_df2.head(n=5)

Unnamed: 0,price,bed,bath,city,zip_code,house_size
0,649000.0,5.0,4.0,Apopka,32703.0,3690.0
1,395000.0,3.0,2.0,Apopka,32703.0,2046.0
2,418000.0,4.0,3.0,Apopka,32703.0,2247.0
3,57000.0,2.0,1.0,Apopka,32703.0,568.0
4,210000.0,3.0,2.0,Apopka,32703.0,1188.0


# Data cleaning steps

In [23]:
# Count the missing values in each column
re_df2.isna().sum()

price            12
bed           14411
bath          15120
city            118
zip_code         10
house_size    15162
dtype: int64

In [30]:
# Keep only complete rows: a row is dropped if any of its columns is null
re_df_clean = re_df2.dropna(axis='index', how='any')

In [34]:
# Confirm the drop worked: every per-column null count should now be zero
re_df_clean.isna().sum()

price         0
bed           0
bath          0
city          0
zip_code      0
house_size    0
dtype: int64

In [35]:
# Preview the first five rows of the null-free dataframe
re_df_clean.head(n=5)

Unnamed: 0,price,bed,bath,city,zip_code,house_size
0,649000.0,5.0,4.0,Apopka,32703.0,3690.0
1,395000.0,3.0,2.0,Apopka,32703.0,2046.0
2,418000.0,4.0,3.0,Apopka,32703.0,2247.0
3,57000.0,2.0,1.0,Apopka,32703.0,568.0
4,210000.0,3.0,2.0,Apopka,32703.0,1188.0


In [36]:
# Report the null-free dataframe's dimensions as (row count, column count)
(len(re_df_clean.index), len(re_df_clean.columns))

(62009, 6)

In [38]:
# Inspect the listings that report more than 10 bedrooms
re_df_clean.loc[re_df_clean['bed'] > 10]

Unnamed: 0,price,bed,bath,city,zip_code,house_size
2978,749900.0,14.0,7.0,Deland,32720.0,555.0
3095,325000.0,11.0,6.0,Deland,32724.0,4012.0
6006,699900.0,12.0,6.0,Orlando,32805.0,3384.0
8513,25000000.0,12.0,15.0,Orlando,32836.0,14706.0
9358,4250000.0,11.0,14.0,Reunion,34747.0,10377.0
...,...,...,...,...,...,...
69553,1900000.0,55.0,29.0,Mulberry,33860.0,28050.0
69656,3215625.0,74.0,49.0,Fort Meade,33841.0,45580.0
74927,2500000.0,24.0,24.0,Rotonda West,33947.0,11328.0
74961,999999.0,12.0,9.0,Arcadia,34266.0,3456.0


In [39]:
# Build a new dataframe that excludes listings with more than 10 bedrooms
re_df_cleaned = re_df_clean.loc[re_df_clean.bed <= 10]

In [40]:
# Preview the first five rows of the bedroom-filtered dataframe
re_df_cleaned.head(n=5)

Unnamed: 0,price,bed,bath,city,zip_code,house_size
0,649000.0,5.0,4.0,Apopka,32703.0,3690.0
1,395000.0,3.0,2.0,Apopka,32703.0,2046.0
2,418000.0,4.0,3.0,Apopka,32703.0,2247.0
3,57000.0,2.0,1.0,Apopka,32703.0,568.0
4,210000.0,3.0,2.0,Apopka,32703.0,1188.0


In [42]:
# Report the bedroom-filtered dataframe's dimensions as (row count, column count)
(len(re_df_cleaned.index), len(re_df_cleaned.columns))

(61906, 6)

In [43]:
# Sanity check: no listing with more than 10 bedrooms should remain (expect an empty result)
re_df_cleaned.loc[re_df_cleaned['bed'] > 10]

Unnamed: 0,price,bed,bath,city,zip_code,house_size
