# Austin Housing Project (Exploratory Data Analysis and Predictive Modelling)

In [1]:
# Importing Necessary Libraries
# Libraries for Data Analysis and Visualization
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from uszipcode import SearchEngine
import folium
from branca.colormap import linear, LinearColormap

# Libraries for Machine Learning 



In [2]:
# Load the Austin housing CSV into a DataFrame and report its dimensions.
# NOTE: point `filepath` at your own local copy of the dataset before running.
filepath = 'C:/Users/New/Downloads/datasets/Austin-Housing-Project/austinHousingData.csv'
austin = pd.read_csv(filepath)
n_rows, n_cols = austin.shape
print(f"The Austin Housing Dataset has {n_rows} rows and {n_cols} columns.")

The Austin Housing Dataset has 15171 rows and 47 columns.


In [17]:
# Checking Datatypes and Missing Values (Either Command can be Used)
# info() prints each column's non-null count and dtype; the commented-out
# alternative below would instead return the number of missing values per column.
austin.info()
#austin.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15171 entries, 0 to 15170
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   zpid                        15171 non-null  int64  
 1   city                        15171 non-null  object 
 2   streetAddress               15171 non-null  object 
 3   zipcode                     15171 non-null  int64  
 4   description                 15169 non-null  object 
 5   latitude                    15171 non-null  float64
 6   longitude                   15171 non-null  float64
 7   propertyTaxRate             15171 non-null  float64
 8   garageSpaces                15171 non-null  int64  
 9   hasAssociation              15171 non-null  bool   
 10  hasCooling                  15171 non-null  bool   
 11  hasGarage                   15171 non-null  bool   
 12  hasHeating                  15171 non-null  bool   
 13  hasSpa                      151

In [None]:
# Feature Engineering and Transformation of Dataset Variables
# NOTE: this cell rewrites columns of `austin` and drops `yearBuilt`, so it is meant
# to be run once on the freshly loaded data (a second run fails on the missing column).

# Map elements of the City Column to be either Austin or Surrounding Areas
def city_func(element):
    """Return 'Austin' for the city of Austin, otherwise 'Surrounding Areas'.

    The comparison ignores letter case and surrounding whitespace, so both the
    raw value 'austin' and an already-mapped 'Austin' are classified as 'Austin'.
    """
    if str(element).strip().lower() == 'austin':
        return 'Austin'
    return 'Surrounding Areas'
austin['city'] = austin['city'].apply(city_func)

# Transform zipcode from Int to Object
austin['zipcode'] = austin['zipcode'].astype('object')

# The target column keeps its original name `latestPrice`: every later cell in this
# notebook refers to it by that name. (The previous bare `rename(...)` call raised
# NameError, and its result was never assigned back to `austin`.)

# Transform yearBuilt to ageofHouse (yearBuilt itself is dropped at the end of the cell)
REFERENCE_YEAR = 2021  # year the house age is measured against
austin["ageofHouse"] = REFERENCE_YEAR - austin["yearBuilt"]

# Transform the latest_salemonth Column to Season Column.
# This must happen BEFORE latest_salemonth is converted to month names below,
# because the lookup is keyed by the numeric month.
season = {1: 'Winter', 2: 'Winter', 3: 'Spring',
          4: 'Spring', 5: 'Spring', 6: 'Summer',
          7: 'Summer', 8: 'Summer', 9: 'Fall',
          10: 'Fall', 11: 'Fall', 12: 'Winter'}
austin['Season'] = austin['latest_salemonth'].map(season)

# Transform latest_salemonth from Int to Object (month number -> month name)
month = {1: 'January', 2: 'February', 3: 'March',
         4: 'April', 5: 'May', 6: 'June',
         7: 'July', 8: 'August', 9: 'September',
         10: 'October', 11: 'November', 12: 'December'}
austin['latest_salemonth'] = austin['latest_salemonth'].map(month)

# Transform latest_saleyear from Int to Object
austin['latest_saleyear'] = austin['latest_saleyear'].astype('object')

# TODO: Dropping Outliers (Top and Bottom 10% will be Removed) - not implemented in this cell

# Drop Columns Deemed Not Relevant to the Prediction after Feature Engineering
austin = austin.drop(columns=['streetAddress', 'description', 'latestPriceSource', 'homeImage', 'yearBuilt'])

In [None]:
# Report the dataset's dimensions after cleaning and pre-processing.
n_rows, n_cols = austin.shape
print(f"The Cleaned and Pre-processed Austin Housing Dataset has {n_rows} rows and {n_cols} columns.")

In [None]:
# Data Visualization (with Question Asked for Visualization)

In [None]:
# Overview: DataFrame.hist draws one 100-bin histogram per numeric column.
# The trailing `;` keeps the array of axes from being echoed as cell output.
austin.hist(bins=100, figsize=(18, 15));

In [None]:
# Number of rows for each value of `latest_salemonth`, drawn as a horizontal bar chart.
(
    austin['latest_salemonth']
    .value_counts()
    .plot.barh()
);

In [None]:
# What Zipcodes have the Highest and Lowest Median Prices
# Group by `zipcode`: grouping by `latest_salemonth` (as before) answers a different
# question than the one asked here.
median_zipcode = austin.groupby('zipcode')['latestPrice'].median().sort_values(ascending=False)

# The series is sorted from most to least expensive, so head() gives the highest
# medians and tail() the lowest; show both sides of the question.
fig, axs = plt.subplots(2, 1, figsize=(15, 10))
median_zipcode.head(15).plot(kind='barh', ax=axs[0], title='15 Zipcodes with the Highest Median Price')
median_zipcode.tail(15).plot(kind='barh', ax=axs[1], title='15 Zipcodes with the Lowest Median Price')
plt.tight_layout();

In [None]:
# House Density by Zipcode
# value_counts() sorts from most to least frequent, so tail(10) selects the
# ten zipcodes with the fewest rows in the dataset.
(
    austin['zipcode']
    .value_counts()
    .tail(10)
    .plot.barh()
);

In [None]:
# Descriptive Insights and Visualization
# Split the dataset's column names into one list per dtype.
float_cols = austin.select_dtypes(include=['float64']).columns.tolist()
int_cols = austin.select_dtypes(include=['int64']).columns.tolist()
bool_cols = austin.select_dtypes(include=['bool']).columns.tolist()
cat_cols = austin.select_dtypes(include=['object']).columns.tolist()

# For each list (float, int, bool, object - in that order) print its length, then its contents.
for dtype_cols in (float_cols, int_cols, bool_cols, cat_cols):
    print(len(dtype_cols))
    print(dtype_cols)

In [None]:
# Use Histograms to View the Distribution of the Int Variables
# The target (`latestPrice`) is excluded here. Filtering it out BEFORE enumerating
# keeps the subplot positions contiguous instead of leaving an empty slot where
# the target was skipped.
int_feature_cols = [col for col in int_cols if col != "latestPrice"]

# Two plots per row. Keep the 13-row layout when the columns fit, and grow the grid
# when there are more columns (a fixed grid would make plt.subplot raise ValueError).
grid_rows = max(13, (len(int_feature_cols) + 1) // 2)

plt.figure(figsize=(15, 20))
for i, col in enumerate(int_feature_cols, 1):
    plt.subplot(grid_rows, 2, i)
    sns.histplot(austin[col], kde=True)
    plt.title(col)

# Prevent titles and tick labels of neighbouring subplots from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Use Histograms to View the Distribution of the Float Variables
# Two plots per row. A fixed 2x2 grid only has room for four plots and plt.subplot
# raises ValueError for a fifth, so derive the row count from the number of float
# columns (never fewer than the two rows used before).
grid_rows = max(2, (len(float_cols) + 1) // 2)

# Keep roughly 5 inches of height per row (15x10 for the two-row layout).
plt.figure(figsize=(15, 5 * grid_rows))
for i, col in enumerate(float_cols, 1):
    plt.subplot(grid_rows, 2, i)
    sns.histplot(austin[col], kde=True)
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
# Countplots showing how often each value occurs in every object (categorical) column,
# laid out on a 6x2 grid with one panel per column.
fig = plt.figure(figsize=(15, 20))
for position, col in enumerate(cat_cols, start=1):
    ax = fig.add_subplot(6, 2, position)
    sns.countplot(data=austin, x=col, ax=ax)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Use Countplots to View the Distribution of the Boolean Variables
# Two plots per row. Keep the 6-row layout when the columns fit, and grow the grid
# when there are more than twelve boolean columns (plt.subplot would otherwise
# raise ValueError for an out-of-range position).
grid_rows = max(6, (len(bool_cols) + 1) // 2)

plt.figure(figsize=(15, 20))
for i, col in enumerate(bool_cols, 1):
    # One subplot per column (the duplicated plt.subplot call was redundant).
    plt.subplot(grid_rows, 2, i)
    sns.countplot(data=austin, x=col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# LatestPrice (Target Variable) Distribution
# Plot the target column directly. The previous loop over `int_cols` did not parse
# (`=` instead of `==`, unindented body) and used the column's position in `int_cols`
# as the subplot index, which can fall outside a 2x2 grid.
target_col = "latestPrice"

fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot(austin[target_col], kde=True, ax=ax)
ax.set_title(target_col)

plt.tight_layout()
plt.show()

In [None]:
# Correlation of Numeric Features with Target (Ignore the Direction to View the Magnitude)
# Absolute correlation of every numeric column with `latestPrice`, strongest first,
# with the target's own (trivial) entry removed.
feat_corr = (
    austin.corr(numeric_only=True)['latestPrice']
    .abs()
    .sort_values(ascending=False)
    .drop('latestPrice')
)

# Upper panel: the 20 strongest correlations; lower panel: the 20 weakest.
fig, axs = plt.subplots(2, 1, figsize=(15, 10))
feat_corr.head(20).plot.barh(ax=axs[0], title='Top 20 Numeric Feature Correlations with House Price')
feat_corr.tail(20).plot.barh(ax=axs[1], title='Bottom 20 Numeric Feature Correlations with House Price');

In [None]:
# Visualizing the Median House Prices by Zipcode (Using Folium and USZipcodes)
median_prices = austin.groupby("zipcode")["latestPrice"].median()

# Instantiate the SearchEngine and look up every ZIP code present in the data.
# `zipcodes` is aligned position-by-position with `median_prices.index`.
search = SearchEngine()
zipcodes = [search.by_zipcode(zipcode) for zipcode in median_prices.index]

# Define a color scale spanning the observed range of median prices
custom_colors = ['#2c7bb6', '#abd9e9', '#fdae61', '#d7191c']
colormap = LinearColormap(custom_colors, vmin=median_prices.min(), vmax=median_prices.max())
colormap.caption = "Median house price by ZIP code"

# Create the map
austin_zip_map = folium.Map(location=[30.2672, -97.7431], zoom_start=11, tiles="OpenStreetMap")

# Add one circle per ZIP code. Pairing each lookup result with its own
# (zipcode, median price) entry avoids searching the price back by key afterwards.
for (_, median_price), zip_info in zip(median_prices.items(), zipcodes):
    # Skip ZIP codes the lookup could not resolve or that have no coordinates;
    # a marker cannot be placed without a latitude/longitude.
    if zip_info is None or zip_info.lat is None or zip_info.lng is None:
        continue
    folium.CircleMarker(
        location=[zip_info.lat, zip_info.lng],
        radius=10,
        fill=True,
        color=colormap(median_price),
        fill_opacity=1.0,
        tooltip="ZIP code: {}<br>Median Price: ${:,.0f}".format(zip_info.zipcode, median_price)
    ).add_to(austin_zip_map)

# Show the color scale as a legend so the circle colors can be read as prices.
colormap.add_to(austin_zip_map)

# Display the map
austin_zip_map


In [None]:
# Machine Learning - Price Prediction Model
# NOTE(review): `train_test_split` is not imported in the visible import cell. It is
# provided by scikit-learn (`from sklearn.model_selection import train_test_split`)
# and has to be imported before this cell can run.
target = 'latestPrice'

# Predictors: every column except the target and `zpid`. `zpid` was already excluded
# by the `features` definition but was still left inside `X` (presumably it is a
# listing identifier - confirm), so `X` is now built from `features`.
features = austin.drop(columns=[target, 'zpid'])

X = features
y = austin[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)