# House Sales
Dataset 2 (house_sales.csv) has data related to house sales for a county in Washington state.

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

## Part 1: Data Importing and Pre-processing

In [None]:
# 1.1 Import dataset and describe characteristics such as dimensions, data types, file types, and import methods used

# Read the CSV straight from the project's GitHub repository into a DataFrame.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jvo012/ads-500b-project/main/house_sales.csv'
)

# Preview the first rows to sanity-check the import.
df.head()

In [None]:
# Identify the data types of key variables
# df.info() prints a per-column summary (dtype and non-null count), which also
# gives a first look at which columns contain missing values.
df.info()

In [None]:
# 1.2 Clean, wrangle, and handle missing data
# 1.3 Transform data appropriately using techniques such as aggregation, normalization, and feature construction
# 1.4 Reduce redundant data and perform need based discretization

# Drop the "id", "long", and "lat" columns.
# A row identifier and exact coordinates are too specific to be useful for this
# analysis of the housing market.
df = df.drop(columns=["id", "long", "lat"])

# Convert "yr_renovated" and "sqft_basement" to boolean flags (False = no, True = yes).
# Only whether a home was renovated / has a basement is analysed, not the
# renovation year or the basement size.
df['yr_renovated'] = (df['yr_renovated'] > 0).astype(bool)
df['sqft_basement'] = (df['sqft_basement'] > 0).astype(bool)

# Determine the index corresponding to the max value of "bedrooms"
x = df['bedrooms'].idxmax()

# Display "sqft_living" for that row (the label now names the column actually printed)
print("Sqft_living at max value of bedrooms: ", df.loc[x, 'sqft_living'])

# A count of 33 bedrooms is highly unlikely for a home with 1620 sqft of living
# space; it is treated as a data-entry error and replaced by 3.
df['bedrooms'] = df['bedrooms'].replace(33, 3)

# Convert "date" to a datetime dtype so it can be used in later analysis.
# NOTE(review): the format string assumes values such as "20141013 00:00:00" —
# confirm against the CSV; a mismatch makes to_datetime raise.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d %H:%M:%S')

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Verify that key numeric columns do not contain a 0 value.
# The comparison is made against the number 0: these columns are numeric, so a
# membership test for the string '0' could never match and always reported False.
zero_check_columns = {
    "Price": "price",
    "Bathroom": "bathrooms",
    "Bedroom": "bedrooms",
    "Sqft_living": "sqft_living",
    "Sqft_lot": "sqft_lot",
    "Floors": "floors",
}
for label, column in zero_check_columns.items():
    # True when at least one row of the column equals 0.
    print(f"{label} 0 values: ", (df[column] == 0).any())

In [None]:
# Replace missing bedroom and bathroom counts by the column mean.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: in-place calls on a selected column are
# chained assignment, which pandas warns about and which does not update the
# DataFrame under Copy-on-Write.
df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].mean())
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms'].mean())

In [None]:
# List the distinct values of each discrete column to spot outliers.
# Keys are the labels used in the printed report; values are the column names.
unique_report_columns = {
    "Bedroom": "bedrooms",
    "Bathrooms": "bathrooms",
    "Floors": "floors",
    "Waterfront": "waterfront",
    "View": "view",
    "Condition": "condition",
    "Grade": "grade",
}
for label, column in unique_report_columns.items():
    print(f"{label} unique count:", df[column].unique())

In [None]:
# Mode of "view" column
print("View mode: ", df["view"].mode())

# Replace all "view" values greater than 1 with the mode, 0.
# The assignment goes through df.loc so the DataFrame itself is updated; writing
# into `df['view'].values` mutates a raw array that pandas does not guarantee is
# backed by the column (and that is read-only under Copy-on-Write).
df.loc[df['view'] > 1, 'view'] = 0

# Print the distinct "view" values to confirm the replacement worked and to
# check for remaining outliers
print("View unique count:", df.view.unique())

In [None]:
# Determine the index corresponding to the max value of "bedrooms"
x = df['bedrooms'].idxmax()

# Display "sqft_living" for that row (the label now names the column actually printed)
print("Sqft_living at max value of bedrooms: ", df.loc[x, 'sqft_living'])

# A count of 33 bedrooms is highly unlikely given the corresponding "sqft_living"
# value; it is treated as a data-entry error and replaced by 3.
# The replacement uses numbers, not the strings '33'/'3': "bedrooms" is a numeric
# column, so a string match never fires. If the value was already corrected by
# the earlier cleaning cell, this line leaves the column unchanged.
df['bedrooms'] = df['bedrooms'].replace(33, 3)

# Print the distinct "bedrooms" values to confirm the replacement worked and to
# check for remaining outliers
print("Bedroom unique count:", df.bedrooms.unique())

In [None]:
# Round "bedrooms", "bathrooms", and "floors" to one decimal place so the three
# columns share a consistent precision.
for column in ('bedrooms', 'bathrooms', 'floors'):
    df[column] = df[column].round(1)

In [None]:
# Re-count missing (NaN) entries per column after imputation
df.isna().sum()

## Part 2: Data Analysis and Visualization

In [None]:
# Preview the first rows of the DataFrame after the Part 1 cleaning steps
df.head()

In [None]:
# 2.1 Identify categorical, ordinal, and numerical variables within data

# Classification used for the rest of the analysis:
#   Categorical: zipcode
#   Ordinal:     condition, grade
#   Numerical:   price, bedrooms, bathrooms, sqft_living, sqft_lot, sqft_above,
#                floors, yr_built, sqft_living15, sqft_lot15
#   Boolean:     waterfront, view, sqft_basement, yr_renovated
#   Dates are treated as none of categorical, ordinal, or numerical.

# The summary is printed as one string, split here across adjacent literals
# (one per output line) for readability.
print(
    "Categorical data: Zipcode\n"
    "Ordinal data: condition, grade\n"
    "Numerical data: Price, Bedrooms, Bathroom, sqft_living, sqft_lot, sqft_above, floors, yr_built, sqft_living15, sqft_lot15\n"
    "Dates are not categorical, ordinal, nor numerical\n"
    "Boolean: Waterfront, view, sqft_basement, yr_renovated"
)

In [None]:
# 2.2 Provide measures of centrality and distribution with visualizations

# Key statistics of studied variables
# describe() tabulates summary statistics (count, mean, standard deviation,
# minimum, quartiles, maximum) per column; the plots follow in section 2.4.
df.describe()

In [None]:
# 2.3 Diagnose for correlations between variables and determine independent and dependent variables

# Absolute correlation matrix over the numeric and boolean columns.
# Non-numeric columns (here the datetime "date") are excluded explicitly:
# DataFrame.corr no longer drops them silently in pandas 2.x.
cor_matrix = df.select_dtypes(include=["number", "bool"]).corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair of
# variables appears once; the masked-out cells become NaN.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
print(upper_tri)

# TODO: only report variables whose correlation coefficient exceeds 0.7 (work in progress)
#to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.70)]
#print(to_drop)

In [None]:
# Signed correlation matrix over the numeric and boolean columns.
# Non-numeric columns (here the datetime "date") are excluded explicitly:
# DataFrame.corr no longer drops them silently in pandas 2.x.
print(df.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# 2.4 Perform exploratory analysis in combination with visualization techniques to discover patterns and features of interest

# Living area (x) against sale price (y); partial transparency keeps dense
# regions of overlapping points readable.
plt.figure()
plt.scatter(x=df['sqft_living'], y=df['price'], alpha=0.7)

# Axis labels
plt.xlabel('Living SqFt')
plt.ylabel('Price')

In [None]:
# Price distribution plot
plt.figure(figsize=(10, 4))

# histplot with a KDE overlay replaces the deprecated sns.distplot;
# stat="density" keeps the y axis on the density scale distplot used.
sns.histplot(df.price, kde=True, stat="density")

plt.title("Prices of Houses")
# The y axis shows a density, not raw counts, so it is labelled accordingly.
plt.ylabel("Density")
plt.show()

In [None]:
# Box plot to observe relationship between bedrooms and price while considering view
plt.figure(figsize=(17, 9))

# x is the bedroom count, matching the title below. Using the continuous
# "sqft_living" column as the category axis would draw a separate box for every
# distinct square-footage value.
sns.boxplot(x="bedrooms", y="price", hue="view", data=df)
plt.title("Relationship between Bedrooms and Price considering View")

In [None]:
# Histogram of living square footage
# bins = 5 groups the values into five equal-width intervals, giving a coarse
# view of how living area is distributed.
df.hist(column = 'sqft_living', bins = 5)

In [None]:
# Box plot of the bedroom counts: shows the median, the quartiles, and any
# outlying values.
plt.boxplot(x=df.bedrooms)
plt.show()

In [None]:
# Compare the price distribution of homes with and without a waterfront,
# one box per waterfront value.
sns.boxplot(data=df, x="waterfront", y="price")
plt.show()

# Written interpretation of the plot above.
print(
    "Houses with a waterfront tend to cost more than houses without a waterfront.\n"
    "There are more outliers for houses without a waterfront."
)

In [None]:
# Bedrooms x bathrooms x price: fitted regression plane drawn over the observed data
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older matplotlib

# Fit price ~ const + bedrooms + bathrooms inside this cell so the plot does not
# depend on variables that are only created by a later cell of the notebook.
plane_features = sm.add_constant(df[['bedrooms', 'bathrooms']])
plane_model = sm.OLS(df['price'], plane_features).fit()

# Read the coefficients by label rather than by position.
intercept = plane_model.params['const']
bedroom_coef = plane_model.params['bedrooms']
bathroom_coef = plane_model.params['bathrooms']

# 100 x 100 grid spanning the observed range of bedrooms (x) and bathrooms (y)
bedroom_grid, bathroom_grid = np.meshgrid(
    np.linspace(df['bedrooms'].min(), df['bedrooms'].max(), 100),
    np.linspace(df['bathrooms'].min(), df['bathrooms'].max(), 100),
)

# Height of the regression plane (predicted price) at every grid point
price_plane = intercept + bedroom_coef * bedroom_grid + bathroom_coef * bathroom_grid

# Create the 3D axes through the figure so they are attached to it;
# figsize is (width, height) and azim sets the horizontal viewing angle.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d', azim=-100)

# Plot the regression plane; the colormap attribute is `coolwarm`
# (`cool-warm` would be parsed as a subtraction of an undefined name).
ax.plot_surface(bedroom_grid, bathroom_grid, price_plane, cmap=plt.cm.coolwarm, alpha=0.5, linewidth=0)

# Plot the observed data points
ax.scatter(df['bedrooms'], df['bathrooms'], df['price'])

# Set axis labels
ax.set_xlabel('Bedrooms')
ax.set_ylabel('Bathrooms')
ax.set_zlabel('Price')

## Part 3: Data Analytics

In [None]:
# 3.1 Determine the need for a supervised or unsupervised learning method and identify dependent and independent variables

# Supervised learning: price is the known target, predicted here from the number of bedrooms.

# 3.2 Train, test, and provide accuracy and evaluation metrics for model results
# Import library
import statsmodels.api as sm

# Dependent variable
Y = df['price']
# Independent variable, with a constant column added so the model has an intercept
X = sm.add_constant(df['bedrooms'])

# Fit ordinary least squares and print the regression summary
linreg_model = sm.OLS(Y, X).fit()
print(linreg_model.summary())

# 100 evenly spaced bedroom values across the observed range (plus the constant
# column), used by the next cell to draw the fitted line
X_prime = sm.add_constant(
    np.linspace(df['bedrooms'].min(), df['bedrooms'].max(), 100)
)

In [None]:
import seaborn as sns

# Predicted price for each point of the evenly spaced bedroom grid
Y_hat = linreg_model.predict(X_prime)

# Bedrooms against price, with points coloured by the number of bathrooms
sns.scatterplot(data=df, x="bedrooms", y="price", hue="bathrooms")
#plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0) # optional: move the legend outside the axes

# Axis labels
plt.xlabel("Bedrooms")
plt.ylabel("Price")

# Overlay the fitted regression line; column 1 of X_prime holds the bedroom
# values (column 0 is the constant)
plt.plot(X_prime[:, 1], Y_hat, 'red', alpha=0.9)

In [None]:
# Pearson correlation between bedrooms and price: the off-diagonal entry of the
# 2x2 matrix returned by np.corrcoef
print("Correlation coefficient = ", np.corrcoef(df['bedrooms'], df['price'])[0, 1])

# Written interpretation of the result
print(
    "\nThe regression model that uses 'bedrooms' to predict 'price' has a weak positive correlation.\n"
    "We can predict that the more bedrooms there are, the higher the price of the house will be.\n"
)

In [None]:
# Multiple regression: use both independent variables (bedrooms and bathrooms) to predict price

# Dependent variable
y = df['price']
# Independent variables, with a constant column added so the model has an intercept
X = sm.add_constant(df[['bedrooms', 'bathrooms']])

# Fit ordinary least squares
lr_model = sm.OLS(y, X).fit()

# Full regression summary, followed by the fitted coefficients on their own
print(lr_model.summary())
print(lr_model.params)