# Import Dependencies

In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

# intsall seaborn
!pip install seaborn

Requirement already up-to-date: sklearn in /Users/kristin/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the CSV

In [None]:
# Load the housing listings into a DataFrame. The path is relative, so the CSV
# must be in the notebook's working directory.
df = pd.read_csv("austinHousingData.csv")

# Clean the data 

In [None]:
# Null cleanup in a single chain:
#   1. discard columns in which every value is missing;
#   2. discard every remaining row that still contains a missing value.
df = df.dropna(axis='columns', how='all').dropna()

# Preview the first rows of the cleaned frame.
df.head()

# Basic data analysis

In [None]:
# Summary statistics of the cleaned DataFrame.
df.describe()

In [None]:
# Distribution overview: one histogram per column of interest.
selected_features = df.loc[:, [
    'latest_saleyear',
    'yearBuilt',
    'garageSpaces',
    'lotSizeSqFt',
    'livingAreaSqFt',
    'numOfBathrooms',
    'numOfBedrooms',
    'numOfStories',
    'avgSchoolRating',
    'avgSchoolDistance',
    'latestPrice',
    'latest_salemonth',
]]

selected_features.hist(figsize=(20, 15), bins=50)
# Write the figure to disk before show() so the saved image is not blank.
plt.savefig("../project-3/images/feature_histogram_plots.png")
plt.show()

# Basic Latitude + Longitude Plots

In [None]:
# Geographic footprint of the listings; the low alpha makes dense areas stand out.
df.plot.scatter(x="longitude", y="latitude", alpha=0.2)
plt.savefig("../project-3/images/lat_long_plot.png")

In [None]:
# Same map as above, with each point coloured by its latest sale price.
df.plot.scatter(
    x="longitude",
    y="latitude",
    c="latestPrice",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    figsize=(10, 7),
    # NOTE(review): presumably the usual workaround that keeps the x-axis
    # labels visible when pandas adds a colorbar - confirm before removing.
    sharex=False,
)
plt.savefig("../project-3/images/price_lat_long_plot.png")

# Determine feature correlation

In [None]:
# Correlation matrix of the numeric and boolean columns.
# Non-numeric (e.g. text) columns are excluded explicitly: pandas >= 2.0 no
# longer drops them silently inside DataFrame.corr() and raises instead.
# Selecting the dtypes up front behaves the same on every pandas version.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Rank every column by its correlation with the sale price, strongest first.
corr_matrix["latestPrice"].sort_values(ascending=False)

In [None]:
# Dimensions of the correlation matrix as (rows, columns).
corr_matrix.shape

In [None]:
# Annotated heatmap of the full correlation matrix on a large square canvas.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 10},
    cmap='Greens',
    cbar=True,
    square=True,
    ax=ax,
)
fig.savefig("../project-3/images/heatmap.png")

# Select your features (columns)

In [None]:
# List the columns with their dtypes and non-null counts before choosing features.
df.info()

In [None]:
# Predictor columns for the regression; these become the X values below.
selected_features = df.loc[:, [
    'yearBuilt',
    'livingAreaSqFt',
    'numOfBathrooms',
    'numOfBedrooms',
]]

selected_features.head()

# Train, Test, Split

In [None]:
# Assemble the design matrix and the target.
X = selected_features
# The target is reshaped into a single column, i.e. shape (n_samples, 1).
y = np.reshape(df['latestPrice'].values, (-1, 1))
print(X.shape, y.shape)

In [None]:
# Hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore the split, repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [None]:
# Preview the first rows of the training features.
X_train.head()

# Create the Model

In [None]:
# Instantiate a linear regression with scikit-learn's default settings;
# it is fitted in a later cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Train the Model



In [None]:
# Train on the training split, then score the fitted model on both splits.

### BEGIN SOLUTION
model.fit(X_train, y_train)
# Training score first, testing score second - the order matches the unpacking.
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

### END SOLUTION 

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

# Plot Residuals

In [None]:
# Residual plot: predicted price on the x-axis, prediction error
# (predicted minus actual) on the y-axis, for both splits.

# Predict once per split and reuse the result for both plot coordinates.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

plt.scatter(train_predictions, train_predictions - y_train, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test, c="orange", label="Testing Data")
plt.legend()
# The x-axis holds predictions, so the zero-error reference line must span the
# predicted range; using the actual-price range would stretch or truncate it.
plt.hlines(
    y=0,
    xmin=min(train_predictions.min(), test_predictions.min()),
    xmax=max(train_predictions.max(), test_predictions.max()),
)
plt.title("Residual Plot")
plt.xlabel("Predicted price")
plt.ylabel("Predicted - actual price")
plt.savefig("../project-3/images/residual_plot.png")

# Save the Model

In [None]:
# Save your model: replace "your_name" with your name and "your_model" with
# the variable that holds your trained model, then uncomment the lines below.
# Be sure to turn this in to BCS.
# If joblib fails to import, try running the install command in terminal/git-bash.
# import joblib
# filename = 'your_name.sav'
# joblib.dump(your_model, filename)