# Predicting Sale Prices for the Ames, Iowa Housing Dataset

#### By: _Noah C. (DSI)_

## Import Libraries & Load in Data

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
## Load the training and testing CSVs from the local `datasets` folder
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./datasets/train.csv", "./datasets/test.csv")
)

In [None]:
## Dimensions of the training data as (rows, columns)
df_train.shape

In [None]:
## Dimensions of the testing data as (rows, columns)
df_test.shape

In [None]:
## Which column labels appear in train but not in test?
set(df_train.columns).difference(set(df_test.columns))

In [None]:
## Preview the first few rows of the raw training data
df_train.head()

## Data Cleaning

**Set `Id` column to be our new index for both train and test datasets.**

This will make submitting to Kaggle easier later on. It also lets us reference rows with the `Id` number when using `.loc`.

In [None]:
## Promote `Id` to the index on both dataframes (each is modified in place)
for frame in (df_train, df_test):
    frame.set_index('Id', inplace=True)

In [None]:
## Preview again to confirm `Id` is now the index rather than a regular column
df_train.head()

Check to see how many missing values are in each column.

In [None]:
## Count the missing values per column, then list the largest counts first
na_counts = df_train.isna().sum()
na_counts.sort_values(ascending=False)

In [None]:
### EXAMPLE ONLY - DO NOT DO THIS IN YOUR OWN PROJECT!
### Lazy workflow: every NA in both dataframes is blanket-filled with 0 (again, do not do this)
for frame in (df_train, df_test):
    frame.fillna(0, inplace=True)

In [None]:
## Print a per-column summary of the training data now that the NAs have been filled
df_train.info()

## EDA (Exploratory Data Analysis)
I'm going to skip this step right now, but you should not do so in your project.

Things that go here:
- descriptive statistics
- visualizations
- visualizations
- visualizations
- interpretations of descriptive statistics and visualizations

## Feature Engineering

In [None]:
## One-hot encode `Neighborhood` separately for the training and testing datasets
train_dummies, test_dummies = (
    pd.get_dummies(frame['Neighborhood']) for frame in (df_train, df_test)
)

In [None]:
## Collect the dummy column labels produced for each dataset
train_labels = set(train_dummies)
test_labels = set(test_dummies)

## Labels that exist only in test, i.e. are missing from train
missing_columns_train = list(test_labels - train_labels)

## Labels that exist only in train, i.e. are missing from test
missing_columns_test = list(train_labels - test_labels)

print(f"Columns missing from Train: {missing_columns_train}")
print(f"Columns missing from Test: {missing_columns_test}")

In [None]:
## Give `test_dummies` every dummy column that only train produced
### No test row belongs to those categories, so each added column is a constant 0
for col in missing_columns_test:
    test_dummies.loc[:, col] = 0

In [None]:
## Sanity check: an empty set means test now has every dummy column that train has
set(train_dummies.columns).difference(set(test_dummies.columns))

In [None]:
## Take a look at all the dummy columns created
### (one column per distinct `Neighborhood` value found in the training data)
train_dummies.columns

Since we want the same effect as `drop_first = True`, but need train and test to share the same reference category, we'll manually drop the `Blmngtn` column from both dummy dataframes.

- I chose `Blmngtn` to be my reference category simply because it was the first alphabetically.

- However, this also means that when I interpret my coefficients later on, I'll want to keep in mind that a zero in every Neighborhood dummy column means the row had a house in `Blmngtn`. So the effect of the `Blmngtn` column is now captured in the intercept, and all of my other coefficients will be interpreted in comparison to a house in `Blmngtn`. (e.g. "for a house in Old Town **compared to** a house in Bloomington").

In [None]:
## Remove the `Blmngtn` reference column from the train and test dummies alike
for dummies in (train_dummies, test_dummies):
    dummies.drop(columns=['Blmngtn'], inplace=True)

In [None]:
## Attach each set of dummy columns to the dataframe it was built from (joined on the index)
df_train_full = df_train.join(train_dummies)
df_test_full = df_test.join(test_dummies)

## Reorder the test columns to mirror train, leaving out the target `SalePrice`
train_column_order = df_train_full.columns.drop('SalePrice')
df_test_full = df_test_full[train_column_order]

In [None]:
## Column labels of the joined training data (original columns plus the Neighborhood dummies)
df_train_full.columns

In [None]:
## Column labels of the joined testing data, in train's column order and without `SalePrice`
df_test_full.columns

## Preprocessing & Modeling

In [None]:
## Define `features`: two numeric columns followed by every Neighborhood dummy column.
## The dummy labels are unpacked with `*` so each one becomes its own element of the
## list, rather than the whole collection being nested as a single element.
features = ['Overall Qual', 'Lot Area', *train_dummies.columns]

print(features)

In [None]:
## Feature matrix (the chosen `features` columns) and target vector (`SalePrice`)
X = df_train_full.loc[:, features]
y = df_train_full.loc[:, 'SalePrice']

In [None]:
## Train Test Split
### A fixed `random_state` keeps the split the same from one run to the next
split_frames = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_frames

In [None]:
## Report the shape of each feature split: training first, then testing
for split_features in (X_train, X_test):
    print(split_features.shape)

In [None]:
## Establish a baseline model for comparison
### The baseline is a naive model whose every prediction is the mean of the
### training target; that same training mean is used for both splits.
baseline_prediction = y_train.mean()

### Multiplying a one-element list by an integer duplicates that element that many times.
y_train_baseline = [baseline_prediction] * len(y_train)
y_test_baseline = [baseline_prediction] * len(y_test)

In [None]:
## RMSE of the baseline "predictions" on each split (square root of the MSE)
baseline_train_mse = mean_squared_error(y_train, y_train_baseline)
print(f"Baseline RMSE - Train: {baseline_train_mse**0.5}")

baseline_test_mse = mean_squared_error(y_test, y_test_baseline)
print(f"Baseline RMSE - Test: {baseline_test_mse**0.5}")

### Linear Regression

In [None]:
## Instantiate and fit a linear regression model to your training data
### Only the training split is used here, so X_test / y_test stay unseen for evaluation
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
## Score the fitted model with RMSE on the training split and on the held-out split
train_predictions = lr.predict(X_train)
print(f"Train RMSE: {mean_squared_error(y_train, train_predictions)**0.5}")

test_predictions = lr.predict(X_test)
print(f"Test RMSE: {mean_squared_error(y_test, test_predictions)**0.5}")

In [None]:
## Look at coefficients for each feature
### The raw array carries no labels, so it is hard to tell which value belongs to which feature
lr.coef_

In [None]:
## Pair every feature name with its coefficient so the values are easier to read
coef_df = pd.DataFrame(dict(column=X.columns, coef=lr.coef_))

In [None]:
## Magnitude of each coefficient, ignoring its sign
coef_df['coef'].abs()

In [None]:
## Rank the features from the most positive coefficient downwards and show the top rows
coef_ranked = coef_df.sort_values('coef', ascending=False)
coef_ranked.head()

### Use your model to make predictions on the test dataset

In [None]:
## First we subset `df_test_full` to just the features we included in our model
### Selecting with the same `features` list used to build X keeps the columns in the order the model was fit on
X_kaggle = df_test_full.loc[:,features]

In [None]:
## Preview the feature matrix that will be passed to the model
X_kaggle.head()

In [None]:
## Then we use the same model to predict on the test data, save predictions to a `SalePrice` column
### Predict from the `features` columns only: once `SalePrice` has been added, `X_kaggle`
### no longer matches the columns the model was fit on, so passing the whole dataframe
### would break this cell when it is run a second time.
X_kaggle['SalePrice'] = lr.predict(X_kaggle[features])

In [None]:
## Preview again to confirm the `SalePrice` predictions were added
X_kaggle.head()

In [None]:
## Keep only the predictions: a one-column dataframe holding just `SalePrice`
output = X_kaggle.loc[:, ['SalePrice']]

In [None]:
## Preview the submission dataframe (`SalePrice` only)
output.head()

In [None]:
## Write the submission file
### `index=True` (the default) is spelled out on purpose: we want our index `Id` written as a column
output.to_csv('./datasets/first_submission.csv', index=True)