In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [4]:
cd ..

/home/jovyan/Ames-Iowa-Data


In [5]:
# Load the two CSV files; paths are relative to the current working directory.
df, unscaled = (
    pd.read_csv(csv_path)
    for csv_path in ('data/final_ames_df.csv', 'data/unscaled_ames_df.csv')
)

In [6]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass.20,MSSubClass.30,MSSubClass.40,MSSubClass.45,MSSubClass.50,MSSubClass.60,MSSubClass.70,MSSubClass.75,MSSubClass.80,...,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,FirstFlrSF,GrLivArea,GarageYrBlt,GarageArea,SalePrice
0,1,-0.760586,-0.223368,-0.052559,-0.091287,-0.331813,1.974674,-0.207617,-0.105556,-0.202142,...,1.050985,0.882076,0.720502,0.089174,0.108421,-0.802632,0.531537,1.022655,0.27711,0.563968
1,2,1.31387,-0.223368,-0.052559,-0.091287,-0.331813,-0.506064,-0.207617,-0.105556,-0.202142,...,0.16923,-0.418806,0.755273,0.204922,0.226417,0.421404,-0.378547,-0.092048,0.239049,0.216827
2,3,-0.760586,-0.223368,-0.052559,-0.091287,-0.331813,1.974674,-0.207617,-0.105556,-0.202142,...,0.986079,0.834209,0.680661,0.281818,0.130339,-0.575272,0.661806,0.940601,0.299701,0.737866
3,4,-0.760586,-0.223368,-0.052559,-0.091287,-0.331813,-0.506064,4.813248,-0.105556,-0.202142,...,-1.868071,-0.710305,0.594138,0.321442,0.070659,-0.437788,0.543775,0.817366,0.311532,-0.433016
4,5,-0.760586,-0.223368,-0.052559,-0.091287,-0.331813,1.974674,-0.207617,-0.105556,-0.202142,...,0.953601,0.738402,0.712502,0.303824,0.196843,0.114615,1.283624,0.899543,0.368943,1.018339


In [7]:
# Discard the 'Unnamed: 0' column, then separate predictors from the
# 'SalePrice' column that the models are asked to predict.
final_ames_df = df.drop('Unnamed: 0', axis='columns')
features = final_ames_df.drop('SalePrice', axis='columns')
target = final_ames_df['SalePrice']

In [8]:
# Seeded so this hold-out split is reproducible across kernel restarts;
# 42 matches the seed used inside run_model. Note that run_model performs
# its own split and does not read these module-level variables.
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    random_state=42)

In [9]:
def run_model(model, model_name, target, features, random_state=42):
    """Fit ``model`` on a train split and report its train and test scores.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place and returned inside the result.
    model_name : str
        Label stored alongside the scores.
    target : array-like
        Values to predict.
    features : array-like
        Predictors passed to ``train_test_split`` together with ``target``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value
        that was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    dict
        Keys ``'model_name'``, ``'model'`` (the fitted estimator),
        ``'train_score'`` and ``'test_score'`` (both from ``model.score``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=random_state)
    model.fit(X_train, y_train)

    return {
        'model_name' : model_name,
        'model' : model,
        'train_score' : model.score(X_train, y_train),
        'test_score' : model.score(X_test, y_test)
    }
# Accumulates one result dict per model evaluated below.
model_scores = []

### Ridge Regression

Ridge Regression fits a linear least-squares regression with L2 regularization.

The penalty is the squared L2-norm of the coefficient vector $w$, scaled by $\alpha$ and added to the least-squares error:
$$ S = \sum_{i = 1}^n (y_i - f(x_i))^2 + \alpha \sum_{j = 1}^p w_j^2 $$

In [10]:
# Ridge with its default constructor arguments, scored via run_model.
model_scores.append(
    run_model(model=Ridge(), model_name='ridge',
              target=target, features=features)
)

### Lasso Regression

Lasso Regression runs a linear least squares regression with L1-regularization.

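The penalty is the L1-norm of the coefficient vector $w$ (the sum of the absolute values of the coefficients), scaled by $\alpha$ and added to the least-squares error:

$$ S = \frac{1}{2n} \sum_{i=1}^n (y_i - f(x_i))^2 + \alpha \sum_{j=1}^p |w_j| $$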

In [11]:
# Lasso with alpha=0.1, scored via run_model.
model_scores.append(
    run_model(model=Lasso(alpha=0.1), model_name='lasso',
              target=target, features=features)
)

### K-Nearest Neighbors Regression

The target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.

In [12]:
# K-nearest-neighbors regressor with its default constructor arguments.
model_scores.append(
    run_model(model=KNeighborsRegressor(), model_name='knn',
              target=target, features=features)
)

### Decision Tree Regressor

The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.

In [13]:
# Seed the tree so its scores are reproducible from run to run; an unseeded
# DecisionTreeRegressor can give different results on each fit.
model_scores.append(run_model(DecisionTreeRegressor(random_state=42), 'dtr',
                              target, features))

### Support Vector Machine Regressor

The model produced by Support Vector Regression depends only on a subset of the training data, because the cost function for building the model ignores any training data close to the model prediction.

In [14]:
# Support vector regressor with its default constructor arguments.
model_scores.append(
    run_model(model=SVR(), model_name='svr',
              target=target, features=features)
)

### Model Comparison
Just running the models as-is, without any tuning, we can tell that Ridge and Lasso, both linear models, have the best test scores. 

The underlying structures for KNN, DTRegressor, and SVR are all non-linear. 

The decision tree regressor had a 1.000 train_score, and the lowest test score, which suggests that this model is overfit.

In [15]:
# Turn the collected result dicts into a table, one row per model.
# NOTE: this rebinds model_scores from a list to a DataFrame, so the
# append cells above must not be re-run after this cell without
# re-creating the list first.
model_scores = pd.DataFrame(data=model_scores)
model_scores

Unnamed: 0,model,model_name,test_score,train_score
0,"Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...",ridge,0.868772,0.95936
1,"Lasso(alpha=0.1, copy_X=True, fit_intercept=Tr...",lasso,0.820449,0.816814
2,"KNeighborsRegressor(algorithm='auto', leaf_siz...",knn,0.74224,0.841907
3,"DecisionTreeRegressor(criterion='mse', max_dep...",dtr,0.701009,1.0
4,"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3...",svr,0.731889,0.934353
