In [102]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [103]:
data = pd.read_csv('../datasets/train_with_dummies.csv')

In [104]:
data.shape

(2051, 199)

In [105]:
#from previous modelling 
#these 5 features as the most important - they seem to impact the price the most

#Total SF - Which is 'Gr Liv Area' + 'Total Bsmt SF'
#Exter Qual
#Kitchen Qual
#Year Built
#Overall Qual

In [106]:
#built a simplified model based on these 5 items for the presentation

In [107]:
data['Total SF'] = data['Gr Liv Area'] + data['Total Bsmt SF']

In [108]:
df = pd.DataFrame({
    'Total SF': data['Total SF'],
    'Exter Qual': data['Exter Qual'],
    'Kitchen Qual': data['Kitchen Qual'],
    'Year Built': data['Year Built'],
    'Overall Qual': data['Overall Qual'],
    'SalePrice': data['SalePrice']})

In [109]:
df.head()

Unnamed: 0,Total SF,Exter Qual,Kitchen Qual,Year Built,Overall Qual,SalePrice
0,2204.0,2,2,1976,6,130500
1,3035.0,2,2,1996,7,220000
2,2114.0,3,2,1953,5,109000
3,1828.0,3,3,2006,5,174000
4,2121.0,3,3,1900,6,138500


In [110]:
#create X and y values
X = df.drop('SalePrice',axis=1)
y = df['SalePrice']

In [111]:
#create poly features, train test split then scale
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
poly_convert = PolynomialFeatures(include_bias=False)
X_poly = poly_convert.fit_transform(X)

In [112]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=0.3)

In [113]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [114]:
pipe = Pipeline([('ss',StandardScaler()),('en',ElasticNet())])

In [115]:
pipe_params = {'en__l1_ratio':[.1, .7, 1],
              'en__selection':['random'], #random so it will run faster
              'en__warm_start':[True], #warm start to run faster
              'en__alpha':np.logspace(-6, 6, 5),
              'en__max_iter': [10000000]}

In [116]:
model = GridSearchCV(pipe,pipe_params,cv=5,verbose=1,scoring='neg_mean_squared_error')
#set scoring to neg mean squared error as this is what we are trying to minimize

In [117]:
model.fit(X_train,y_train)
#train model using only 5 base features

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  6.3min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('en', ElasticNet())]),
             param_grid={'en__alpha': array([1.e-06, 1.e-03, 1.e+00, 1.e+03, 1.e+06]),
                         'en__l1_ratio': [0.1, 0.7, 1],
                         'en__max_iter': [10000000],
                         'en__selection': ['random'],
                         'en__warm_start': [True]},
             scoring='neg_mean_squared_error', verbose=1)

In [118]:
model.best_params_

{'en__alpha': 0.001,
 'en__l1_ratio': 0.7,
 'en__max_iter': 10000000,
 'en__selection': 'random',
 'en__warm_start': True}

In [119]:
#save model 
from joblib import dump,load

In [120]:
dump(model,'../models/test_model.joblib')

['../models/test_model.joblib']

In [121]:
#Get predictions
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
predictions = model.predict(X_poly)

In [122]:
from sklearn.metrics import mean_squared_error

In [123]:
train_predictions[:5]

array([145291.10837599, 109812.439518  , 126116.91175205, 214662.56303444,
        99620.44644092])

In [124]:
#RSME score: train set
np.sqrt(mean_squared_error(y_train,train_predictions))

29079.07895916296

In [125]:
#RSME score: test set
np.sqrt(mean_squared_error(y_test,test_predictions))

28671.817959312804

In [126]:
#RSME SCORE is actually quite good even with just 5 features
#train this model with full train data set and same hyperparameters

In [127]:
#create another instance of Grid Search
full_model = GridSearchCV(pipe,pipe_params,cv=5,verbose=1,scoring='neg_mean_squared_error')

In [129]:
#train model on full dataset
full_model.fit(X_poly,y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.2min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('en', ElasticNet())]),
             param_grid={'en__alpha': array([1.e-06, 1.e-03, 1.e+00, 1.e+03, 1.e+06]),
                         'en__l1_ratio': [0.1, 0.7, 1],
                         'en__max_iter': [10000000],
                         'en__selection': ['random'],
                         'en__warm_start': [True]},
             scoring='neg_mean_squared_error', verbose=1)

In [130]:
#save model for application use
dump(model,'../models/presentation_model.joblib')

['../models/presentation_model.joblib']

In [131]:
#get predictions for full model
predictions = full_model.predict(X_poly)

In [132]:
#RSME score:full model
np.sqrt(mean_squared_error(y,predictions))

#RSME seems reasonable
#We will use this model with only 5 base features for our presentation application

28842.6622966087

In [None]:
#code and test logic for application

In [133]:
#load model from file
model = load('../models/presentation_model.joblib')

In [134]:
type(X)

pandas.core.frame.DataFrame

In [172]:
#this will be the result from the html form
#test with dummy user input
d = {'tot-sf': '2000', 'ext-qual': '2','kit-qual': '6','year': '1998','over-qual': '2'}

In [173]:
#turn form result into a dataframe
df = pd.DataFrame.from_dict([d])

df

Unnamed: 0,tot-sf,ext-qual,kit-qual,year,over-qual
0,2000,2,6,1998,2


In [174]:
#convert the 5 features the user input into poly features for a better model
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
poly_convert = PolynomialFeatures(include_bias=False)
poly = poly_convert.fit_transform(df)

In [175]:
#use model to predict price from 5 base features (user input)
predictions = model.predict(poly)

In [176]:
predictions

array([123434.63454525])

In [179]:
#final result to show on website
np.round(predictions[0],2)

123434.63

In [None]:
#end
#website logic coded
#transfer into application.py file