In [1]:
## Imports
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
import statsmodels.api as sm
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

import warnings 
warnings.filterwarnings("ignore")

In [2]:
#Reading the clean and data without outliers
df = pd.read_csv('../datasets/new_data.csv').drop('Unnamed: 0', 
                                                 axis = 1)
df.head(2)

Unnamed: 0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,...,central_air,electrical,kitchen_qual,functional,garage_type,garage_finish,garage_qual,garage_cond,paved_drive,sale_type
0,60,68.0,13517,6,8,1976,2005,289.0,533.0,0.0,...,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD
1,60,43.0,11492,7,5,1996,1997,132.0,637.0,0.0,...,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD


In [3]:
#Test Data
df_test = pd.read_csv("../datasets/cleaned_test.csv").drop('Unnamed: 0',
                                                         axis = 1)
df_test.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,...,0,60,112,0,0,0,0,4,2006,WD
1,2718,905108090,90,RL,68.0,9662,Pave,IR1,Lvl,AllPub,...,170,0,0,0,0,0,0,8,2006,WD


In [4]:
#Fitting Linear Regression with highly correlated numeric vars
y = df['saleprice']
X = df[['overall_qual','gr_liv_area','garage_area',
        'total_bsmt_sf','1st_flr_sf']]

In [5]:
#Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size = 0.3,
                                                   random_state = 42)

In [6]:
#Baseline
y_bar = y_train.mean()
base_preds = [y_bar] * len(y_test)
print(len(base_preds))
base_preds[:2],2

616


([180717.9693379791, 180717.9693379791], 2)

In [7]:
#Baseline Predictions
dummy = DummyRegressor()
dummy.fit(X_train, y_train)
base_preds = dummy.predict(y_test)

In [8]:
lr = LinearRegression()

In [9]:
#Fitting Linear Regression with 5 Highly correlated Numeric Vars
lr.fit(X_train, y_train)
print(f'Train R2: {round(lr.score(X_train, y_train),2)}')
print(f'Test R2: {round(lr.score(X_test, y_test),2)}')

Train R2: 0.73
Test R2: 0.77


In [10]:
lr.coef_

array([2.92173398e+04, 5.64400917e+01, 3.50413718e+01, 5.09442219e+01,
       7.04027072e+00])

### The model with 5 highly correlated numeric variables are explaining about 73% variability in training data and 77% in test data. By looking at the coefficients we can see that all else held equal for 1 unit increase in overall quality of the house the saleprice is going up by 29,217 USD and for 1 sqfeet increase in ground living area the saleprice is increasing by 56,440 USD.

In [11]:
#Base metrics
base_mae = mean_absolute_error(y_test, base_preds)
base_mse = mean_squared_error(y_test, base_preds)
base_rmse = mean_squared_error(y_test, base_preds, squared = False)

In [12]:
y_preds = lr.predict(X_test)

In [13]:
#Code borrowed from Loren's lesson
print(f'Baseline MSE: {base_mse}')
print(f'Test MSE: {mean_squared_error(y_test, y_preds)}')
print()
print('='*10)
print()
print(f'Baseline RMSE: {base_rmse}')
print(f'Test RMSE: {mean_squared_error(y_test, y_preds, squared = False)}')
print()
print('='*10)
print()
print(f'Baseline MAE: {base_mae}')
print(f'Test MAE: {mean_absolute_error(y_test, y_preds)}')

Baseline MSE: 5983691627.56332
Test MSE: 1359644150.1839936


Baseline RMSE: 77354.3252026887
Test RMSE: 36873.35284706279


Baseline MAE: 58082.873996560935
Test MAE: 27412.914798531747


### From the above metrics we can clearly see that model is performing a lot better than the baseline model

In [14]:
#Adding  categorical variable ms_zoning and neighborhood to the model,
# The categorical vars that I think are influencing saleprice based on
#plots
y = df['saleprice']
X = df[['overall_qual','gr_liv_area','garage_area',
        'total_bsmt_sf','1st_flr_sf', 'neighborhood','ms_zoning']]

In [15]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size = 0.3,
                                                   random_state = 42)

In [16]:
#Fitting Linear Regression through column transformer and pipeline
lr = LinearRegression()
column_trans = make_column_transformer(
(OneHotEncoder(), ['neighborhood', 'ms_zoning']), 
remainder = 'passthrough')
column_trans.fit_transform(X_train, y_train)
pipe = make_pipeline(column_trans, lr)
print(f'Train Score: {cross_val_score(pipe,X_train,y_train, cv = 5)}')
print(f'Test Score:{cross_val_score(pipe, X_test, y_test, cv = 5)}')

Train Score: [       nan 0.75105473 0.79788228 0.78261588        nan]
Test Score:[0.74667973        nan 0.80306213 0.79919991 0.79156241]


In [17]:
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['neighborhood',
                                                   'ms_zoning'])])),
                ('linearregression', LinearRegression())])

In [18]:
pipe.score(X,y)

0.7978246109196361

In [19]:
y_preds = pipe.predict(X_test)

In [20]:
len(y_preds)

616

In [21]:
#Code borrowed from Loren's lesson
print(f'Baseline MSE: {base_mse}')
print(f'Test MSE: {mean_squared_error(y_test, y_preds)}')
print()
print('='*10)
print()
print(f'Baseline RMSE: {base_rmse}')
print(f'Test RMSE: {mean_squared_error(y_test, y_preds, squared = False)}')
print()
print('='*10)
print()
print(f'Baseline MAE: {base_mae}')
print(f'Test MAE: {mean_absolute_error(y_test, y_preds)}')

Baseline MSE: 5983691627.56332
Test MSE: 1138507705.8179524


Baseline RMSE: 77354.3252026887
Test RMSE: 33741.77982587689


Baseline MAE: 58082.873996560935
Test MAE: 24036.740446693937


### The model improved after adding 2 more features, the mean absolute error came down from 27412 to 24037 and the root mean squared error came dowm from 36873 to 33740 and our new features are explaining about 80% change in variability of Saleprice

In [22]:
#Added exter_qual, garage_type and removed Total_bsmt_sqft features
X = df[['overall_qual' ,'gr_liv_area', 'garage_area','1st_flr_sf',
        'neighborhood', 'ms_zoning', 'garage_type',
     'exter_qual']]
y = df['saleprice']

In [23]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [24]:
#Fitting Linear Regression through column transformer and pipeline
lr = LinearRegression()
column_trans = make_column_transformer(
(OneHotEncoder(), ['neighborhood', 'ms_zoning', 'garage_type',
                  'exter_qual']), 
remainder = 'passthrough')
column_trans.fit_transform(X_train, y_train)
pipe1 = make_pipeline(column_trans, lr)
print(f'Train Score: {cross_val_score(pipe,X_train,y_train, cv = 5)}')
print(f'Test Score:{cross_val_score(pipe, X_test, y_test, cv = 5)}')

Train Score: [nan nan nan nan nan]
Test Score:[nan nan nan nan nan]


In [25]:
pipe1.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['neighborhood', 'ms_zoning',
                                                   'garage_type',
                                                   'exter_qual'])])),
                ('linearregression', LinearRegression())])

In [26]:
y_preds1 = pipe1.predict(X_test)

In [27]:
pipe1.score(X,y)

0.8210569538858894

In [28]:
#Code borrowed from Loren's lesson
print(f'Baseline MSE: {base_mse}')
print(f'Test MSE: {mean_squared_error(y_test, y_preds1)}')
print()
print('='*10)
print()
print(f'Baseline RMSE: {base_rmse}')
print(f'Test RMSE: {mean_squared_error(y_test, y_preds1, squared = False)}')
print()
print('='*10)
print()
print(f'Baseline MAE: {base_mae}')
print(f'Test MAE: {mean_absolute_error(y_test, y_preds1)}')

Baseline MSE: 5983691627.56332
Test MSE: 1046865687.6713958


Baseline RMSE: 77354.3252026887
Test RMSE: 32355.303856885595


Baseline MAE: 58082.873996560935
Test MAE: 22717.532021046645


#### removing total_bsmt_sqft and adding exter_qual and garage_type improved the model slightly and the features are now explaing about 82% change in saleprice, removing total basmt sqft didnot effect the model, it is important to remove this as we need to build a parsimonious model that means we need to choose few very important variables that explains more change in saleprice than other

In [29]:
# Added Kitchen_qual  categorical feature
X = df[['overall_qual' ,'gr_liv_area', 'garage_area','1st_flr_sf',
        'neighborhood', 'ms_zoning', 'garage_type','exter_qual', 
       'kitchen_qual']]
y = df['saleprice']

In [30]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [31]:
#Fitting Linear Regression
lr = LinearRegression()
column_trans = make_column_transformer(
(OneHotEncoder(), ['neighborhood', 'ms_zoning', 'garage_type',
                  'exter_qual', 'kitchen_qual']), 
remainder = 'passthrough')
column_trans.fit_transform(X_train, y_train)
pipe2 = make_pipeline(column_trans,lr)
print(f'Train Score: {cross_val_score(pipe,X_train,y_train, cv = 5)}')
print(f'Test Score:{cross_val_score(pipe, X_test, y_test, cv = 5)}')

Train Score: [nan nan nan nan nan]
Test Score:[nan nan nan nan nan]


In [32]:
pipe2.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['neighborhood', 'ms_zoning',
                                                   'garage_type', 'exter_qual',
                                                   'kitchen_qual'])])),
                ('linearregression', LinearRegression())])

In [33]:
y_preds2 = pipe2.predict(X_test)

In [34]:
pipe2.score(X,y)

0.8344422949815116

In [35]:
pipe2.score(X_train, y_train)

0.8309324947092396

In [36]:
pipe2.score(X_test, y_test)

0.843083894879173

In [37]:
#Code borrowed from Loren's lesson
print(f'Baseline MSE: {base_mse}')
print(f'Test MSE: {mean_squared_error(y_test, y_preds)}')
print()
print('='*10)
print()
print(f'Baseline RMSE: {base_rmse}')
print(f'Test RMSE: {mean_squared_error(y_test, y_preds, squared = False)}')
print()
print('='*10)
print()
print(f'Baseline MAE: {base_mae}')
print(f'Test MAE: {mean_absolute_error(y_test, y_preds)}')

Baseline MSE: 5983691627.56332
Test MSE: 1138507705.8179524


Baseline RMSE: 77354.3252026887
Test RMSE: 33741.77982587689


Baseline MAE: 58082.873996560935
Test MAE: 24036.740446693937


#### About 83.4% variability is now explained by adding an additional feature kitchen quality and also the Root mean squared error and Mean absolute error also came downn slightly which indicates a better model. Adding some interaction variables and scaling the variables might improve the model, applying those techniques below

In [38]:
#Fitting Linear Regression by scaling and adding polynomial features
lr = LinearRegression()
column_trans = make_column_transformer(
    (StandardScaler(), ['overall_qual', 'gr_liv_area', 'garage_area',
                       '1st_flr_sf']),
    (PolynomialFeatures(), ['garage_area','1st_flr_sf', 'garage_area',
                           '1st_flr_sf']),
    (OneHotEncoder(), ['neighborhood', 'ms_zoning', 'garage_type',
                  'exter_qual', 'kitchen_qual']),
    remainder = 'passthrough')
column_trans.fit_transform(X_train, y_train)
pipe = make_pipeline(column_trans, lr)
print(f'Train Score: {cross_val_score(pipe,X_train,y_train, cv = 5)}')
print(f'Test Score:{cross_val_score(pipe, X_test, y_test, cv = 5)}')

Train Score: [       nan 0.7837871  0.84041549 0.8218202         nan]
Test Score:[0.77279095        nan 0.85933738 0.84674701 0.82809784]


In [39]:
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['overall_qual',
                                                   'gr_liv_area', 'garage_area',
                                                   '1st_flr_sf']),
                                                 ('polynomialfeatures',
                                                  PolynomialFeatures(),
                                                  ['garage_area', '1st_flr_sf',
                                                   'garage_area',
                                                   '1st_flr_sf']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['neighbor

In [40]:
pipe.score(X,y)

0.8395526034248724

In [41]:
y_preds = pipe.predict(X_test)

In [42]:
#Code borrowed from Loren's lesson
print(f'Baseline MSE: {base_mse}')
print(f'Test MSE: {mean_squared_error(y_test, y_preds)}')
print()
print('='*10)
print()
print(f'Baseline RMSE: {base_rmse}')
print(f'Test RMSE: {mean_squared_error(y_test, y_preds, squared = False)}')
print()
print('='*10)
print()
print(f'Baseline MAE: {base_mae}')
print(f'Test MAE: {mean_absolute_error(y_test, y_preds)}')

Baseline MSE: 5983691627.56332
Test MSE: 884216772.5136886


Baseline RMSE: 77354.3252026887
Test RMSE: 29735.78269549481


Baseline MAE: 58082.873996560935
Test MAE: 21182.360912589284


### Adding interaction variables and scaling numeric variables only improved the model slightly. As our intention is to find best features that determine the sale price we will be selecting our previous model without interaction variables and scaling which I think is less complicated, more interpretable and parsimonious. The model selected has a rsquared of 83.09 for train data and 84.30 for train data and the model is not overfitting.

In [43]:
features = ['overall_qual', 'gr_liv_area', 'garage_area', '1st_flr_sf',
       'neighborhood', 'ms_zoning', 'garage_type', 'exter_qual',
       'kitchen_qual']

In [44]:
dftest = df_test[['overall_qual', 'gr_liv_area', 'garage_area', '1st_flr_sf',
       'neighborhood', 'ms_zoning', 'garage_type', 'exter_qual',
       'kitchen_qual']]

In [45]:
test_preds = pipe2.predict(dftest)

In [46]:
df_test['saleprice'] = test_preds
new_submission = df_test[['id', 'saleprice']]
new_submission.set_index('id', inplace=True) 
new_submission.to_csv('../datasets/submission.csv')

### Conclusion:

#### The most important factors that are determining the saleprice in Ames, IA are Overall quality of the house, Ground Living area, Garage area, Kitchen Quality, which are explaining about 78% varaiability in saleprice, other factors like Kitchen Quality, Neighborhood, External Quality, MS Zoning that is zoning classification are also important for people purchasing a house in Ames. For instance if the house falls under Floating village residential zoning the average sale price is about 200,000USD and the neighborhood Stonebrook looks to be in high demand and neighborhoods like Meadow village and Briardale have an average saleprice of 100,000 USD.

### Next steps 

#### There may be other factors influencing house prices like interest rates, Government subsidies and inflation. We may need to include these factors as well for accurate predictions as real estate prices have high association with interest rates along with the factors that were mentioned in conclusion