#  House stuff

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pprint
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from data_util import *

houseA = pd.read_csv('AmesHousingSetA.csv')
print(houseA.shape)


houseB = pd.read_csv('AmesHousingSetB.csv')
print(houseB.shape)

(2344, 81)
(586, 81)


## 1. Data Preparation
I fill in the NA values of both the house A and house B sets.
I follow that up by applying one hot encoding to both sets.
After that I attempt to fill in the columns missing from both sets after OHE.
This is all to get the data in an acceptable format to train the model with.

In [3]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import numpy as np

houseA = houseA.apply(lambda x:x.fillna(x.value_counts().index[0]))
print(houseA.shape)

houseB = houseB.apply(lambda x:x.fillna(x.value_counts().index[0]))
print(len(cat_features(houseB)))

(2344, 81)
43


In [4]:
houseA = pd.get_dummies(houseA, columns=cat_features(houseA))
print(houseA.shape)

houseB = pd.get_dummies(houseB, columns=cat_features(houseB))
print(houseB.shape)

(2344, 300)
(586, 271)


In [5]:
featureA = set(list(houseA))
featureB = set(list(houseB))
missingFeats = list(featureA - featureB)
print(len(featureA), '-', len(featureB), '=', len(missingFeats))
dictFeats = { i : 0 for i in missingFeats }
houseB = houseB.assign(**dictFeats)

300 - 271 = 35


In [6]:
missingFeats = list(featureB - featureA)
print(len(featureA), '-', len(featureB), '=', len(missingFeats))
dictFeats = { i : 0 for i in missingFeats }
houseA = houseA.assign(**dictFeats)

300 - 271 = 6


In [7]:
features = list(houseA)
features.remove('SalePrice')
features.remove('PID')

data_x = houseA[features]
data_y = houseA['SalePrice']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y,
                                                    test_size=0.2)

## 2. Exploratory analysis
My exploratory analysis consisted mostly of viewing the p-values of all the columns. It shows that lot.area, low.qual.fin.sf, and garage.yr.blt are the three important columns, p-value wise.

In [8]:
#Base model
base_model = linear_model.LinearRegression()
base_model.fit(x_train, y_train)
preds = base_model.predict(x_test)
print_regression_error_report(y_test, preds)

MSE, MAE, R^2, EVS: [703305915.855146, 10591.771428100881, 0.8988739324658478, 0.8988868716901801]


In [9]:
selector_f = SelectPercentile(f_regression, percentile=25)
selector_f.fit(x_train, y_train)

dicto = {}
# filling dictionary
for name, score, pv in zip(list(houseA), selector_f.scores_,
                          selector_f.pvalues_):
    dicto[name] = pv
    
dicto = sorted(dicto.items(), key=lambda x: x[1])
print(dicto)

[('Lot.Area', 0.0), ('Low.Qual.Fin.SF', 1.0817494038151571e-265), ('Garage.Yr.Blt', 6.165167249807467e-221), ('Garage.Cars', 1.210187411104425e-214), ('Bsmt.Unf.SF', 2.093093115610574e-200), ('Total.Bsmt.SF', 2.809264497814274e-191), ('Overall.Cond', 1.6308749711711436e-149), ('Bsmt.Half.Bath', 7.35285846377562e-146), ('Year.Built', 4.314023943337278e-133), ('Year.Remod.Add', 1.5577532284997153e-110), ('Kitchen.AbvGr', 9.22385697098912e-106), ('TotRms.AbvGrd', 8.237244046540062e-96), ('Fireplaces', 2.3755643441188007e-95), ('Mas.Vnr.Area', 1.6828310466439684e-87), ('Neighborhood_NWAmes', 8.031680968506664e-83), ('Garage.Area', 1.9888883750325708e-51), ('MS.SubClass', 2.0985510014428709e-44), ('Lot.Frontage', 5.556883251907022e-44), ('Lot.Shape_IR2', 1.3270096597965693e-42), ('Wood.Deck.SF', 4.812344031640597e-40), ('Neighborhood_NPkVill', 1.1573921788671377e-39), ('MS.Zoning_RH', 4.0527365359251255e-38), ('Alley_Grvl', 3.1985749560705083e-35), ('Full.Bath', 2.3537556186067975e-34), ('G

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


## 3. Model Building
Of the models, I think either the percentile based feature selection or the base model are the best of the two models, although I think that percentile based would likely work better with a new data set, as I have the feeling that the base model is overtrained.

In [10]:
# rank the features based on scores, best 25% features
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)


# create a model that uses these 25% best features only
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# make predictions and look at results
preds = model.predict(xt_test)
print_regression_error_report(preds, y_test)

#R^2 went up and EVS went down. A bit better results by R^2

MSE, MAE, R^2, EVS: [801790457.2901453, 13817.393195902929, 0.868436472772449, 0.8687449702269437]


In [11]:
selector_f = SelectKBest(f_regression, k=3) # select top 3 
selector_f.fit(x_train, y_train)

# rank the features based on scores, best 3 features
xt_train, xt_test= selector_f.transform(x_train), selector_f.transform(x_test)

# create a model that uses these 3 best features only
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# make predictions and look at results
preds = model.predict(xt_test)
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [1457254426.6767094, 22549.87230170949, 0.7028862701294384, 0.7029962802989402]


  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


## 4. Predicting and Validating
I was unable to predict and validate on dataset B. I don't understand to use the transform function on houseB. It gives me strange errors, and it's probably something super simple that I'm just overlooking, but I cannot for the life of me figure it out. Sorry.

In [None]:
preds2 = model.predict(houseB)