# House Price Prediction using Regression Models

In [244]:
# loading libraries
from datetime import date
from typing import Literal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Run mode for the whole notebook; every mode-dependent cell branches on this value.
#   "TRAIN_EXPLORE" - load the training CSV, hold out a test split and report errors
#   "TRAIN"         - load the training CSV and fit on all of it
#   "PREDICT"       - load the test CSV and write the predictions to a CSV file
# The annotation is a Literal: the previous `"A" or "B" or "C"` expression evaluated to
# just its first string and therefore did not describe the allowed values.
mode: Literal["TRAIN_EXPLORE", "TRAIN", "PREDICT"] = "PREDICT"

## Importing Data

In [213]:
# Load the CSV that matches the selected run mode.

# Training modes: every column but the last is a feature, the last one is the target.
if mode in ("TRAIN_EXPLORE", "TRAIN"):
    dataset = pd.read_csv('data-week-2b-train.csv')
    X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Prediction mode: the file is used as-is, no target is split off here.
if mode == "PREDICT":
    dataset = pd.read_csv('data-week-2b-test.csv')
    X = dataset

In [214]:
# Display the loaded feature frame (pandas abbreviates long/wide frames in the rendering).
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [215]:
# Display the target column.
# NOTE(review): the loading cell only assigns `y` in the TRAIN / TRAIN_EXPLORE modes.
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [216]:
# Count the missing values per column and list the 20 columns with the most gaps.
# (X.info() / X.head() are alternative quick looks at the same frame.)
missing_per_column = X.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

# The top 7 features in this list are not worth including.
# The rest of the features can be imputed or omitted.

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageCond        81
GarageType        81
GarageYrBlt       81
GarageQual        81
GarageFinish      81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
BedroomAbvGr       0
dtype: int64

In [217]:
# Drop the seven columns with the most missing values (see the null counts above),
# then separate the Id column, the feature columns and - when present - the target.
filtered_dataset = dataset.drop(columns=["PoolQC","MiscFeature","Alley","Fence","MasVnrType","FireplaceQu","LotFrontage"])
X_id = filtered_dataset.iloc[:, 0]

if mode == "PREDICT":
    # The prediction CSV carries no target column, so everything after Id is a feature.
    # Slicing with 1:-1 here would silently drop the last real feature and turn it into `y`.
    X = filtered_dataset.iloc[:, 1:].copy()
else:
    # Training CSV: skip Id (first column); the last column is the target.
    X = filtered_dataset.iloc[:, 1:-1].copy()
    y = filtered_dataset.iloc[:, -1]

# .copy() above makes X an independent frame, so the column assignments done during
# imputation do not write into a slice of filtered_dataset.
X.info()

# We can impute the rest of the null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 72 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

In [218]:
# Display the Id column that was split off from the features; it is reused for the output CSV.
X_id

0          1
1          2
2          3
3          4
4          5
        ... 
1455    1456
1456    1457
1457    1458
1458    1459
1459    1460
Name: Id, Length: 1460, dtype: int64

## Preprocessing

### Impute and Encode

In [219]:
# %pip install scikit-learn
# Taking care of missing data
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Split the columns by kind so that each group gets its own imputation strategy.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# Numeric gaps are filled with the column mean, all other gaps with the most frequent value.
for strategy, cols in (('mean', num_cols), ('most_frequent', cat_cols)):
    imputer = SimpleImputer(strategy=strategy)
    X[cols] = imputer.fit_transform(X[cols])

X.info()

# NOTE: after imputation the numeric columns come back as float64, so the original
# integer dtypes are lost. Every feature is normalized later anyway, so this is not a problem.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 72 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   float64
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   float64
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   float64
 15  OverallCond    1460 non-null   float64
 16  YearBuilt      1460 non-null   float64
 17  YearRemodAdd   1460 non-null   float64
 18  RoofStyl

In [220]:
# Expand every categorical column into one indicator column per category value.
# NOTE(review): the resulting columns presumably depend on the categories present in the
# currently loaded file - confirm that train and prediction runs yield the same layout.
X = pd.get_dummies(X)
X.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,150.0,...,False,False,False,True,False,False,False,False,True,False
1,20.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,284.0,...,False,False,False,True,False,False,False,False,True,False
2,60.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,434.0,...,False,False,False,True,False,False,False,False,True,False
3,70.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,540.0,...,False,False,False,True,True,False,False,False,False,False
4,60.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,490.0,...,False,False,False,True,False,False,False,False,True,False


### Scaling

In [221]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column. The scaler returns a plain array, so the rebuilt
# frame has positional integer column labels instead of the feature names.
# NOTE(review): the scaler is fit in every mode, i.e. also on the prediction data.
sc = MinMaxScaler()
X = pd.DataFrame(sc.fit_transform(X))

# Sanity check: no missing values may be left at this point.
X.isna().any().any()

False

In [222]:

# Min-max scale the target (training modes only).
# min_y and range_y stay available at notebook level because the prediction cell uses
# them to map scaled predictions back to prices.
if mode == "TRAIN_EXPLORE" or mode == "TRAIN":
    # Series.min()/max() are vectorized and skip NaN, unlike the Python builtins.
    min_y = y.min()
    max_y = y.max()
    range_y = max_y - min_y
    # Vectorized equivalent of mapping (value - min_y) / range_y over every element.
    y = (y - min_y) / range_y

In [223]:
# Display the target after scaling (it is only rescaled in the TRAIN / TRAIN_EXPLORE modes).
y

0       0.241078
1       0.203583
2       0.261908
3       0.145952
4       0.298709
          ...   
1455    0.194556
1456    0.243161
1457    0.321622
1458    0.148903
1459    0.156367
Name: SalePrice, Length: 1460, dtype: float64

### List of Features

In [224]:
import pprint

# Column labels of the preprocessed feature matrix, printed compactly with their count.
features = X.columns.tolist()
pprint.pprint(features, compact=True)
print("# of features:", len(features))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
 213, 214, 215, 216, 217, 218, 21

### Splitting

In [225]:
# Hold out 20% of the labelled data for evaluation.
# NOTE: a split is still needed because the separate test CSV has no response variable.
if mode == "TRAIN_EXPLORE":
    from sklearn.model_selection import train_test_split

    # TODO random_state, test_size
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        random_state=1,
        test_size=0.2,
    )


In [226]:
# In TRAIN mode nothing is held out: the model is fit on the complete data set.
if mode == "TRAIN":
    X_train, y_train = X, y

In [227]:
# Quick look at the training target.
# Swap in X_train, X_test or y_test here to inspect the other splits.
print(y_train)

0       0.241078
1       0.203583
2       0.261908
3       0.145952
4       0.298709
          ...   
1455    0.194556
1456    0.243161
1457    0.321622
1458    0.148903
1459    0.156367
Name: SalePrice, Length: 1460, dtype: float64


## Training

### Multiple Linear Regression model

In [228]:

# Fit an ordinary least-squares model on the training split (training modes only).
# The condition compares `mode` against both values explicitly; the former
# `mode == "TRAIN" or "TRAIN_EXPLORE"` was always true because a non-empty string is truthy.
if mode == "TRAIN" or mode == "TRAIN_EXPLORE":
    from sklearn.linear_model import LinearRegression
    multi_lin_reg = LinearRegression()
    multi_lin_reg.fit(X_train, y_train)

In [229]:
# Mean squared error of the linear model on the held-out split (in scaled target units).
if mode == "TRAIN_EXPLORE":
    y_pred = multi_lin_reg.predict(X_test)
    sq_error1 = (y_test - y_pred).pow(2)
    msq_error1 = sq_error1.mean()
    print(msq_error1)

# Mean squared error

### Decision Tree Regression model

In [230]:

# Fit a decision-tree regressor on the training split (training modes only).
# The condition compares `mode` against both values explicitly; the former
# `mode == "TRAIN" or "TRAIN_EXPLORE"` was always true because a non-empty string is truthy.
if mode == "TRAIN" or mode == "TRAIN_EXPLORE":
    from sklearn.tree import DecisionTreeRegressor
    # Fixed random_state keeps the fitted tree reproducible between runs.
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)

In [231]:
# Mean squared error of the decision tree on the held-out split (in scaled target units).
if mode == "TRAIN_EXPLORE":
    y_pred2 = regressor.predict(X_test)
    sq_error2 = (y_test - y_pred2).pow(2)
    msq_error2 = sq_error2.mean()
    print(msq_error2)

# Mean Squared Error

### Polynomial Regression models

In [232]:
from sklearn.preprocessing import PolynomialFeatures

#### Degree 2 (Not optimal)

In [233]:
if mode in ("TRAIN", "TRAIN_EXPLORE"):
    # Step A: expand the features with all polynomial terms up to degree 2.
    poly_reg = PolynomialFeatures(degree=2)
    X_poly2 = poly_reg.fit_transform(X_train)

    # Step B: fit a plain linear model on the expanded features.
    lin_reg_quad = LinearRegression()
    lin_reg_quad.fit(X_poly2, y_train)

In [234]:
# Mean squared error of the degree-2 polynomial model on the held-out split.
if mode == "TRAIN_EXPLORE":
    # Reuse the transformer fitted on the training data: transform(), not fit_transform().
    # Held-out data must never be used to (re)fit a preprocessing step.
    y_pred3 = lin_reg_quad.predict(poly_reg.transform(X_test))
    sq_error3 = (y_test-y_pred3)**2
    msq_error3 = sq_error3.mean()
    print(msq_error3)

#### Degree 3 (non-optimal memory requirements) SKIPPING

In [235]:
# poly_reg = PolynomialFeatures(degree = 3)
# X_poly3 = poly_reg.fit_transform(X_train)
# lin_reg_cubic = LinearRegression()
# lin_reg_cubic.fit(X_poly3, y_train)
# y_pred4 = lin_reg_cubic.predict(poly_reg.fit_transform(X_test))
# y_pred4

# Ran out of memory :( ("Unable to allocate 27.6 GiB ...") -- NOT OPTIMAL. SKIPPING!

In [236]:
# sq_error4 = (y_test-y_pred4)**2
# msq_error4 = sq_error4.mean()
# msq_error4

### Random Forest Regression model

In [237]:

if mode in ("TRAIN", "TRAIN_EXPLORE"):
    from sklearn.ensemble import RandomForestRegressor

    # 25 trees; the fixed random_state keeps the fitted forest reproducible.
    rf_regressor = RandomForestRegressor(random_state=0, n_estimators=25)
    rf_regressor.fit(X_train, y_train)

In [238]:
# Mean squared error of the random forest on the held-out split (in scaled target units).
if mode == "TRAIN_EXPLORE":
    y_pred6 = rf_regressor.predict(X_test)
    sq_error6 = (y_test - y_pred6).pow(2)
    msq_error6 = sq_error6.mean()
    print(msq_error6)


#### Trying to make a better model by taking the most important metrics

In [239]:
# Rank the features by the importance the fitted random forest assigned to them.
if mode == "TRAIN_EXPLORE":
    importances = list(rf_regressor.feature_importances_)
    # One (feature label, importance, column position) triple per feature, highest importance first.
    sorted_importance = sorted(
        zip(features, importances, range(len(features))),
        key=lambda item: item[1],
        reverse=True,
    )
    # Keep the 50 highest-ranked features; the later cells plot and select from this table.
    most_important_features = pd.DataFrame(
        sorted_importance, columns=['Feature', 'Importance', 'Index']
    ).head(50)
    # An expression inside an `if` body is not rendered as cell output, so print explicitly.
    print(most_important_features.head(20))


### Building a better model (Feature Importance)

In [240]:
# Bar chart of the ranked feature importances.
if mode == "TRAIN_EXPLORE":
    fig, ax = plt.subplots(figsize=(12, 5))
    feature_labels = most_important_features["Feature"]
    ax.bar(feature_labels, most_important_features["Importance"], orientation='vertical')

    # One tick per plotted feature, slanted so that neighbouring labels do not overlap.
    ax.set_xticks(feature_labels)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylim(0, 0.21)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Feature')
    ax.set_title('Feature Importance')
    plt.show()

In [241]:
# Retrain a random forest on only the 7 highest-ranked features and evaluate it.

if mode == "TRAIN_EXPLORE":
    # Column 2 of the importance table ("Index") holds the positional column labels of X.
    topfeatures = most_important_features.iloc[0:7,2]
    X_train_2 = X_train[topfeatures]
    X_test_2 = X_test[topfeatures]
    # Train the reduced model
    rf_regressor_2 = RandomForestRegressor(n_estimators = 34, random_state = 0)
    rf_regressor_2.fit(X_train_2, y_train)
    y_pred_2 = rf_regressor_2.predict(X_test_2)
    # Mean squared error on the held-out split
    sq_error_2 = (y_test-y_pred_2)**2
    msq_error_2 = sq_error_2.mean()
    # Printed like the other models' errors: a bare expression inside `if` is never displayed.
    print(msq_error_2)


In [242]:
# The seven highest-ranked features of this dataset.
most_important_features.head(7)

Unnamed: 0,Feature,Importance,Index
0,2,0.577948,2
1,14,0.115367,14
2,10,0.036235,10
3,11,0.030108,11
4,7,0.029889,7
5,12,0.029043,12
6,25,0.019356,25


# Prediction on test data

In [245]:
# Predict sale prices for the preprocessed prediction data and export them.
# Relies on objects created by an earlier TRAIN_EXPLORE run in the same kernel:
# topfeatures, rf_regressor_2, min_y and range_y.

if mode == "PREDICT":
    # Keep only the columns the reduced random forest was trained on.
    X_test = X[topfeatures]

    # Predict in the scaled target space, then undo the min-max scaling of the target.
    y_pred_test = rf_regressor_2.predict(X_test)
    y_pred_test = min_y + (y_pred_test * range_y)

    # One row per house: its Id next to the predicted price.
    out = X_id.to_frame(name="Id").assign(SalePrice=y_pred_test)

    print(out)
    out.to_csv('prediction-week-2a.csv', index=False)

        Id      SalePrice
0        1  208749.735294
1        2  126681.735294
2        3  221920.941176
3        4  213617.352941
4        5  278519.294118
...    ...            ...
1455  1456  161648.529412
1456  1457  149989.705882
1457  1458  259338.705882
1458  1459  104350.000000
1459  1460  112641.911765

[1460 rows x 2 columns]
