from https://www.kaggle.com/code/hugosjoberg/house-prices-prediction-using-keras

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler # Used for scaling of data
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K
from keras.wrappers.scikit_learn import KerasRegressor

In [6]:
# Presumably a switch for optional plots; it is not read by any of the visible cells — TODO confirm it is still needed.
show_graphs = False

In [7]:
# Read in train data
try:
    df_train = pd.read_csv('untidy_df_listings_v06.csv', index_col=0)
except:
    !wget https://raw.githubusercontent.com/jayportfolio/capstone_streamlit/main/data/final/untidy_df_listings_v06.csv
    df_train = pd.read_csv('untidy_df_listings_v06.csv', index_col=0)

In [8]:
# Preview the first rows of the loaded listings
df_train.head()

Unnamed: 0,Price,bedrooms,bathrooms,nearestStation,location.latitude,location.longitude,latitude_deviation,longitude_deviation,tenure.tenureType
14520525,550000.0,3.0,1.0,0.274316,51.52995,-0.20702,0.030472,0.1018,LEASEHOLD
27953107,400000.0,2.0,2.0,0.305845,51.54939,-0.4826,0.049912,0.37738,LEASEHOLD
33593487,579950.0,2.0,1.0,0.438045,51.44718,-0.33877,0.052298,0.23355,FREEHOLD
35271294,370000.0,2.0,1.0,0.399307,51.449568,-0.140154,0.04991,0.034934,LEASEHOLD
35429088,599950.0,2.0,1.0,0.238187,51.57703,-0.14123,0.077552,0.03601,


# Prepare data
    Investigate which features have a linear (or some other kind of) relationship to the sale price
    Drop the unimportant or less important features
    Drop features which have many NaN values

In [9]:
# Descriptive statistics summary (count, mean, std, min, quartiles, max) of the 'Price' column
df_train['Price'].describe()

count     54067.000000
mean     416448.380528
std      113505.624206
min      100000.000000
25%      325000.000000
50%      425000.000000
75%      500000.000000
max      600000.000000
Name: Price, dtype: float64

In [10]:
# Skewness and kurtosis of the 'Price' column
price_column = df_train['Price']
price_skewness = price_column.skew()
price_kurtosis = price_column.kurt()
print("Skewness: %f" % price_skewness)
print("Kurtosis: %f" % price_kurtosis)

Skewness: -0.220900
Kurtosis: -0.724044


    - Skewness means the peak of the distribution (the top of the iceberg) is not in the middle but shifted towards the left or the right.
    - Kurtosis describes whether the distribution is very narrow and peaked or very wide compared with a Gaussian distribution.

Let's have a  look at the missing data.

Let's display a % of the data that is missing from some columns.

In [11]:
# Missing data: per-column count and fraction of null entries, sorted in descending order
null_mask = df_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
tenure.tenureType,3654,0.067583
bathrooms,3498,0.064698
bedrooms,1802,0.033329
location.latitude,10,0.000185
latitude_deviation,10,0.000185
Price,0,0.0
nearestStation,0,0.0
location.longitude,0,0.0
longitude_deviation,0,0.0


Some of these features are of interest to us and they don't show a massive shortage of data, so let's fill their missing values with the column mean.

In [12]:
# Fill missing numeric values with their column mean.
# numeric_only=True keeps the non-numeric 'tenure.tenureType' column out of the
# mean computation; without it pandas emits a warning on older versions and
# raises a TypeError on pandas >= 2.0. Missing values in non-numeric columns
# are left as they are.
df_train = df_train.fillna(df_train.mean(numeric_only=True))

  df_train = df_train.fillna(df_train.mean())


Now let's remove outliers, for example data that doesn't match what we expect, like an insane price for a house.

To do this we standardize the data so that it has a mean of 0 and a standard deviation of 1.

In [13]:
# Standardize 'Price' and print the 10 lowest and 10 highest scaled values so
# that potential outliers at either end of the distribution become visible.
# .to_numpy()[:, np.newaxis] turns the column into the 2-D (n, 1) array that is
# passed to StandardScaler; indexing the Series itself with [:, np.newaxis] is
# deprecated multi-dimensional indexing and triggers a pandas warning.
saleprice_scaled = StandardScaler().fit_transform(df_train['Price'].to_numpy()[:, np.newaxis])
# Sort once and reuse the sorted array for both ends.
saleprice_sorted = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = saleprice_sorted[:10]
high_range = saleprice_sorted[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

outer range (low) of the distribution:
[[-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]
 [-2.78797909]]

outer range (high) of the distribution:
[[1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]
 [1.61712971]]


  saleprice_scaled = StandardScaler().fit_transform(df_train['Price'][:,np.newaxis]);


    - Values that are similar to each other stay close to 0.
    - Values that are a bit odd get large absolute values (such as 7). Here the scaled prices only range from about -2.79 to 1.62, so no extreme outliers stand out.

# Prepare data
Right now I think we have an idea of what kind of data we are interested in and what data we don't think are useful for us. Let's build a pipeline for removing the data.

Let's reload the data so we can have a fresh start!

In [14]:
# Reload the CSV so that the exploratory changes made to df_train above are discarded
df_train = pd.read_csv('untidy_df_listings_v06.csv', index_col=0)

Let's not log-transform the data, since a neural network is quite good at working with non-linear data. I also tested and verified that the model didn't perform better or worse if I log-transformed the data beforehand.

In [15]:
# Select the target and feature columns, impute missing values, split into
# train/test and standardize the features.
cols = ['Price','bedrooms','bathrooms','nearestStation','location.latitude','location.longitude','latitude_deviation','longitude_deviation']
features = ['bedrooms','bathrooms','nearestStation','location.latitude','location.longitude','latitude_deviation','longitude_deviation']
df_train = df_train[cols]
# Create dummy values
df_train = pd.get_dummies(df_train)
# Fill NA's with the mean of the column
df_train = df_train.fillna(df_train.mean())
# y is just the 'Price' column
y = df_train['Price'].values
seed = 7
np.random.seed(seed)
# Split into 67% for train and 33% for test BEFORE scaling, so that the scaler
# statistics are computed from the training rows only and no information
# about the test rows leaks into the features the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(df_train[features], y, test_size=0.33, random_state=seed)
# Always standard scale the data before using NN: fit on the training split,
# then apply the same transformation to the test split.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
X_test.shape, y_test.shape

((17843, 7), (17843,))

In [16]:
def create_model(input_dim=None):
    """Build and compile the fully connected regression network.

    The network stacks three hidden ReLU layers (10, 30 and 40 units) and a
    single output unit without activation. It is compiled with the Adam
    optimizer, mean squared error as the loss and mean absolute error as an
    additional metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the column count of the
        notebook-level ``X_train`` is used, so existing ``create_model()``
        calls keep working unchanged.

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    if input_dim is None:
        # Fall back to the notebook's training matrix for backward compatibility.
        input_dim = X_train.shape[1]
    # create model
    model = Sequential()
    model.add(Dense(10, input_dim=input_dim, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(40, activation='relu'))
    model.add(Dense(1))
    # Compile model
    model.compile(optimizer ='adam', loss = 'mean_squared_error', 
              metrics =[metrics.mae])
    return model

In [17]:
# Build the network and print its layer-by-layer summary
model = create_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                80        
                                                                 
 dense_1 (Dense)             (None, 30)                330       
                                                                 
 dense_2 (Dense)             (None, 40)                1240      
                                                                 
 dense_3 (Dense)             (None, 1)                 41        
                                                                 
Total params: 1,691
Trainable params: 1,691
Non-trainable params: 0
_________________________________________________________________


2022-11-28 14:02:34.651485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Train for 150 epochs with a batch size of 32, passing the test split as validation data
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=150,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
 196/1132 [====>.........................] - ETA: 1s - loss: 11422338048.0000 - mean_absolute_error: 88191.0391

Exception ignored in: <function UniquePtr.__del__ at 0x7ffa73466430>
Traceback (most recent call last):
  File "/home/guava/PycharmProjects/capstone_streamlit/venv/lib/python3.8/site-packages/tensorflow/python/framework/c_api_util.py", line 70, in __del__
    def __del__(self):
KeyboardInterrupt: 


KeyboardInterrupt: 

Let's investigate how well this model did!

In [None]:
# Summarize history for the mean absolute error (MAE) metric.
# MAE is an error measure, not an accuracy, so the title and y-label name it as such.
fig, ax = plt.subplots()
ax.plot(history.history['mean_absolute_error'])
ax.plot(history.history['val_mean_absolute_error'])
ax.set_title('model mean absolute error')
ax.set_ylabel('mean absolute error')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()
# Summarize history for the loss
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

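This result is not very good and gives us a mean absolute error just above 20000 dollars. I believe this model performs badly because we have a quite small data set, and a neural network performs best when it has a big data set. Note that the interrupted training run above was still reporting a training mean absolute error of roughly 88,000 at epoch 46, so the figure quoted here should be re-checked against a completed run.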

In [None]:
# Read in train data
try:
    df_train = pd.read_csv('untidy_df_listings_v06.csv', index_col=0)
except:
    !wget https://raw.githubusercontent.com/jayportfolio/capstone_streamlit/main/data/final/untidy_df_listings_v06.csv
    df_train = pd.read_csv('untidy_df_listings_v06.csv', index_col=0)

In [None]:
# X_test was already standardized in the data-preparation cell, so it is used
# as-is. Fitting a second StandardScaler on the test split alone would re-scale
# it with statistics that differ from the ones applied to the training
# features and distort the predictions.
X_test.shape

In [None]:
# Run the model on the test features
prediction = model.predict(X_test)

In [None]:
# Put predicted and actual prices side by side for inspection
submission = pd.DataFrame({
    'Predicted Price': prediction.flatten(),
    'Actual': y_test,
})
submission