## Housing Prices in Singapore

This exercise attempts to predict resale housing prices in Singapore.

In [None]:
# data 
import pandas as pd
import numpy as np

# directories
import os
#os.getcwd() #get working directory

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


In [None]:
# Build the paths to the training and test CSV files.
# "~" is the home-directory shorthand; the joined path is handed to read_csv as-is.
root_dir = "~"
data_folder = os.path.join(root_dir, "pythonprojects", "Yidu", "HW1 Data")
train_data_dir = os.path.join(data_folder, "train.csv")
test_data_dir = os.path.join(data_folder, "test.csv")

# Load each split into its own DataFrame.
train_data = pd.read_csv(train_data_dir)
test_data = pd.read_csv(test_data_dir)

# Report the number of rows in each split.
print("Training Data has {0} instances".format(train_data.shape[0]))
print("Test Data has {0} instances".format(test_data.shape[0]))

In [None]:
# List the column names, then display the summary statistics from describe().
print(train_data.columns)
train_data.describe()

### Data Exploration

In [None]:
# Column dtypes and non-null counts, to spot missing values early.
train_data.info()

In [None]:
# One histogram per numeric column: 50 bins each, on a 20x15 inch figure.
train_data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Scatter of flat locations; the very low alpha makes dense areas stand out.
train_data.plot.scatter(x="longitude", y="latitude", alpha=0.01)
plt.show()

In [None]:
# Target vector: the resale price of every training row.
y_train_data = train_data['resale_price']

In [None]:
# Feature frames: the training features exclude the target and the 'index'
# column; the test features exclude only the 'index' column.
non_feature_cols = ['resale_price', 'index']
x_train_data = train_data.loc[:, ~train_data.columns.isin(non_feature_cols)]
x_train_data.info()

x_test_data = test_data.loc[:, test_data.columns != 'index']
x_test_data.info()

### Data Preparation

In [None]:
# Preview the first rows of the training features.
x_train_data.head()

In [None]:
# uniqueness of datasets
print("street_name: ", len(set(x_train_data['street_name'])))
print("town: ", len(set(x_train_data['town'])))
print("flat_model: ", len(set(x_train_data['flat_model'])))
print("storey_range:", len(set(x_train_data['storey_range'])))
print("floor:", len(set(x_train_data['floor'])))
print("flat model: ", len(set(x_train_data['flat_model'])))
print("postal code: ", len(set(x_train_data['postal_code'])))
print("block: ", len(set(x_train_data['block'])))

In [None]:
# New feature: lease age in years, measured against a fixed reference year.
# `assign` returns a new frame instead of writing into a frame that was
# sliced out of the raw data with .loc, which can otherwise trigger pandas'
# SettingWithCopyWarning.
REFERENCE_YEAR = 2018
x_train_data = x_train_data.assign(
    lease_age=REFERENCE_YEAR - x_train_data['lease_commence_date']
)
x_test_data = x_test_data.assign(
    lease_age=REFERENCE_YEAR - x_test_data['lease_commence_date']
)

In [None]:
# Columns removed from both feature frames, with the reason for each.
columns_to_drop = [
    'street_name',          # too many unique values, problematic to encode
    'storey_range',         # too similar to floor
    'lease_commence_date',  # superseded by lease_age
    'postal_code',          # town is kept instead
    'block',                # too unique, probably not that important
    'latitude',             # town is kept instead
    'longitude',            # town is kept instead
    'month',                # original rationale: lease_age is kept instead
]

# A single boolean mask per frame keeps every column not listed above,
# in its existing order.
x_train_data = x_train_data.loc[:, ~x_train_data.columns.isin(columns_to_drop)]
x_test_data = x_test_data.loc[:, ~x_test_data.columns.isin(columns_to_drop)]


In [None]:
# Check the remaining columns before encoding.
x_train_data.head()

One Hot Encoding

In [None]:
# Distinct flat_type values present in the training features.
set(x_train_data['flat_type'])

In [None]:
 def map_flat_type_bedroom(flat_type):
     """Return the number of bedrooms assigned to a flat-type label.

     Parameters
     ----------
     flat_type : str
         One of '1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM',
         'EXECUTIVE' or 'MULTI GENERATION'.

     Returns
     -------
     int
         The bedroom count associated with ``flat_type``.

     Raises
     ------
     ValueError
         If ``flat_type`` is not one of the known labels (including missing
         values). Previously this surfaced as an opaque UnboundLocalError.
     """
     bedrooms_by_flat_type = {
         '1 ROOM': 0,
         '2 ROOM': 1,
         '3 ROOM': 2,
         '4 ROOM': 3,
         '5 ROOM': 3,
         'EXECUTIVE': 3,
         'MULTI GENERATION': 4,
     }
     try:
         return bedrooms_by_flat_type[flat_type]
     except (KeyError, TypeError):
         # KeyError: unknown label; TypeError: unhashable input.
         raise ValueError(
             "Unknown flat_type for bedroom mapping: {0!r}".format(flat_type)
         ) from None

 def map_flat_type_bath(flat_type):
     """Return the number of bathrooms assigned to a flat-type label.

     Parameters
     ----------
     flat_type : str
         One of '1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM',
         'EXECUTIVE' or 'MULTI GENERATION'.

     Returns
     -------
     int
         The bathroom count associated with ``flat_type``.

     Raises
     ------
     ValueError
         If ``flat_type`` is not one of the known labels (including missing
         values). Previously this surfaced as an opaque UnboundLocalError.
     """
     baths_by_flat_type = {
         '1 ROOM': 1,
         '2 ROOM': 1,
         '3 ROOM': 2,
         '4 ROOM': 2,
         '5 ROOM': 2,
         'EXECUTIVE': 2,
         'MULTI GENERATION': 3,
     }
     try:
         return baths_by_flat_type[flat_type]
     except (KeyError, TypeError):
         # KeyError: unknown label; TypeError: unhashable input.
         raise ValueError(
             "Unknown flat_type for bathroom mapping: {0!r}".format(flat_type)
         ) from None

In [None]:
# Derive bathroom and bedroom counts from flat_type for BOTH splits, so the
# test frame carries the same feature columns as the training frame.
# `assign` keeps the column order (no_bath, then no_bedroom) and returns a
# new frame rather than writing into a sliced one.
x_train_data = x_train_data.assign(
    no_bath=x_train_data['flat_type'].apply(map_flat_type_bath),
    no_bedroom=x_train_data['flat_type'].apply(map_flat_type_bedroom),
)
x_test_data = x_test_data.assign(
    no_bath=x_test_data['flat_type'].apply(map_flat_type_bath),
    no_bedroom=x_test_data['flat_type'].apply(map_flat_type_bedroom),
)

In [None]:
# Preview the training features, now including no_bath and no_bedroom.
x_train_data.head()

In [None]:
# Fill missing numeric values with the column medians of the TRAINING set.
# - numeric_only=True: the frames still contain string columns (e.g. town,
#   flat_type, flat_model), and recent pandas versions raise a TypeError when
#   median() is asked to reduce non-numeric columns.
# - The test set is filled with the training medians too, so both splits get
#   identical preprocessing and no test-set statistics are used.
train_medians = x_train_data.median(numeric_only=True)
x_train_data = x_train_data.fillna(train_medians)
x_test_data = x_test_data.fillna(train_medians)

In [None]:
# Re-check dtypes and non-null counts after the median fill.
x_train_data.info()

In [None]:
# Encode the categorical variables with pd.get_dummies; the result is
# the feature matrix used by the models below.

x_train_encoded = pd.get_dummies(x_train_data)

# Inspect the resulting column names
x_train_encoded.columns

### Model Selection

#### Linear Regression

In [None]:
import sklearn.linear_model
from sklearn.model_selection import cross_val_score

# Baseline model: ordinary least squares, evaluated with 3-fold CV.
lin_model = sklearn.linear_model.LinearRegression()
lin_scores = cross_val_score(
    estimator=lin_model,
    X=x_train_encoded,
    y=y_train_data,
    cv=3,
    scoring="neg_mean_absolute_error",
)

# The scorer reports negated MAE; flip the sign to get positive errors.
model1_mae_scores = -lin_scores
print(model1_mae_scores)

In [None]:
# Keep the first five encoded rows (and show their targets) as a small
# sanity-check sample for the fitted models.
some_data = x_train_encoded.head(5)
print(some_data)
print(y_train_data.head(5))

In [None]:
# Fit on the full training set, then compare predictions for the sample
# rows against their true resale prices.
y_predict = lin_model.fit(x_train_encoded, y_train_data).predict(some_data)
print(y_predict)
print(y_train_data.iloc[:5].tolist())

#### SGD Regressor

In [None]:
# Linear model trained by stochastic gradient descent, evaluated with 3-fold CV.
# random_state pins the shuffling so the scores are reproducible across runs.
# NOTE(review): SGD is sensitive to feature scaling and the encoded features
# are not standardised here - consider scaling them before this step.
sgdr_model = sklearn.linear_model.SGDRegressor(random_state=42)
sgdr_scores = cross_val_score(sgdr_model, x_train_encoded, y_train_data,
                         scoring="neg_mean_absolute_error", cv=3)
# The scorer reports negated MAE; flip the sign to get positive errors.
sgdr_mae_scores = -sgdr_scores
print(sgdr_mae_scores)

In [None]:
# Fit the SGD regressor on all training rows and check it on the sample rows.
sgdr_model.fit(X=x_train_encoded, y=y_train_data)
y_predict = sgdr_model.predict(some_data)
print(y_predict)
print(y_train_data.iloc[:5].tolist())

#### Lasso Regression

In [None]:
# L1-regularised linear regression with default settings, 3-fold CV.
lasso_model = sklearn.linear_model.Lasso()
lasso_scores = cross_val_score(
    estimator=lasso_model,
    X=x_train_encoded,
    y=y_train_data,
    cv=3,
    scoring="neg_mean_absolute_error",
)

# The scorer reports negated MAE; flip the sign to get positive errors.
lasso_mae_scores = -lasso_scores
print(lasso_mae_scores)

In [None]:
# Fit Lasso on all training rows and compare its predictions for the sample
# rows with the true resale prices.
y_predict = lasso_model.fit(x_train_encoded, y_train_data).predict(some_data)
print(y_predict)
print(y_train_data.iloc[:5].tolist())

In [None]:
from sklearn.svm import SVR

# Support vector regression with a linear kernel, evaluated with 3-fold CV.
svm_reg = SVR(kernel="linear")
svm_reg_scores = cross_val_score(svm_reg, x_train_encoded, y_train_data,
                         scoring="neg_mean_absolute_error", cv=3)
# Negate the SVR scores themselves; the earlier version negated lasso_scores
# here, so the printed "SVR" errors were really the Lasso errors.
svm_reg_mae_scores = -svm_reg_scores
print(svm_reg_mae_scores)

In [None]:
# Fit the SVR on all training rows and compare its predictions for the
# sample rows with the true resale prices.
svm_reg.fit(x_train_encoded, y_train_data)
# Predict with the estimator that was just fitted (`svm_reg`); the name
# `svm_reg_model` was never defined and raised a NameError.
y_predict = svm_reg.predict(some_data)
print(y_predict)
print(list(y_train_data.iloc[:5]))