# Data Analytics

## Importing Packages

In [34]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Show the working directory: the relative "./data/..." path used to load the
# CSV is resolved against it, so the notebook must be started from the project root.
print(os.getcwd())

/Users/javian/TheCode/Housing-Price-Time-Series-Forecasting


## Inspect the Dataset

In [35]:
# Read the prepared modelling table (path is relative to the working directory)
DATASET_PATH = "./data/train-test/data_nerual_network_models.csv"
data = pd.read_csv(DATASET_PATH)

# Preview the first rows
data.head()

Unnamed: 0,storey_range,resale_price,poi_vector,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,...,flat_model_Simplified,flat_model_Standard,flat_model_Terrace,flat_model_Type S1,flat_model_Type S2,floor_area_sqm,remaining_lease,sora,bto_supply_within_4km,distance2cbd
0,2,12.449019,[-3.3962243e+00 -4.0309663e+00 3.7387407e+00 ...,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.546558,-0.345484,-0.802967,-0.833389,-0.524994
1,0,12.524526,[-3.4233415 -4.574654 3.5261717 -0.558677...,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.213944,-0.727997,-0.802967,-0.16797,-0.58202
2,0,12.560244,[-3.3962243e+00 -4.0309663e+00 3.7387407e+00 ...,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.172367,-0.8045,-0.802967,-0.899113,-0.565965
3,0,12.577636,[-3.2633016e+00 -4.6608467e+00 3.5853505e+00 ...,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.213944,-0.881003,-0.802967,-0.296103,-0.738546
4,2,12.577636,[-3.4233415 -4.574654 3.5261717 -0.558677...,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.213944,-0.8045,-0.802967,-0.04403,-0.637076


In [36]:
def _poi_vector_mean(vector_text):
    """Return the mean of a stringified vector such as "[0.1 -2.3e+00 4.5]".

    The surrounding square brackets are stripped, the remainder is split on
    whitespace, and every token is parsed as a float before averaging.
    """
    return np.mean([float(token) for token in vector_text.strip('][').split()])


# Collapse each poi_vector string to its mean and drop the raw column.
# The guard keeps the cell safe to re-run: the previous in-place drop made a
# second execution fail with KeyError because 'poi_vector' was already gone.
if 'poi_vector' in data.columns:
    data = data.assign(
        poi_vector_mean=data['poi_vector'].apply(_poi_vector_mean)
    ).drop(columns=['poi_vector'])
elif 'poi_vector_mean' not in data.columns:
    # Neither the raw nor the derived column exists: the input table is not
    # what this cell expects, so fail loudly instead of continuing.
    raise KeyError("expected a 'poi_vector' column in data")

In [37]:
# Preview the table again: poi_vector should be gone and poi_vector_mean added as the last column
data.head()

Unnamed: 0,storey_range,resale_price,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,...,flat_model_Standard,flat_model_Terrace,flat_model_Type S1,flat_model_Type S2,floor_area_sqm,remaining_lease,sora,bto_supply_within_4km,distance2cbd,poi_vector_mean
0,2,12.449019,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.546558,-0.345484,-0.802967,-0.833389,-0.524994,0.218139
1,0,12.524526,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.213944,-0.727997,-0.802967,-0.16797,-0.58202,0.237577
2,0,12.560244,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.172367,-0.8045,-0.802967,-0.899113,-0.565965,0.218139
3,0,12.577636,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.213944,-0.881003,-0.802967,-0.296103,-0.738546,0.225553
4,2,12.577636,0,0,0,0,0,0,0,0,...,0,0,0,0,-1.213944,-0.8045,-0.802967,-0.04403,-0.637076,0.237577


In [38]:
# List the dtype of every column in the table
print(data.dtypes)

storey_range                           int64
resale_price                         float64
town_BEDOK                             int64
town_BISHAN                            int64
town_BUKIT BATOK                       int64
town_BUKIT MERAH                       int64
town_BUKIT PANJANG                     int64
town_BUKIT TIMAH                       int64
town_CENTRAL AREA                      int64
town_CHOA CHU KANG                     int64
town_CLEMENTI                          int64
town_GEYLANG                           int64
town_HOUGANG                           int64
town_JURONG EAST                       int64
town_JURONG WEST                       int64
town_KALLANG/WHAMPOA                   int64
town_MARINE PARADE                     int64
town_PASIR RIS                         int64
town_PUNGGOL                           int64
town_QUEENSTOWN                        int64
town_SEMBAWANG                         int64
town_SENGKANG                          int64
town_SERAN

In [39]:
# Standardise the continuous columns; all other columns are left untouched.
# NOTE(review): the scaler is fitted on the whole table before the
# train/validation/test split, so held-out rows contribute to the scaling
# statistics - consider fitting on the training rows only.
numeric_columns = ['floor_area_sqm', 'remaining_lease', 'sora', 'bto_supply_within_4km', 'distance2cbd']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

In [40]:
# Separate the regression target from the predictor columns
target_column = 'resale_price'
y = data[target_column]
X = data.drop(columns=[target_column])

In [41]:
# Hold out 30 % of the rows, then halve that hold-out into validation and test
# (70 / 15 / 15 overall); both splits use the same fixed random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Replace every pandas split with a plain numpy array
X_train, X_val, X_test, y_train, y_val, y_test = (
    np.array(split) for split in (X_train, X_val, X_test, y_train, y_val, y_test)
)


In [42]:
# Report the dtype, then the shape, of the training and validation arrays
arrays_to_check = {"X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val}
for attribute in ("dtype", "shape"):
    for label, array in arrays_to_check.items():
        print(f"{label} {attribute}:", getattr(array, attribute))

X_train dtype: float64
y_train dtype: float64
X_val dtype: float64
y_val dtype: float64
X_train shape: (144420, 47)
y_train shape: (144420,)
X_val shape: (30947, 47)
y_val shape: (30947,)


In [43]:
# Stacked LSTM regressor: each row is presented to the network as a sequence
# of X_train.shape[1] steps carrying a single value per step.
n_features = X_train.shape[1]
model = Sequential([
    # Explicit Input layer instead of input_shape= on the first LSTM, which
    # Keras warns about for Sequential models.
    Input(shape=(n_features, 1)),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    # return_sequences is left at its default here, so only the final step's
    # output is passed on to the Dense head.
    LSTM(units=50),
    Dropout(0.2),
    # Single unit: one regression value per row
    Dense(units=1),
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the best epoch's weights. Without restore_best_weights the
# model would keep the weights of the last epoch, i.e. `patience` epochs
# after the best one.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

  super().__init__(**kwargs)


In [44]:
# Fit for at most 100 epochs, scoring the validation split after each one;
# the early_stop callback may end training sooner.
# NOTE(review): X_train is 2-D according to the printed shapes while the first
# LSTM is declared for (steps, 1) input - this appears to rely on Keras adding
# the trailing axis itself; confirm, or reshape explicitly.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=100,
    callbacks=[early_stop],
)

Epoch 1/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 26ms/step - loss: 6.3261 - val_loss: 0.1216
Epoch 2/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 27ms/step - loss: 0.8752 - val_loss: 0.1281
Epoch 3/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 27ms/step - loss: 0.6767 - val_loss: 0.0267
Epoch 4/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 26ms/step - loss: 0.4806 - val_loss: 0.0301
Epoch 5/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 27ms/step - loss: 0.3637 - val_loss: 0.0214
Epoch 6/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 28ms/step - loss: 0.2541 - val_loss: 0.0189
Epoch 7/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 28ms/step - loss: 0.1708 - val_loss: 0.0192
Epoch 8/100
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 27ms/step - loss: 0.1111 - val_loss:

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out test split (loss is the compiled mean squared error)
loss = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {loss}")

In [None]:
# Draw one curve per loss series recorded during training
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Run the trained model on the held-out test features
predictions = model.predict(X_test)

In [None]:
# Overlay the model's test predictions on the true targets, one point per test row.
# NOTE(review): the targets shown in the data preview are around 12.4-12.6, which
# looks like a log-transformed price - confirm the axis label is still appropriate.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test, label='Actual')
ax.plot(predictions, label='Predicted')
ax.set_title('Actual vs Predicted')
ax.set_xlabel('Samples')
ax.set_ylabel('Resale Price')
ax.legend()
plt.show()