## CS4287 - Neural Network
### Assignment 1 - 4th Year Semester 1 2024

Adam Collins: 21332967

Italo da Silva: 21326312

The code executes to the end without an error.

In [None]:
# Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [None]:
# 1. The Data Set
#   (a). Visualisation of key attributes.

# Read the Melbourne housing CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
housing_data = pd.read_csv('Melbourne_housing_FULL.csv', low_memory=False)

# Filtering the dataset
# Keep only the rows that have a value in every column listed here
# (a lot of rows are missing at least one of them).
columns = [
    "Suburb", "Rooms", "Type", "Price", "Method", "SellerG", "Date",
    "Distance", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea",
    "YearBuilt", "CouncilArea", "Lattitude", "Longtitude", "Regionname",
    "Propertycount",
]
housing_data = housing_data.dropna(subset=columns)

# Remove the columns that do not hold numbers (there are a lot of those too).
non_numeric_columns = [
    "Suburb", "Address", "Type", "Method",
    "SellerG", "CouncilArea", "Regionname", "Date",
]
housing_data = housing_data.drop(columns=non_numeric_columns)

# Preview the first rows of the filtered frame.
housing_data.head()

In [None]:
# Plot Building Area against Price together with a fitted regression line

## Make sure both plotted columns hold numeric values
plot_columns = ['Price', 'BuildingArea']
housing_data[plot_columns] = housing_data[plot_columns].apply(pd.to_numeric)

## LinearRegression is given 2D input, so turn each column into an (n, 1) column vector
price = housing_data['Price'].to_numpy().reshape(-1, 1)
building_area = housing_data['BuildingArea'].to_numpy().reshape(-1, 1)

## Fit a linear model that predicts Building Area from Price
reg = LinearRegression()
reg.fit(price, building_area)

## Draw the raw samples as stars and the fitted line in red
fig, ax = plt.subplots()
ax.plot(housing_data[['Price']], housing_data[['BuildingArea']], '*')
ax.plot(housing_data[['Price']], reg.predict(price), 'r')

## Label the axes and title the figure
ax.set_xlabel('Price')
ax.set_ylabel('Building Area')
ax.set_title('Price vs Building Area')
plt.show()

In [None]:
# From sample Assignment 1 provided by the lecturer.

# Pairwise correlation between all remaining columns.
# It is computed once and reused for the heatmap below; a bare
# `housing_data.corr()` in the middle of a cell is not displayed,
# so the previous extra call only repeated the work.
corr_matrix = housing_data.corr()

# Generate the heatmap, annotating every cell with its coefficient (2 decimals)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='jet', cbar=True, fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 1. The Data Set
#   (b). Pre-Processing - Normalisation

# Columns that get min-max scaled (MinMaxScaler's default output range is [0, 1]).
scaled_columns = [
    'Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Propertycount',
]

# Normalising the data
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split that happens in a later cell.
scaler = MinMaxScaler()
housing_data[scaled_columns] = scaler.fit_transform(housing_data[scaled_columns])

# Postcode, Landsize and Propertycount are dropped because their values are very close to 0.
# Longtitude and Lattitude (spelled as in the dataset) are dropped as well: they add nothing
# for correlating price since a Distance (to CBD) column is already available.
dropped_columns = ["Postcode", "Landsize", "Propertycount", "Longtitude", "Lattitude"]

# Since rows were removed earlier (NaN filtering), the surviving rows still carry
# their original index labels. Resetting the index renumbers them from 0 in order.
housing_data = housing_data.drop(columns=dropped_columns).reset_index(drop=True)

housing_data.head()

In [None]:
# DataFrame.shape is (number of rows, number of columns)
rows, columns = housing_data.shape

print("Number of columns: ", columns)
print("Number of rows: ", rows)

In [None]:
# Select the features and target variable
# The aim is to relate the price of each property to its features.
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'BuildingArea', 'YearBuilt']

x = housing_data[features]
y = housing_data['Price']

# Divide the (min-max scaled) price column into 5 bands that serve as class labels,
# so it can be used with the 5-unit output layer of the model.
#
# 0.00 to 0.15 -> label 0 (first section)
# 0.15 to 0.38 -> label 1 (second section)
# 0.38 to 0.62 -> label 2 (third section)
# 0.62 to 0.85 -> label 3 (fourth section)
# 0.85 to 1.00 -> label 4 (last section)
#
# include_lowest=True closes the first band on the left. By default pd.cut builds
# right-closed bins, i.e. (0.00, 0.15], so a price of exactly 0.0 (the value the
# min-max scaling gives to the cheapest property) would fall outside every bin
# and be labelled NaN.
price_bins = [0.00, 0.15, 0.38, 0.62, 0.85, 1.00]
y = pd.cut(y, bins=price_bins, labels=[0, 1, 2, 3, 4], include_lowest=True)
print(y)

# In order to validate and evaluate the model, split the dataset into two parts:
# training and testing data. random_state=0 keeps the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print("\nx_train shape: ", x_train.shape[0])
print("x_test shape: ", x_test.shape[0])

In [None]:
# Seed TensorFlow's global random generator so that random operations such as
# weight initialisation start from the same state on every run. This improves
# run-to-run repeatability, in line with the fixed random_state used for the
# train/test split.
tf.random.set_seed(0)

# One hot representation of the samples (Lab3 - Exercise 2)
# NOTE: y_train / y_test are overwritten here, so re-run the cell that creates
# the split before re-running this cell; otherwise the already one-hot encoded
# labels would be encoded a second time.
num_classes = 5
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

# Defining model
model = tf.keras.models.Sequential()

# Hidden layer: 24 ReLU units, fed with one input value per selected feature
model.add(tf.keras.layers.Dense(24, input_shape=(len(features),), activation='relu'))

# Softmax classification - Converts the output for each class to a probability value between 0-1,
# which is exponentially normalized among the classes.
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Adam combines momentum with per-parameter adaptive step sizes.
# learning_rate=0.01 is ten times the Keras default of 0.001.
optimiser = tf.keras.optimizers.Adam(learning_rate=0.01)

# Compile the model with the categorical_crossentropy loss function. It is used here
# because it computes the loss between the one-hot labels (the price bands classified
# previously) and the predicted class probabilities.
model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
# NOTE(review): the test split doubles as validation data during training.
model.fit(x_train, y_train, batch_size=32, epochs=100, verbose=1, validation_data=(x_test, y_test))