In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
file_path = 'datasets/melbourne_housing_raw.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Parse the 'Date' column as datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
df = df[['Date','Lattitude','Longtitude','YearBuilt','BuildingArea','Landsize','Car','Bathroom','Bedroom2','Distance','Price']]

# sort the rows by date
df = df.sort_values('Date')

# make an additional column with day count starting from the first date
df['DayCount'] = (df['Date'] - df['Date'].min())  / np.timedelta64(1,'D')

# divide the price by 1000 to make it easier to read
df['Price'] = df['Price'] / 1000
# Drop rows with missing values
df = df.dropna()


# save the df to a new csv file
df[['Date','Lattitude','Longtitude','YearBuilt','BuildingArea','Landsize','Car','Bathroom','Bedroom2','Distance','Price']].to_csv('datasets/melbourne_housing_clean.csv', index=False)

In [7]:

df = df[['Lattitude','Longtitude','YearBuilt','BuildingArea','Landsize','Car','Bathroom','Bedroom2','Distance']]

# print the min max of each column
print(df.describe().loc[['min','max']])

# print the mean and std of each column
print(df.describe().loc[['mean','std']])

n = 100
# select n random samples (np.array) within the min max range
samples_uniform = np.zeros((n, 9))
for i in range(9):
    samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)


# sample n random normal samples (np.array) within the mean std range
samples_normal = np.zeros((n, 9))
for i in range(9):
    samples_normal[:,i] = np.random.normal(df.describe().loc['mean'][i], df.describe().loc['std'][i], n)


# sample n random samples from the dataset withou replacement
samples_exact = df.sample(n)

anchors = {'uniform': samples_uniform, 'normal': samples_normal, 'exact': samples_exact}
# pickle the samples for later use
import pickle
with open('melbourne_anchor_samples.pkl', 'wb') as f:
    pickle.dump(anchors, f)





     Lattitude  Longtitude  YearBuilt  BuildingArea  Landsize   Car  Bathroom  \
min  -38.17436   144.42379     1196.0           0.0       0.0   0.0       1.0   
max  -37.40720   145.52635     2019.0        3112.0   42800.0  10.0       9.0   

     Bedroom2  Distance  
min       0.0       0.0  
max      12.0      47.4  
      Lattitude  Longtitude    YearBuilt  BuildingArea     Landsize       Car  \
mean -37.804501  144.991393  1965.753348    149.309477   523.480365  1.692247   
std    0.090549    0.118919    37.040876     87.925580  1061.324228  0.975464   

      Bathroom  Bedroom2   Distance  
mean  1.646450  3.078204  11.199887  
std   0.721611  0.966269   6.813402  


  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['min'][i], df.describe().loc['max'][i], n)
  samples_uniform[:,i] = np.random.uniform(df.describe().loc['mi