In [28]:
import pandas as pd
import os
import numpy as np

# Specify the directory containing the CSV files
folder_path = '../output'

# List to hold dataframes from each CSV file
dfs = []

# Loop through each CSV file in the directory and read it into a DataFrame
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        dfs.append(df)

# Concatenate all the DataFrames into a single DataFrame
all_data = pd.concat(dfs, ignore_index=True)

# Drop duplicates based on 'flatmates_id'
deduplicated_data = all_data.drop_duplicates(subset='flatmates_id', keep='first')
deduplicated_data.head()

Unnamed: 0,flatmates_id,url,suburb,city,price,price_includes_bills,rooms_available,house_type,bedroom_count,bathroom_count,people_count,date
0,P1138770,/studio-sydney-enmore-2042-P1138770,Enmore,Sydney,310,True,1,Studio for rent,1,1,0,25-06-2022
1,P1205914,/share-house-sydney-alexandria-2015-P1205914,Alexandria,Sydney,310,False,2,Share House,4,3,4,25-06-2022
2,P1167439,/share-house-sydney-westmead-2145-P1167439,Westmead,Sydney,250,False,1,Share House,2,1,2,25-06-2022
3,P1032883,/share-house-sydney-carlingford-2118-P1032883,Carlingford,Sydney,170,True,1,Share House,4,2,4,25-06-2022
4,P1119758,/share-house-sydney-ashfield-2131-P1119758,Ashfield,Sydney,335,True,2,Flatshare,2,1,2,25-06-2022


In [29]:
print(df.size)
print(df.dropna().size)


87300
87300


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Convert 'date' column into ordinal numbers
df['date'] = pd.to_datetime(df['date']).apply(lambda x: x.toordinal())

df = pd.get_dummies(df, columns=['suburb', 'city', 'house_type'], drop_first=True)

# Split data into features and target variable
X = df.drop(['flatmates_id', 'url', 'price'], axis=1)
y = df['price']

# Split data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)

# Predict prices for the test dataset
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")



Mean Squared Error: 9658.882688918826


In [31]:
df.head()

Unnamed: 0,flatmates_id,url,price,price_includes_bills,rooms_available,bedroom_count,bathroom_count,people_count,date,suburb_Aberfoyle Park,...,city_West Melbourne,city_Wundowie,city_Wyndham Vale,house_type_Flatshare,house_type_Granny Flat for rent,house_type_Homestay,house_type_Share House,house_type_Student Accommodation,house_type_Studio for rent,house_type_Whole Property for rent
0,P1300087,/share-house-sydney-mascot-2020-P1300087,255,False,1,3,2,4,738612,0,...,0,0,0,1,0,0,0,0,0,0
1,P1302851,/share-house-sydney-ashfield-2131-P1302851,425,True,2,2,1,3,738612,0,...,0,0,0,1,0,0,0,0,0,0
2,P1300282,/share-house-sydney-bondi-2026-P1300282,700,False,1,2,1,2,738612,0,...,0,0,0,1,0,0,0,0,0,0
3,P1206920,/share-house-sydney-2000-P1206920,340,False,1,2,1,1,738612,0,...,0,0,0,1,0,0,0,0,0,0
4,P1298412,/share-house-sydney-sydney-olympic-park-2127-P...,485,True,3,3,2,3,738612,0,...,0,0,0,1,0,0,0,0,0,0


In [54]:
# Assuming `df` is the DataFrame of your data after preprocessing

# Creating a dictionary with the default values from the DataFrame
default_data = {}

train_features = X_train.columns
print(train_features)

for col in X_train.columns:
    default_data[col] = 0

# Update the dictionary with the data you provided
default_data['price_includes_bills'] = False
default_data['rooms_available'] = 3
default_data['bedroom_count'] = 3
default_data['bathroom_count'] = 1
default_data['suburb_Adelaide'] = 1


default_data['date'] = pd.Timestamp('2023-12-01').to_julian_date()  # Assuming 'date' column is represented in Julian format



# Convert to DataFrame
mock_data_df = pd.DataFrame([default_data])

# Predict using the trained model
predicted_price = model.predict(mock_data_df)

print(f"Predicted rental price in Adelaide on 1st December 2023: ${predicted_price[0]:.2f}")


Index(['price_includes_bills', 'rooms_available', 'bedroom_count',
       'bathroom_count', 'people_count', 'date', 'suburb_Aberfoyle Park',
       'suburb_Acacia Gardens', 'suburb_Acacia Ridge', 'suburb_Adelaide',
       ...
       'city_West Melbourne', 'city_Wundowie', 'city_Wyndham Vale',
       'house_type_Flatshare', 'house_type_Granny Flat for rent',
       'house_type_Homestay', 'house_type_Share House',
       'house_type_Student Accommodation', 'house_type_Studio for rent',
       'house_type_Whole Property for rent'],
      dtype='object', length=1576)
Predicted rental price in Adelaide on 1st December 2023: $479.93


In [55]:
for c in X_train.columns:
    print(c)

price_includes_bills
rooms_available
bedroom_count
bathroom_count
people_count
date
suburb_Aberfoyle Park
suburb_Acacia Gardens
suburb_Acacia Ridge
suburb_Adelaide
suburb_Agnes Banks
suburb_Ainslie
suburb_Airport West
suburb_Alawa
suburb_Albanvale
suburb_Albany Creek
suburb_Albert Park
suburb_Albion
suburb_Alderley
suburb_Aldgate
suburb_Aldinga Beach
suburb_Alexandra Hills
suburb_Alexandria
suburb_Algester
suburb_Alkimos
suburb_Allawah
suburb_Allenby Gardens
suburb_Alphington
suburb_Altona
suburb_Altona Meadows
suburb_Altona North
suburb_Amaroo
suburb_Ambarvale
suburb_Amity Point
suburb_Andrews Farm
suburb_Angle Vale
suburb_Annandale
suburb_Annerley
suburb_Anula
suburb_Applecross
suburb_Arana Hills
suburb_Aranda
suburb_Archerfield
suburb_Ardeer
suburb_Armadale
suburb_Arncliffe
suburb_Artarmon
suburb_Ascot
suburb_Ascot Park
suburb_Ascot Vale
suburb_Ashburton
suburb_Ashbury
suburb_Ashfield
suburb_Ashgrove
suburb_Ashwood
suburb_Aspendale
suburb_Aspley
suburb_Athelstone
suburb_Attadale
sub

In [None]:
pr