In [None]:
#import the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Traditional Machine Learning
from sklearn.tree import DecisionTreeClassifier

# Deep Learning Setup
import tensorflow as tf
from tensorflow.keras.models import Sequential           # Sequential model: stack layers linearly
from tensorflow.keras.layers import Dense, Input         # Dense: fully connected layer, Input: define input shape
from tensorflow.keras.optimizers import Adam             # Adam: an efficient optimizer for training

import warnings
warnings.filterwarnings("ignore")

# Set seeds for reproducibility
import random
seed_value = 42  # Choose any seed value you want
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
PATH = '/content/drive/My Drive/DatasetforDeepLearning/PROIECT_dubai_properties.csv'
QHS=pd.read_csv(PATH)

Mounted at /content/drive


In [None]:
QHS['Address'].value_counts()

Unnamed: 0_level_0,count
Address,Unnamed: 1_level_1
"Khalifa City, Abu Dhabi",79
"Madinat Al Riyadh, Abu Dhabi",52
"Noya, Yas Island, Abu Dhabi",45
"Al Shamkha, Abu Dhabi",37
"Shakhbout City, Abu Dhabi",32
...,...
"MSH3, Shakhbout City, Abu Dhabi",1
"Al Sabeel Building, Al Ghadeer, Abu Dhabi",1
"Beach Tower B, Beach Towers, Shams Abu Dhabi, Al Reem Island, Abu Dhabi",1
"Shabiya 09, Mussafah Community, Mohammed Bin Zayed City, Abu Dhabi",1


In [None]:
#cluster latitude and longitude

In [None]:
QHS['Furnishing'].value_counts()

Unnamed: 0_level_0,count
Furnishing,Unnamed: 1_level_1
Unfurnished,910
Furnished,89


In [None]:
#preview data
QHS.head()

Unnamed: 0,Address,Rent,Beds,Baths,Type,Area_in_sqft,Rent_per_sqft,Rent_category,Frequency,Furnishing,Purpose,Posted_date,Age_of_listing_in_days,Location,City,Latitude,Longitude
0,"The Gate Tower 2, The Gate Tower, Shams Gate D...",124000,3,4,Apartment,1785,69.467787,Medium,Yearly,Unfurnished,For Rent,3/7/2024,45,Al Reem Island,Abu Dhabi,24.493598,54.407841
1,"Water's Edge, Yas Island, Abu Dhabi",140000,3,4,Apartment,1422,98.452883,Medium,Yearly,Unfurnished,For Rent,3/8/2024,44,Yas Island,Abu Dhabi,24.494022,54.607372
2,"Al Raha Lofts, Al Raha Beach, Abu Dhabi",99000,2,3,Apartment,1314,75.342466,Medium,Yearly,Furnished,For Rent,3/21/2024,31,Al Raha Beach,Abu Dhabi,24.485931,54.600939
3,"Marina Heights, Marina Square, Al Reem Island,...",220000,3,4,Penthouse,3843,57.246942,High,Yearly,Unfurnished,For Rent,2/24/2024,57,Al Reem Island,Abu Dhabi,24.493598,54.407841
4,"West Yas, Yas Island, Abu Dhabi",350000,5,7,Villa,6860,51.020408,High,Yearly,Unfurnished,For Rent,2/16/2024,65,Yas Island,Abu Dhabi,24.494022,54.607372


In [None]:
#seeing if there are any missing values
QHS.isnull().sum()

Unnamed: 0,0
Address,0
Rent,0
Beds,0
Baths,0
Type,0
Area_in_sqft,0
Rent_per_sqft,0
Rent_category,0
Frequency,0
Furnishing,0


In [None]:
#Cleaning the missing longitudes and latitudes
QHS.dropna(inplace=True)
#Checking again for missing values
QHS.isnull().sum()

Unnamed: 0,0
Address,0
Rent,0
Beds,0
Baths,0
Type,0
Area_in_sqft,0
Rent_per_sqft,0
Rent_category,0
Frequency,0
Furnishing,0


In [None]:
#Checking for shape
QHS.shape

(989, 17)

In [None]:
#Columns
QHS.columns

Index(['Address', 'Rent', 'Beds', 'Baths', 'Type', 'Area_in_sqft',
       'Rent_per_sqft', 'Rent_category', 'Frequency', 'Furnishing', 'Purpose',
       'Posted_date', 'Age_of_listing_in_days', 'Location', 'City', 'Latitude',
       'Longitude'],
      dtype='object')

The dataset is made out of 17 variables out of which 16 are the features ('Address', 'Rent', 'Beds', 'Baths', 'Type', 'Area_in_sqft',
       'Rent_per_sqft', 'Rent_category', 'Frequency', 'Furnishing', 'Purpose',
       'Posted_date', 'Age_of_listing_in_days', 'Location', 'City', 'Latitude',
       'Longitude') and the Rent is the target
      

In [None]:
QHS.info()


<class 'pandas.core.frame.DataFrame'>
Index: 989 entries, 0 to 998
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Address                 989 non-null    object 
 1   Rent                    989 non-null    int64  
 2   Beds                    989 non-null    int64  
 3   Baths                   989 non-null    int64  
 4   Type                    989 non-null    object 
 5   Area_in_sqft            989 non-null    int64  
 6   Rent_per_sqft           989 non-null    float64
 7   Rent_category           989 non-null    object 
 8   Frequency               989 non-null    object 
 9   Furnishing              989 non-null    object 
 10  Purpose                 989 non-null    object 
 11  Posted_date             989 non-null    object 
 12  Age_of_listing_in_days  989 non-null    int64  
 13  Location                989 non-null    object 
 14  City                    989 non-null    object 

In [None]:
#dummy coding the categorical variables
df_encoded=pd.get_dummies(QHS,drop_first=True)


In [None]:
print(df_encoded.head())

     Rent  Beds  Baths  Area_in_sqft  Rent_per_sqft  Age_of_listing_in_days  \
0  124000     3      4          1785      69.467787                      45   
1  140000     3      4          1422      98.452883                      44   
2   99000     2      3          1314      75.342466                      31   
3  220000     3      4          3843      57.246942                      57   
4  350000     5      7          6860      51.020408                      65   

    Latitude  Longitude  \
0  24.493598  54.407841   
1  24.494022  54.607372   
2  24.485931  54.600939   
3  24.493598  54.407841   
4  24.494022  54.607372   

   Address_Abu Dhabi National Exhibition Centre ADNEC, Capital Centre, Abu Dhabi  \
0                                              False                               
1                                              False                               
2                                              False                               
3                         

In [None]:
#Defining features (X) and target (y)
X=df_encoded.drop('Rent',axis=1)
y=df_encoded['Rent']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Scaling the continuous variables

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
print("Shape of the X: ", X_train_scaled.shape)
print("Number of X variables: ", X_train_scaled.shape[1])

Shape of the X:  (791, 499)
Number of X variables:  499


In [None]:
model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1],)))
model.add(Dense(32, activation='relu'))#one hidden layer should be enough for this type of problem
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))#relu or linear the same

In [None]:
model.compile(optimizer='Adam',
              loss='mse',
              metrics=['mae'])

In [None]:
model.fit(X_train_scaled, y_train, epochs=30, verbose=1)


Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 19202682880.0000 - mae: 113210.2266
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19202271232.0000 - mae: 113208.6094 
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19201785856.0000 - mae: 113206.7812 
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19200991232.0000 - mae: 113203.9141
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19199674368.0000 - mae: 113199.2812 
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19197558784.0000 - mae: 113192.0156 
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 19194324992.0000 - mae: 113181.1172
Epoch 8/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 19189600

<keras.src.callbacks.history.History at 0x783f73d5fd90>

In [None]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
#we should calculate the R2 on the test data instead of what we did up

NameError: name 'model' is not defined

In [None]:
#graphs on the epochs

In [None]:
# Predict the rent the neural network
y_pred = model.predict(X_test_scaled)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [None]:
print(y_pred)

[[11479.694   ]
 [ 2521.5225  ]
 [ 3166.7126  ]
 [ 6874.824   ]
 [ 7458.6997  ]
 [ 2599.7903  ]
 [ 1473.0706  ]
 [ 7305.1426  ]
 [   41.73003 ]
 [10687.001   ]
 [ 2255.1516  ]
 [  450.1789  ]
 [ 1986.229   ]
 [ 5776.151   ]
 [ 3886.2378  ]
 [ 3521.2014  ]
 [ 4355.7915  ]
 [   29.248259]
 [10674.9375  ]
 [ 1761.9092  ]
 [  651.41    ]
 [ 1109.045   ]
 [ 5881.2603  ]
 [10226.107   ]
 [ 4981.3364  ]
 [ 2192.4421  ]
 [ 2653.176   ]
 [ 4178.1387  ]
 [  630.87445 ]
 [ 2204.628   ]
 [  412.1567  ]
 [ 1030.9172  ]
 [ 4829.8843  ]
 [  782.17145 ]
 [  795.4205  ]
 [ 6586.902   ]
 [ 3113.522   ]
 [ 1227.4116  ]
 [10921.808   ]
 [ 8243.202   ]
 [ 1154.375   ]
 [  971.894   ]
 [ 2258.6086  ]
 [ 4152.346   ]
 [ 1961.8075  ]
 [ 3171.1094  ]
 [ 4512.059   ]
 [ 2728.2017  ]
 [ 1406.3478  ]
 [ 1135.2825  ]
 [ 5585.075   ]
 [ 7338.8877  ]
 [ 2153.1335  ]
 [ 6320.603   ]
 [ 2269.3325  ]
 [ 1033.3467  ]
 [ 3006.8586  ]
 [ 1498.0353  ]
 [12446.073   ]
 [  907.6692  ]
 [ 7809.6196  ]
 [ 2659.4207  ]
 [ 3888.

In [None]:
#compare R2 from Neuronal Network with a multiple regression (sklearn)

In [None]:
#Significance and slope of the coefficient for the multiple regression

In [None]:
#Prefering the linear method if R2 are equal because its more efficient