# Preprocessing the properties dataset


### Importing the libraries

In [31]:
import mysql.connector
from mysql.connector import Error
import pandas as pd
from dotenv import load_dotenv
from sklearn.preprocessing import OneHotEncoder
import os

### Importing the dataset

In [32]:
# Connect to the MySQL `properties` database and load the whole
# `property_info` table into the DataFrame `df`.
#
# Connection settings (DB_HOST, DB_USER, DB_PASSWORD) are read from the
# environment; load_dotenv() is called first so they can come from a .env file.
# On a MySQL error the message is printed and `df` is NOT (re)assigned by
# this cell, so later cells will fail or see a stale `df` from a previous run.
load_dotenv()

# Bind the name before the try block: if connect() itself raises, the
# `finally` clause must not fail with a NameError that hides the real error.
connection = None

try:
    connection = mysql.connector.connect(
        host=os.getenv('DB_HOST'),
        user=os.getenv('DB_USER'),
        password=os.getenv('DB_PASSWORD'),
        database='properties'
    )

    if connection.is_connected():
        print('Connected to MySQL database')

    # Pull the full table; column pruning happens later in the notebook.
    sql_query = "SELECT * FROM property_info"
    df = pd.read_sql(sql_query, connection)
    print(df.head())

except Error as e:
    print(f"Error: {e}")

finally:
    # Close the connection only if one was actually established.
    if connection is not None and connection.is_connected():
        connection.close()
        print('MySQL connection closed')


Connected to MySQL database
   id                                                url  \
0   1           https://emirates.estate/property/o45496/   
1   2           https://emirates.estate/property/o21299/   
2   3          https://emirates.estate/property/o109134/   
3   4            https://emirates.estate/property/o8201/   
4   5  https://emirates.estate/property/golf-grand-25...   

                                               title   city  \
0                 Apartment in Dubai Marina, № 45496  Dubai   
1  Apartment in GOLF SUITES in Dubai Hills Estate...  Dubai   
2               Townhouse in Nadd Al Sheba, № 109134  Dubai   
3                         Apartment in Dubai, № 8201  Dubai   
4  Apartment in GOLF SUITES in Dubai Hills Estate...  Dubai   

               region       type  num_rooms  num_bathrooms    size      price  
0        Dubai Marina  Apartment          2              2  148.60  6250000.0  
1  Dubai Hills Estate  Apartment          2              2  102.20  1500

  df = pd.read_sql(sql_query, connection)


### Start preprocessing

In [33]:
# Preview the first rows of the raw table as a sanity check on the load.
df.head()

Unnamed: 0,id,url,title,city,region,type,num_rooms,num_bathrooms,size,price
0,1,https://emirates.estate/property/o45496/,"Apartment in Dubai Marina, № 45496",Dubai,Dubai Marina,Apartment,2,2,148.6,6250000.0
1,2,https://emirates.estate/property/o21299/,Apartment in GOLF SUITES in Dubai Hills Estate...,Dubai,Dubai Hills Estate,Apartment,2,2,102.2,1500890.0
2,3,https://emirates.estate/property/o109134/,"Townhouse in Nadd Al Sheba, № 109134",Dubai,Nadd Al Sheba,Townhouse,3,3,287.26,4100000.0
3,4,https://emirates.estate/property/o8201/,"Apartment in Dubai, № 8201",Dubai,,Apartment,2,3,143.0,2000000.0
4,5,https://emirates.estate/property/golf-grand-25...,Apartment in GOLF SUITES in Dubai Hills Estate...,Dubai,Dubai Hills Estate,Apartment,2,2,99.31,2079890.0


In [34]:
# Column dtypes and non-null counts; columns whose count is below the row
# total contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19384 entries, 0 to 19383
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             19384 non-null  int64  
 1   url            19384 non-null  object 
 2   title          19384 non-null  object 
 3   city           19384 non-null  object 
 4   region         16111 non-null  object 
 5   type           19365 non-null  object 
 6   num_rooms      19384 non-null  int64  
 7   num_bathrooms  19384 non-null  int64  
 8   size           19384 non-null  float64
 9   price          19384 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 1.5+ MB


In [35]:
# Number of distinct values per column (a column with a single distinct
# value carries no information for modelling).
df.nunique()

id               19384
url              19345
title            19345
city                 1
region             136
type                16
num_rooms           19
num_bathrooms       19
size              5940
price             7211
dtype: int64

In [36]:
# Summary statistics of the numeric columns; compare max against the 75th
# percentile to spot implausible outliers.
df.describe()

Unnamed: 0,id,num_rooms,num_bathrooms,size,price
count,19384.0,19384.0,19384.0,19384.0,19384.0
mean,9692.5,2.385369,2.76981,2545.906,8585803.0
std,5595.823145,3.425411,3.165611,168405.7,41938660.0
min,1.0,1.0,1.0,0.4,250.0
25%,4846.75,1.0,1.0,78.7,1290000.0
50%,9692.5,2.0,2.0,128.4,2365500.0
75%,14538.25,3.0,4.0,228.9025,4800000.0
max,19384.0,356.0,326.0,18580500.0,1958890000.0


In [37]:
# Count of missing values in each column of the raw table.
df.isna().sum()

id                  0
url                 0
title               0
city                0
region           3273
type               19
num_rooms           0
num_bathrooms       0
size                0
price               0
dtype: int64

In [38]:
# Show any listings whose size is negative (an empty frame means none exist).
df.loc[df['size'].lt(0)]

Unnamed: 0,id,url,title,city,region,type,num_rooms,num_bathrooms,size,price


In [39]:
# Work on a separate frame without the id/url/title/city columns;
# the raw `df` is left untouched.
df2 = df.drop(columns=["id", "url", "title", "city"])

In [40]:
# Discard rows that lack a region or a property type.
df2 = df2.dropna(subset=['region', 'type'])

In [41]:
# Preview the reduced frame after dropping columns and incomplete rows.
df2.head()

Unnamed: 0,region,type,num_rooms,num_bathrooms,size,price
0,Dubai Marina,Apartment,2,2,148.6,6250000.0
1,Dubai Hills Estate,Apartment,2,2,102.2,1500890.0
2,Nadd Al Sheba,Townhouse,3,3,287.26,4100000.0
4,Dubai Hills Estate,Apartment,2,2,99.31,2079890.0
5,Umm Suqeim,Apartment,1,2,76.1,1510000.0


In [42]:
# Verify that no missing values remain in any column.
df2.isna().sum()

region           0
type             0
num_rooms        0
num_bathrooms    0
size             0
price            0
dtype: int64

In [43]:
# Row count and dtypes after removing rows with missing region/type.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16102 entries, 0 to 19383
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         16102 non-null  object 
 1   type           16102 non-null  object 
 2   num_rooms      16102 non-null  int64  
 3   num_bathrooms  16102 non-null  int64  
 4   size           16102 non-null  float64
 5   price          16102 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 880.6+ KB


In [44]:
# Distinct values per remaining column.
df2.nunique()

region            136
type               16
num_rooms          17
num_bathrooms      18
size             5295
price            6289
dtype: int64

In [45]:
# Keep only the first occurrence of each fully identical row.
df2 = df2.drop_duplicates()

In [46]:
# Remove listings reporting exactly 356 rooms, then renumber rows 0..n-1.
keep_rows = df2['num_rooms'].ne(356)
df2 = df2.loc[keep_rows].reset_index(drop=True)

In [52]:
# Remove listings reporting exactly 326 bathrooms, then renumber rows 0..n-1.
keep_rows = df2['num_bathrooms'].ne(326)
df2 = df2.loc[keep_rows].reset_index(drop=True)

In [47]:
# Remove listings reporting exactly 181 bathrooms, then renumber rows 0..n-1.
keep_rows = df2['num_bathrooms'].ne(181)
df2 = df2.loc[keep_rows].reset_index(drop=True)

In [54]:
# Remove listings reporting exactly 41 bathrooms, then renumber rows 0..n-1.
keep_rows = df2['num_bathrooms'].ne(41)
df2 = df2.loc[keep_rows].reset_index(drop=True)

In [55]:
# Re-check the summary statistics after removing the room/bathroom outliers.
# NOTE(review): only num_rooms/num_bathrooms were filtered above; size and
# price have not been cleaned at this point.
df2.describe()

Unnamed: 0,num_rooms,num_bathrooms,size,price
count,14492.0,14492.0,14492.0,14492.0
mean,2.248551,2.680582,1208.177,7499375.0
std,1.37815,1.637756,93300.49,30843280.0
min,1.0,1.0,0.4,250.0
25%,1.0,1.0,77.7,1231330.0
50%,2.0,2.0,122.0,2301000.0
75%,3.0,4.0,210.0,4600000.0
max,27.0,27.0,11203500.0,1500000000.0


In [49]:
# Row count and dtypes after de-duplication and outlier removal.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15072 entries, 0 to 15071
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         15072 non-null  object 
 1   type           15072 non-null  object 
 2   num_rooms      15072 non-null  int64  
 3   num_bathrooms  15072 non-null  int64  
 4   size           15072 non-null  float64
 5   price          15072 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 706.6+ KB


In [57]:
# Exclude every listing in the 'Dubai Hills Estate' region and renumber the
# rows 0..n-1, as the other filter cells do, so the index has no gaps.
# NOTE(review): the reason for excluding this region is not recorded in the
# notebook — confirm and document it.
df2 = df2[df2['region'] != 'Dubai Hills Estate'].reset_index(drop=True)

# Row count and dtypes after the region filter.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14492 entries, 0 to 14491
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         14492 non-null  object 
 1   type           14492 non-null  object 
 2   num_rooms      14492 non-null  int64  
 3   num_bathrooms  14492 non-null  int64  
 4   size           14492 non-null  float64
 5   price          14492 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 679.4+ KB


In [58]:
# List the distinct region names that remain, in order of first appearance.
df2['region'].unique()

array(['Dubai Marina', 'Nadd Al Sheba', 'Umm Suqeim',
       'Jumeirah Lake Towers', 'Meydan One',
       'Downtown Dubai (Downtown Burj Dubai)',
       'Dubai Creek Harbour (The Lagoons)', 'Jumeirah Village Circle',
       'Arjan', 'Al Furjan', 'Sheikh Zayed Road', 'Rukan',
       'Maritime City', 'Palm Jumeirah', 'Business Bay', 'Meydan',
       'Dubai Harbour', 'Dubai Studio City', 'Al Safa',
       'DAMAC Hills (Akoya by DAMAC)', 'International City',
       'Dubai Airport Freezone (DAFZA)', 'Al Warsan', 'Dubai Land',
       'Bluewaters', 'Jumeirah Beach Residence', 'EMAAR South',
       'Burj Khalifa', 'Dubai Silicon Oasis', 'Technology Park',
       'Meydan Avenue', 'Al Barsha', 'Town Square', 'Al Khawaneej',
       'Serena', 'Jumeirah', 'Dubai Sports City', 'Jebel Ali',
       'Al Jaddaf', 'Jumeirah Village Triangle',
       'Mohammed Bin Rashid City', 'Dubai Science Park', 'Remraam',
       'Tilal Al Ghaf', 'Al Sufouh', 'Reem', 'Arabian Ranches 3',
       'Culture Village', 'Ba