In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix , accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import GridSearchCV


import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Read dataset

In [2]:
df = pd.read_csv('villas.csv')

# EDA

In [3]:
# to print all dataset's column
pd.options.display.max_columns = 32

In [4]:
df.tail()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,front,bedrooms,livingrooms,bathrooms,streetWidth,stair,propertyAge,driverRoom,tent,patio,kitchen,outdoorRoom,garage,duplex,squareSpace,adNumber,lastUpdate,adViews,details,price,location,numberOfApartments,maidRoom,elevator,dimentions,furnished,scheme,pool,basement
51840,51840,4716,جنوبية,5,2.0,5+,20.0,1.0,0.0,,1.0,1.0,1.0,1.0,1.0,,385,3547226,قبل يوم,1564,فيلا للبيع في حي المونسية ، الرياض ، الرياض,2250000,شرق الرياض,1.0,1.0,1.0,385م طول - 1م عرض,,,,
51841,51841,4717,غربية,7+,2.0,5+,12.0,1.0,,,1.0,1.0,1.0,1.0,1.0,,500,4391251,قبل 3 أيام,728,فيلا للبيع في شارع بدائع المرير ، حي ظهرة البد...,1050000,غرب الرياض,,1.0,,25م طول - 20م عرض,,,,
51842,51842,4718,غربية,5,3.0,5+,20.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,200,3670119,قبل يوم,249,فيلا للبيع في شارع سدره ، حي طويق ، الرياض ، ا...,1000000,غرب الرياض,,1.0,,,,,,
51843,51843,4719,جنوبية غربية,1,,1,15.0,,0.0,,,,1.0,,,,405,4448200,قبل 12 ساعة,40,فيلا للبيع في شارع اللوز ، حي النهضة ، الرياض ...,2300000,شرق الرياض,2.0,,,20م طول - 20م عرض,,,,
51844,51844,4720,شمالية,6,2.0,5+,15.0,1.0,0.0,,1.0,1.0,1.0,1.0,1.0,,750,4074453,قبل شهرين,355,فيلا للبيع في حي المونسية ، الرياض ، الرياض,4100000,شرق الرياض,2.0,1.0,,,,,,


In [5]:
# print dataset's rows and columns
rows, columns = df.shape
print(f'the dataset contains {rows} rows.')
print(f'the dataset contains {columns} columns.')

the dataset contains 51845 rows.
the dataset contains 31 columns.


In [6]:
# print dataset's details
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51845 entries, 0 to 51844
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0.1        51845 non-null  int64  
 1   Unnamed: 0          51845 non-null  int64  
 2   front               51845 non-null  object 
 3   bedrooms            51845 non-null  object 
 4   livingrooms         50614 non-null  object 
 5   bathrooms           51845 non-null  object 
 6   streetWidth         51640 non-null  float64
 7   stair               35387 non-null  float64
 8   propertyAge         45368 non-null  object 
 9   driverRoom          18412 non-null  float64
 10  tent                23789 non-null  float64
 11  patio               39951 non-null  float64
 12  kitchen             50754 non-null  float64
 13  outdoorRoom         33178 non-null  float64
 14  garage              47031 non-null  float64
 15  duplex              15462 non-null  float64
 16  squa

## Data Cleaning

In [7]:
# Note: Most of the data cleaning steps were applied using Excel.

In [8]:
# drop unnecessary columns
df.drop(['Unnamed: 0.1', 'Unnamed: 0','details', 'dimentions',
         'adViews', 'adNumber', 'lastUpdate', 'scheme'], inplace = True, axis = 1)

In [9]:
# print null values
df.isnull().sum()

front                     0
bedrooms                  0
livingrooms            1231
bathrooms                 0
streetWidth             205
stair                 16458
propertyAge            6477
driverRoom            33433
tent                  28056
patio                 11894
kitchen                1091
outdoorRoom           18667
garage                 4814
duplex                36383
squareSpace               0
price                     0
location                  0
numberOfApartments    29988
maidRoom              12185
elevator              39421
furnished             47783
pool                  46013
basement              49907
dtype: int64

In [10]:
# replace NaN with zeros
df['maidRoom'] = df['maidRoom'].replace(np.nan, 0)
df['furnished'] = df['furnished'].replace(np.nan, 0)
df['elevator'] = df['elevator'].replace(np.nan, 0)
df['pool'] = df['pool'].replace(np.nan, 0)
df['basement'] = df['basement'].replace(np.nan, 0)
df['numberOfApartments'] = df['numberOfApartments'].replace(np.nan, 0)
df['duplex'] = df['duplex'].replace(np.nan, 0)
df['garage'] = df['garage'].replace(np.nan, 0)
df['outdoorRoom'] = df['outdoorRoom'].replace(np.nan, 0)
df['kitchen'] = df['kitchen'].replace(np.nan, 0)
df['patio'] = df['patio'].replace(np.nan, 0)
df['driverRoom'] = df['driverRoom'].replace(np.nan, 0)
df['tent'] = df['tent'].replace(np.nan, 0)
df['stair'] = df['stair'].replace(np.nan, 0)

In [11]:
# print null values after filling some columns
df.isnull().sum()

front                    0
bedrooms                 0
livingrooms           1231
bathrooms                0
streetWidth            205
stair                    0
propertyAge           6477
driverRoom               0
tent                     0
patio                    0
kitchen                  0
outdoorRoom              0
garage                   0
duplex                   0
squareSpace              0
price                    0
location                 0
numberOfApartments       0
maidRoom                 0
elevator                 0
furnished                0
pool                     0
basement                 0
dtype: int64

In [12]:
# drop null values
df.dropna(inplace=True)

In [13]:
# print number of duplicated rows
df.duplicated().sum()

14804

In [14]:
# drop duplicated rows
df.drop_duplicates(inplace= True)

In [15]:
# applied data manupilating on price column
df['price'] = df['price'].replace(',','',regex=True)
df['price'] = pd.to_numeric(df['price']) 

In [16]:
# applied data manupilating on squareSpace column
df['squareSpace'] = df['squareSpace'].replace(',','',regex=True)
df['squareSpace'] = pd.to_numeric(df['squareSpace']) 

In [17]:
# change streetWidth data type
df['streetWidth'] = pd.to_numeric(df['streetWidth']) 

In [18]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).sum()

basement              1185
bathrooms                0
bedrooms                 0
driverRoom               0
duplex                   0
elevator              7093
front                    0
furnished             2425
garage                2358
kitchen                548
livingrooms              0
location                 0
maidRoom              6635
numberOfApartments       0
outdoorRoom              0
patio                 6443
pool                  3402
price                 2052
propertyAge              0
squareSpace           2640
stair                    0
streetWidth           1016
tent                     0
dtype: int64

In [19]:
# print dataset's rows and columns after cleaning stage
rows, columns = df.shape
print(f'the dataset contains {rows} rows.')
print(f'the dataset contains {columns} columns.')

the dataset contains 29557 rows.
the dataset contains 23 columns.


## Data Visualization

# Machine Learning Algorithms

## Prepare data for ML

In [20]:
# the only solution to solve encoder error
df = df.astype({"bathrooms": str})
df = df.astype({"front": str})
df = df.astype({"livingrooms": str})
df = df.astype({"bathrooms": str})
df = df.astype({"propertyAge": str})
df = df.astype({"location": str})
df = df.astype({"numberOfApartments": str})

In [21]:
from sklearn import preprocessing
# select all categorical variables
df_categorical = df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0,front,bedrooms,livingrooms,bathrooms,propertyAge,location,numberOfApartments
0,شرقية,5,4,5+,0,جنوب الرياض,0
1,غربية,4,3,5+,0,غرب الرياض,2
2,جنوبية شرقية,7+,2,5+,31,جنوب الرياض,0
3,غربية,7+,3,5+,3,غرب الرياض,0
4,شمالية,4,2,4,0,شرق الرياض,3


In [22]:
# apply Label encoder to df_categorical
le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,front,bedrooms,livingrooms,bathrooms,propertyAge,location,numberOfApartments
0,5,4,3,4,0,0,0
1,9,3,2,4,0,3,9
2,3,6,1,4,25,0,0
3,9,6,2,4,23,3,0
4,6,3,1,3,0,1,12


In [23]:
# concat df_categorical with original df
df = df.drop(df_categorical.columns, axis=1)
df = pd.concat([df, df_categorical], axis=1)
df.head()

Unnamed: 0,streetWidth,stair,driverRoom,tent,patio,kitchen,outdoorRoom,garage,duplex,squareSpace,price,maidRoom,elevator,furnished,pool,basement,front,bedrooms,livingrooms,bathrooms,propertyAge,location,numberOfApartments
0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,300,1050000,0.0,0.0,0.0,0.0,0.0,5,4,3,4,0,0,0
1,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,540,3000000,1.0,1.0,0.0,0.0,0.0,9,3,2,4,0,3,9
2,15.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,875,2000000,1.0,0.0,0.0,0.0,0.0,3,6,1,4,25,0,0
3,15.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,200,894000,1.0,0.0,1.0,0.0,0.0,9,6,2,4,23,3,0
4,25.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,400,3500000,1.0,1.0,1.0,0.0,0.0,6,3,1,3,0,1,12


# ML models

### Split dataset into training and testing

In [24]:
x = df.drop(['price'],axis=1)
y = df['price']

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [26]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

#### SVM Model

In [27]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 

In [28]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(x_train, y_train)
grid_predictions = grid.predict(x_test)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.032 total time= 3.0min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.032 total time= 7.1min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.032 total time= 6.1min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.032 total time= 2.9min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.032 total time= 3.0min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.067 total time= 1.7min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.070 total time=15.6min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.068 total time= 1.7min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.067 total time= 1.7min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.071 total time=14.8min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.065 total time= 1.3min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.058 total time= 3.6min
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.063 total time= 3.7min
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.058 total time= 3.7min
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.056 total time= 3.7min
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.090 total time= 1.9min
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.080 total time= 1.9min
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.084 total time= 1.9min
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.093 total time= 1.9min
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.089 total time= 1.9min
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.115 total time= 1.7min
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.112 total time= 1.7min
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.111 total time= 1.7min
[CV 4/5] END ....C=1000, gam

In [39]:
print(grid.best_params_)

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}


In [40]:
print(grid.best_estimator_)

SVC(C=100, gamma=0.01)


In [29]:
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
         790       0.00      0.00      0.00         0
        1080       0.00      0.00      0.00         0
       70000       0.00      0.00      0.00         1
       90000       0.00      0.00      0.00         0
      100000       0.00      0.00      0.00         2
      108000       0.00      0.00      0.00         1
      113000       0.00      0.00      0.00         0
      130000       0.00      0.00      0.00         1
      150000       0.50      0.50      0.50         2
      165000       0.00      0.00      0.00         1
      170000       0.00      0.00      0.00         1
      180000       0.00      0.00      0.00         1
      250000       0.00      0.00      0.00         0
      300000       0.00      0.00      0.00         1
      320000       0.00      0.00      0.00         0
      350000       0.00      0.00      0.00         2
      450000       0.00    

In [41]:
svm = SVC(C=100, gamma=0.01, kernel='rbf')
svm.fit(x_train,y_train)
svm_pred = svm.predict(x_test)

In [42]:
print('SVM Model Accuracy :',accuracy_score(svm_pred,y_test)) 
print('Accuracy on training set:',svm.score(x_train,y_train))
print('Accuracy on test set:',svm.score(x_test,y_test))

SVM Model Accuracy : 0.1205683355886333
Accuracy on training set: 0.47791762529886767
Accuracy on test set: 0.1205683355886333


In [43]:
scores = cross_val_score(svm, x, y, cv=10)
print("Accuracy: {:.1f} %".format(scores.mean()*100))

Accuracy: 23.0 %


#### KNN Model

In [33]:
knn = KNeighborsClassifier(weights = 'distance', n_neighbors = 7)
knn.fit(x_train,y_train)
knn_pred = knn.predict(x_test)

In [34]:
print('KNN Model Accuracy :',accuracy_score(knn_pred,y_test)) 
print('Accuracy on training set:',knn.score(x_train,y_train))
print('Accuracy on test set:',knn.score(x_test,y_test))

KNN Model Accuracy : 0.07794316644113668
Accuracy on training set: 0.9644967744845943
Accuracy on test set: 0.07794316644113668


In [35]:
scores = cross_val_score(knn, x, y, cv=10)
print("Accuracy: {:.1f} %".format(scores.mean()*100))

Accuracy: 22.0 %


#### Decision Tree Model

In [36]:
ds = DecisionTreeClassifier()
ds.fit(x_train,y_train)
ds_prediction =  ds.predict(x_test)

In [37]:
print('Ds Model Accuracy :',accuracy_score(ds_prediction,y_test)) 
print('Accuracy on training set:',ds.score(x_train,y_train))
print('Accuracy on test set:',ds.score(x_test,y_test))

Ds Model Accuracy : 0.17550744248985115
Accuracy on training set: 0.9644967744845943
Accuracy on test set: 0.17550744248985115


In [44]:
scores = cross_val_score(ds, x, y, cv=10)
print("Accuracy: {:.1f} %".format(scores.mean()*100))

Accuracy: 18.6 %
