In [105]:
import pandas as pd
import re
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV  # Linear least squares with l2 regularization
from L_model import LLS
#from split import train_test_split

In [106]:
data = pd.read_csv('HousePrice.csv')
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [107]:
data.sample(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
2456,105,3,True,True,True,Kahrizak,1312000000.0,43733.33
803,82,2,True,True,True,Narmak,3050000000.0,101666.67
1757,49,1,True,False,True,Dezashib,3150000000.0,105000.0
1919,200,3,True,True,True,Ajudaniye,27000000000.0,900000.0
2178,235,3,True,True,True,Niavaran,31000000000.0,1033333.33
473,150,3,True,True,True,Karimkhan,7800000000.0,260000.0
1980,60,1,False,True,True,Narmak,2100000000.0,70000.0
2072,99,2,True,True,True,,4150000000.0,138333.33
211,125,2,True,True,True,Shahrake Gharb,21500000000.0,716666.67
1483,200,3,True,True,True,Heravi,14950000000.0,498333.33


In [108]:
data.columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address', 'Price',
       'Price(USD)'],
      dtype='object')

In [109]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   object 
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 146.2+ KB


In [110]:
data['Area'] = data['Area'].apply(lambda x: re.sub(',', '', x))
data['Area'] = pd.to_numeric(data['Area'] , errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   int64  
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 146.2+ KB


In [111]:
data.shape

(3479, 8)

In [112]:
data.isnull().sum()

Area           0
Room           0
Parking        0
Warehouse      0
Elevator       0
Address       23
Price          0
Price(USD)     0
dtype: int64

In [113]:
data.dropna(inplace= True)
data.shape

(3456, 8)

In [114]:
boolean_features = ['Parking','Warehouse','Elevator']
data[boolean_features] = data[boolean_features].astype('Int64')
data.head(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,1,1,1,Shahran,1850000000.0,61666.67
1,60,1,1,1,1,Shahran,1850000000.0,61666.67
2,79,2,1,1,1,Pardis,550000000.0,18333.33
3,95,2,1,1,1,Shahrake Qods,902500000.0,30083.33
4,123,2,1,1,1,Shahrake Gharb,7000000000.0,233333.33
5,70,2,1,1,0,North Program Organization,2050000000.0,68333.33
6,87,2,1,1,1,Pardis,600000000.0,20000.0
7,59,1,1,1,1,Shahran,2150000000.0,71666.67
8,54,2,1,1,0,Andisheh,493000000.0,16433.33
9,71,1,1,1,1,West Ferdows Boulevard,2370000000.0,79000.0


In [115]:
data2 = data.copy()
data2.loc[:, "Price"] =data["Price"].map('{:,.0f}'.format)

In [116]:
data2

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,1,1,1,Shahran,1850000000,61666.67
1,60,1,1,1,1,Shahran,1850000000,61666.67
2,79,2,1,1,1,Pardis,550000000,18333.33
3,95,2,1,1,1,Shahrake Qods,902500000,30083.33
4,123,2,1,1,1,Shahrake Gharb,7000000000,233333.33
...,...,...,...,...,...,...,...,...
3474,86,2,1,1,1,Southern Janatabad,3500000000,116666.67
3475,83,2,1,1,1,Niavaran,6800000000,226666.67
3476,75,2,0,0,0,Parand,365000000,12166.67
3477,105,2,1,1,1,Dorous,5600000000,186666.67


In [117]:
data['Price(USD)'] = data['Price(USD)'].apply(lambda x : x * 0.6 )

data['Price(USD)']


0        37000.002
1        37000.002
2        10999.998
3        18049.998
4       139999.998
           ...    
3474     70000.002
3475    136000.002
3476      7300.002
3477    112000.002
3478      7200.000
Name: Price(USD), Length: 3456, dtype: float64

In [118]:
data.Address.unique()

array(['Shahran', 'Pardis', 'Shahrake Qods', 'Shahrake Gharb',
       'North Program Organization', 'Andisheh', 'West Ferdows Boulevard',
       'Narmak', 'Saadat Abad', 'Zafar', 'Islamshahr', 'Pirouzi',
       'Shahrake Shahid Bagheri', 'Moniriyeh', 'Velenjak', 'Amirieh',
       'Southern Janatabad', 'Salsabil', 'Zargandeh', 'Feiz Garden',
       'Water Organization', 'ShahrAra', 'Gisha', 'Ray', 'Abbasabad',
       'Ostad Moein', 'Farmanieh', 'Parand', 'Punak', 'Qasr-od-Dasht',
       'Aqdasieh', 'Pakdasht', 'Railway', 'Central Janatabad',
       'East Ferdows Boulevard', 'Pakdasht KhatunAbad', 'Sattarkhan',
       'Baghestan', 'Shahryar', 'Northern Janatabad', 'Daryan No',
       'Southern Program Organization', 'Rudhen', 'West Pars', 'Afsarieh',
       'Marzdaran', 'Dorous', 'Sadeghieh', 'Chahardangeh', 'Baqershahr',
       'Jeyhoon', 'Lavizan', 'Shams Abad', 'Fatemi',
       'Keshavarz Boulevard', 'Kahrizak', 'Qarchak',
       'Northren Jamalzadeh', 'Azarbaijan', 'Bahar',
       'P

In [119]:
len(data.Address.unique())

192

In [120]:
data_address = data['Address'].value_counts().copy()
data_address

Address
Punak                     161
Pardis                    146
West Ferdows Boulevard    145
Gheitarieh                141
Shahran                   130
                         ... 
Chardangeh                  1
Mehrabad                    1
Pakdasht KhatunAbad         1
Kazemabad                   1
Yakhchiabad                 1
Name: count, Length: 192, dtype: int64

In [121]:
data_address_counts = data['Address'].value_counts().reset_index()
data_address_counts.columns = ['Address', 'Counts']

fig = px.bar(data_address_counts, x='Address', y='Counts', title='Address Counts')
fig.show()

In [122]:
top_5 = data.sort_values('Price',ascending=False)[:5]
top_5

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
1707,420,4,1,1,1,Zaferanieh,92400000000.0,1848000.0
1810,705,5,1,1,0,Abazar,91000000000.0,1819999.998
430,400,5,1,1,0,Lavasan,85000000000.0,1699999.998
819,680,5,1,1,0,Ekhtiarieh,81600000000.0,1632000.0
1332,350,4,1,1,1,Niavaran,80500000000.0,1609999.998


In [123]:
top_5['Price(Millions USD)'] = top_5['Price(USD)'] / 1e6
df_sorted = top_5.sort_values('Price(USD)', ascending=False)
fig = px.bar(df_sorted, x='Address', y='Price(Millions USD)', title='The 5 most expensive houses',
             labels={'Price(Millions USD)': 'Price (Millions USD)', 'Address': 'Address'},
             hover_data={'Area': True, 'Price(USD)': ':.2e'},
             text='Area',
             )
fig.update_layout(xaxis_title='Address',
                  yaxis_title='Price (Millions USD)',
                  yaxis_type='log',
                  uniformtext_minsize=8,
                  )
fig.show()

In [124]:
desire_features = ['Area', 'Room','Parking', 'Warehouse', 'Elevator']
x_dataset = data[desire_features].values
x_dataset

array([[63, 1, 1, 1, 1],
       [60, 1, 1, 1, 1],
       [79, 2, 1, 1, 1],
       ...,
       [75, 2, 0, 0, 0],
       [105, 2, 1, 1, 1],
       [82, 2, 0, 1, 1]], dtype=object)

In [125]:
y_dataset = data['Price'].values
x_dataset = x_dataset.astype(np.float64)
y_dataset = y_dataset.astype(np.float64)
x_train,x_test,y_train,y_test = train_test_split(x_dataset, y_dataset, test_size= 0.2)


In [126]:
lss_model = LLS()
lss_model.fit(x_train,y_train)
Y_pred = lss_model.predict(x_test)

print(Y_pred)

[ 4.95964203e+09  4.95964202e+09  2.48728148e+08  4.95964202e+09
  9.67055590e+09  5.75619624e+09  6.14453672e+09  9.67055591e+09
  9.67055588e+09  4.95964202e+09  2.48728154e+08  4.95964201e+09
  4.95964202e+09  1.04528238e+09  4.95964203e+09  2.48728147e+08
  4.95964203e+09  9.67055590e+09  4.95964203e+09  4.95964202e+09
  4.95964201e+09  4.95964202e+09  1.43362284e+09  4.95964202e+09
 -1.39612318e+08  4.95964201e+09  9.67055592e+09  4.95964201e+09
  4.95964203e+09  2.48728150e+08  4.95964201e+09  2.02772785e+10
  9.67055588e+09 -1.39612319e+08  2.48728145e+08  4.95964201e+09
  4.57130156e+09  4.95964203e+09  4.95964202e+09  4.95964202e+09
  4.95964201e+09  4.95964202e+09  9.67055589e+09  2.48728145e+08
  3.91435965e+09  4.95964202e+09  6.14453671e+09  4.95964202e+09
  4.95964203e+09  4.95964202e+09  4.95964201e+09  1.08554506e+10
  4.95964202e+09  2.48728144e+08  4.95964205e+09  4.95964203e+09
  9.67055589e+09  1.43814698e+10  8.23693305e+09  9.81016821e+09
  9.67055592e+09  9.67055

In [127]:
# Calculate MAE
mae_custom = mean_absolute_error(y_test, Y_pred)
print("(MAE):", mae_custom)

# Calculate MSE
mse_custom = mean_squared_error(y_test, Y_pred)
print("(MSE):", mse_custom)

# Calculate RMSE
rmse_custom = np.sqrt(mse_custom)
print("(RMSE):", rmse_custom)


(MAE): 3945447928.533638
(MSE): 5.0515920088264385e+19
(RMSE): 7107455247.011013


In [128]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
lr_predictions = lr_model.predict(x_test)
mae_lr = mean_absolute_error(y_test, lr_predictions)
mse_lr = mean_squared_error(y_test, lr_predictions)
rmse_lr = np.sqrt(mse_lr)

ridgecv_model = RidgeCV()
ridgecv_model.fit(x_train, y_train)
ridgecv_predictions = ridgecv_model.predict(x_test)
mae_ridgecv = mean_absolute_error(y_test, ridgecv_predictions)
mse_ridgecv = mean_squared_error(y_test, ridgecv_predictions)
rmse_ridgecv = np.sqrt(mse_ridgecv)

print("Mean Absolute Error (MAE):")
print("Custom LLS model:", mae_custom)
print("Linear Regression model:", mae_lr)
print("RidgeCV model:", mae_ridgecv)

print("\nMean Squared Error (MSE):")
print("Custom LLS model:", mse_custom)
print("Linear Regression model:", mse_lr)
print("RidgeCV model:", mse_ridgecv)

print("\nRoot Mean Squared Error (RMSE):")
print("Custom LLS model:", rmse_custom)
print("Linear Regression model:", rmse_lr)
print("RidgeCV model:", rmse_ridgecv)


Mean Absolute Error (MAE):
Custom LLS model: 3945447928.533638
Linear Regression model: 3943481392.64588
RidgeCV model: 1463187257221.1091

Mean Squared Error (MSE):
Custom LLS model: 5.0515920088264385e+19
Linear Regression model: 4.622395388093073e+19
RidgeCV model: 1.598741106690641e+26

Root Mean Squared Error (RMSE):
Custom LLS model: 7107455247.011013
Linear Regression model: 6798820035.927611
RidgeCV model: 12644133448720.957


In [129]:
models = ['Custom LLS', 'Linear Regression', 'RidgeCV']
mae_values = [mae_custom, mae_lr, mae_ridgecv]
mse_values = [mse_custom, mse_lr, mse_ridgecv]
rmse_values = [rmse_custom, rmse_lr, rmse_ridgecv]
fig = make_subplots(rows=1, cols=3, subplot_titles=['Mean Absolute Error (MAE)', 'Mean Squared Error (MSE)', 'Root Mean Squared Error (RMSE)'])
fig.add_trace(go.Bar(x=models, y=mae_values, name='MAE'), row=1, col=1)
fig.add_trace(go.Bar(x=models, y=mse_values, name='MSE'), row=1, col=2)
fig.add_trace(go.Bar(x=models, y=rmse_values, name='RMSE'), row=1, col=3)
fig.update_layout(title='MAE and MSE and RMSE Comparison',
                  xaxis=dict(title='Models'),
                  yaxis=dict(title='Error'),
                  template='plotly_white')
fig.show()


In [130]:
models = ['Custom LLS', 'Linear Regression', 'RidgeCV']
mae_values = [mae_custom, mae_lr]
mse_values = [mse_custom, mse_lr]
rmse_values = [rmse_custom, rmse_lr]
fig = make_subplots(rows=1, cols=3, subplot_titles=['Mean Absolute Error (MAE)', 'Mean Squared Error (MSE)', 'Root Mean Squared Error (RMSE)'])
fig.add_trace(go.Bar(x=models, y=mae_values, name='MAE'), row=1, col=1)
fig.add_trace(go.Bar(x=models, y=mse_values, name='MSE'), row=1, col=2)
fig.add_trace(go.Bar(x=models, y=rmse_values, name='RMSE'), row=1, col=3)
fig.update_layout(title='MAE and MSE and RMSE Comparison',
                  xaxis=dict(title='Models'),
                  yaxis=dict(title='Error'),
                  template='plotly_white')
fig.show()

In [131]:
# Punak:
Punak_dataset = data[data['Address'].map(lambda x :x == 'Punak')]
desire_features = ['Area', 'Room','Parking', 'Warehouse', 'Elevator']
x_dataset = Punak_dataset[desire_features].values
y_dataset = Punak_dataset['Price'].values
x_dataset = x_dataset.astype(np.float64)
y_dataset = y_dataset.astype(np.float64)
x_train,x_test,y_train,y_test = train_test_split(x_dataset, y_dataset, test_size= 0.2)
lls_model = LLS()
lls_model.fit(x_train,y_train)
Y_pred = lls_model.predict(x_test)


lr_model_p = LinearRegression()
lr_model_p.fit(x_train, y_train)
lr_pred_p = lr_model_p.predict(x_test)

ridgecv_model_p = RidgeCV()
ridgecv_model_p.fit(x_train, y_train)
ridgecv_pred_p = ridgecv_model_p.predict(x_test)


# Calculate MAE
mae_p = mean_absolute_error(y_test, Y_pred)
mae_lr = mean_absolute_error(y_test, lr_pred_p)
mae_ridgecv = mean_absolute_error(y_test, ridgecv_pred_p)

# Calculate MSE
mse_p = mean_squared_error(y_test, Y_pred)
mse_lr = mean_squared_error(y_test, lr_pred_p)
mse_ridgecv = mean_squared_error(y_test, ridgecv_pred_p)

# Calculate RMSE
rmse_p = np.sqrt(mse_p)
rmse_lr = np.sqrt(mse_lr)
rmse_ridgecv = np.sqrt(mse_ridgecv)

print("Mean Absolute Error (MAE):")
print("Custom LLS model:", mae_p)
print("Linear Regression model:", mae_lr)
print("RidgeCV model:", mae_p)

print("\nMean Squared Error (MSE):")
print("Custom LLS model:", mse_p)
print("Linear Regression model:", mse_lr)
print("RidgeCV model:", mse_p)

print("\nRoot Mean Squared Error (RMSE):")
print("Custom LLS model:", rmse_p)
print("Linear Regression model:", rmse_lr)
print("RidgeCV model:", rmse_ridgecv)


disp_custom = Y_pred[:10]
disp_lls = lr_pred_p[:10]

Mean Absolute Error (MAE):
Custom LLS model: 499487575.0122332
Linear Regression model: 461242726.0899729
RidgeCV model: 499487575.0122332

Mean Squared Error (MSE):
Custom LLS model: 4.353971488883462e+17
Linear Regression model: 3.468089199525905e+17
RidgeCV model: 4.353971488883462e+17

Root Mean Squared Error (RMSE):
Custom LLS model: 659846307.0203137
Linear Regression model: 588904847.9615281
RidgeCV model: 583855294.8133186


In [152]:
disp_custom = Y_pred[:10]
disp_lls = lr_pred_p[:10]
Houses=list(range(1,11))

fig = go.Figure(data=[
    go.Bar(name='Custom LLS', x=Houses, y= disp_custom),
    go.Bar(name='SKlearn LLS', x=Houses, y=disp_lls)
])

fig.update_layout(title='Custom and SKlearn LLS prediction methods Comparison for 10 Houses',
                  barmode='group',
                  yaxis=dict(title='Price'),
                  xaxis=dict(title='House index'),)
fig.show()