In [202]:
import pandas as pd
import re
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV  # Linear least squares with l2 regularization
from L_model import LLS
#from split import train_test_split

In [203]:
data = pd.read_csv('HousePrice.csv')
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [204]:
data.sample(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
3359,175,3,True,True,True,ShahrAra,7800000000.0,260000.0
2556,130,2,True,True,False,Jeyhoon,5200000000.0,173333.33
2068,67,2,True,True,False,Punak,2280000000.0,76000.0
1384,110,2,True,True,True,Punak,6490000000.0,216333.33
3274,102,2,True,True,True,Shahran,4000000000.0,133333.33
3346,66,2,False,True,False,Afsarieh,1120000000.0,37333.33
1479,88,2,False,True,True,Shahrake Qods,760000000.0,25333.33
16,155,3,True,True,True,Narmak,6700000000.0,223333.33
312,60,2,True,True,True,Southern Janatabad,2200000000.0,73333.33
2892,83,2,True,True,True,West Ferdows Boulevard,2640000000.0,88000.0


In [205]:
data.columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address', 'Price',
       'Price(USD)'],
      dtype='object')

In [206]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   object 
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 146.2+ KB


In [207]:
data['Area'] = data['Area'].apply(lambda x: re.sub(',', '', x))
data['Area'] = pd.to_numeric(data['Area'] , errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   int64  
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 146.2+ KB


In [208]:
data.shape

(3479, 8)

In [209]:
data.isnull().sum()

Area           0
Room           0
Parking        0
Warehouse      0
Elevator       0
Address       23
Price          0
Price(USD)     0
dtype: int64

In [210]:
data.dropna(inplace= True)
data.shape

(3456, 8)

In [211]:
boolean_features = ['Parking','Warehouse','Elevator']
data[boolean_features] = data[boolean_features].astype('Int64')
data.head(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,1,1,1,Shahran,1850000000.0,61666.67
1,60,1,1,1,1,Shahran,1850000000.0,61666.67
2,79,2,1,1,1,Pardis,550000000.0,18333.33
3,95,2,1,1,1,Shahrake Qods,902500000.0,30083.33
4,123,2,1,1,1,Shahrake Gharb,7000000000.0,233333.33
5,70,2,1,1,0,North Program Organization,2050000000.0,68333.33
6,87,2,1,1,1,Pardis,600000000.0,20000.0
7,59,1,1,1,1,Shahran,2150000000.0,71666.67
8,54,2,1,1,0,Andisheh,493000000.0,16433.33
9,71,1,1,1,1,West Ferdows Boulevard,2370000000.0,79000.0


In [212]:
data2 = data.copy()
data2.loc[:, "Price"] =data["Price"].map('{:,.0f}'.format)

In [213]:
data2

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,1,1,1,Shahran,1850000000,61666.67
1,60,1,1,1,1,Shahran,1850000000,61666.67
2,79,2,1,1,1,Pardis,550000000,18333.33
3,95,2,1,1,1,Shahrake Qods,902500000,30083.33
4,123,2,1,1,1,Shahrake Gharb,7000000000,233333.33
...,...,...,...,...,...,...,...,...
3474,86,2,1,1,1,Southern Janatabad,3500000000,116666.67
3475,83,2,1,1,1,Niavaran,6800000000,226666.67
3476,75,2,0,0,0,Parand,365000000,12166.67
3477,105,2,1,1,1,Dorous,5600000000,186666.67


In [214]:
data['Price(USD)'] = data['Price(USD)'].apply(lambda x : x * 0.6 )

data['Price(USD)']


0        37000.002
1        37000.002
2        10999.998
3        18049.998
4       139999.998
           ...    
3474     70000.002
3475    136000.002
3476      7300.002
3477    112000.002
3478      7200.000
Name: Price(USD), Length: 3456, dtype: float64

In [215]:
data.Address.unique()

array(['Shahran', 'Pardis', 'Shahrake Qods', 'Shahrake Gharb',
       'North Program Organization', 'Andisheh', 'West Ferdows Boulevard',
       'Narmak', 'Saadat Abad', 'Zafar', 'Islamshahr', 'Pirouzi',
       'Shahrake Shahid Bagheri', 'Moniriyeh', 'Velenjak', 'Amirieh',
       'Southern Janatabad', 'Salsabil', 'Zargandeh', 'Feiz Garden',
       'Water Organization', 'ShahrAra', 'Gisha', 'Ray', 'Abbasabad',
       'Ostad Moein', 'Farmanieh', 'Parand', 'Punak', 'Qasr-od-Dasht',
       'Aqdasieh', 'Pakdasht', 'Railway', 'Central Janatabad',
       'East Ferdows Boulevard', 'Pakdasht KhatunAbad', 'Sattarkhan',
       'Baghestan', 'Shahryar', 'Northern Janatabad', 'Daryan No',
       'Southern Program Organization', 'Rudhen', 'West Pars', 'Afsarieh',
       'Marzdaran', 'Dorous', 'Sadeghieh', 'Chahardangeh', 'Baqershahr',
       'Jeyhoon', 'Lavizan', 'Shams Abad', 'Fatemi',
       'Keshavarz Boulevard', 'Kahrizak', 'Qarchak',
       'Northren Jamalzadeh', 'Azarbaijan', 'Bahar',
       'P

In [216]:
len(data.Address.unique())

192

In [217]:
data_address = data['Address'].value_counts().copy()
data_address

Address
Punak                     161
Pardis                    146
West Ferdows Boulevard    145
Gheitarieh                141
Shahran                   130
                         ... 
Chardangeh                  1
Mehrabad                    1
Pakdasht KhatunAbad         1
Kazemabad                   1
Yakhchiabad                 1
Name: count, Length: 192, dtype: int64

In [218]:
data_address_counts = data['Address'].value_counts().reset_index()
data_address_counts.columns = ['Address', 'Counts']

fig = px.bar(data_address_counts, x='Address', y='Counts', title='Address Counts')
fig.show()

In [219]:
top_5 = data.sort_values('Price',ascending=False)[:5]
top_5

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
1707,420,4,1,1,1,Zaferanieh,92400000000.0,1848000.0
1810,705,5,1,1,0,Abazar,91000000000.0,1819999.998
430,400,5,1,1,0,Lavasan,85000000000.0,1699999.998
819,680,5,1,1,0,Ekhtiarieh,81600000000.0,1632000.0
1332,350,4,1,1,1,Niavaran,80500000000.0,1609999.998


In [220]:
top_5['Price(Millions USD)'] = top_5['Price(USD)'] / 1e6
df_sorted = top_5.sort_values('Price(USD)', ascending=False)
fig = px.bar(df_sorted, x='Address', y='Price(Millions USD)', title='The 5 most expensive houses',
             labels={'Price(Millions USD)': 'Price (Millions USD)', 'Address': 'Address'},
             hover_data={'Area': True, 'Price(USD)': ':.2e'},
             text='Area',
             )
fig.update_layout(xaxis_title='Address',
                  yaxis_title='Price (Millions USD)',
                  yaxis_type='log',
                  uniformtext_minsize=8,
                  )
fig.show()

In [221]:
desire_features = ['Area', 'Room','Parking', 'Warehouse', 'Elevator']
x_dataset = data[desire_features].values
x_dataset

array([[63, 1, 1, 1, 1],
       [60, 1, 1, 1, 1],
       [79, 2, 1, 1, 1],
       ...,
       [75, 2, 0, 0, 0],
       [105, 2, 1, 1, 1],
       [82, 2, 0, 1, 1]], dtype=object)

In [222]:
y_dataset = data['Price'].values
x_train,x_test,y_train,y_test = train_test_split(x_dataset, y_dataset, test_size= 0.2)
x_dataset = x_dataset.astype(np.float64)
y_dataset = y_dataset.astype(np.float64)

In [223]:
lss_model = LLS()
lss_model.fit(x_dataset,y_dataset)
Y_pred = lss_model.predict(x_test)

print(Y_pred)

[9631402622.484577 4960122211.416073 4671280412.862932 9631402622.484577
 4960122219.670448 9631402620.690147 4527306679.105896 4671280409.991845
 288841822.2396054 9631402661.244246 4960122225.771507 288841818.6507468
 5659016427.5342865 9631402626.073435 -143973722.63157606
 4960122218.234903 10763112530.617159 4960122220.029333 288841821.5218339
 4960122226.848164 6091831982.0953865 4960122211.416073 4960122232.231451
 4960122214.287161 288841816.856318 6091831975.276556 4960122219.670448
 4960122217.876019 8210851046.433292 4960122218.234903 4960122218.952677
 9631402620.690147 1420551573.8979697 8643666594.893332 4960122215.363817
 4960122211.057188 4960122216.79936 1420551577.8457131 3539570653.667966
 9631402618.895718 4960122244.792456 9631402658.37316 4960122221.464876
 4960122228.642593 4960122222.182648 4960122221.464876 288841838.38946676
 9631402621.766806 4960122216.081589 6091831968.098839 288841818.29186106
 4960122217.876019 288841818.6507468 9631402617.460175 49601222

In [224]:
# Calculate MAE
mae_custom = mean_absolute_error(y_test, Y_pred)
print("(MAE):", mae_custom)

# Calculate MSE
mse_custom = mean_squared_error(y_test, Y_pred)
print("(MSE):", mse_custom)

# Calculate RMSE
rmse_custom = np.sqrt(mse_custom)
print("(RMSE):", rmse_custom)


(MAE): 4202964392.406291
(MSE): 6.009651279007788e+19
(RMSE): 7752194062.978421


In [225]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
lr_predictions = lr_model.predict(x_test)
mae_lr = mean_absolute_error(y_test, lr_predictions)
mse_lr = mean_squared_error(y_test, lr_predictions)
rmse_lr = np.sqrt(mse_lr)

ridgecv_model = RidgeCV()
ridgecv_model.fit(x_train, y_train)
ridgecv_predictions = ridgecv_model.predict(x_test)
mae_ridgecv = mean_absolute_error(y_test, ridgecv_predictions)
mse_ridgecv = mean_squared_error(y_test, ridgecv_predictions)
rmse_ridgecv = np.sqrt(mse_ridgecv)

print("Mean Absolute Error (MAE):")
print("Custom LLS model:", mae_custom)
print("Linear Regression model:", mae_lr)
print("RidgeCV model:", mae_ridgecv)

print("\nMean Squared Error (MSE):")
print("Custom LLS model:", mse_custom)
print("Linear Regression model:", mse_lr)
print("RidgeCV model:", mse_ridgecv)

print("\nRoot Mean Squared Error (RMSE):")
print("Custom LLS model:", rmse_custom)
print("Linear Regression model:", rmse_lr)
print("RidgeCV model:", rmse_ridgecv)


Mean Absolute Error (MAE):
Custom LLS model: 4202964392.406291
Linear Regression model: 4135883944.199628
RidgeCV model: 287764175336.03186

Mean Squared Error (MSE):
Custom LLS model: 6.009651279007788e+19
Linear Regression model: 5.517410425296108e+19
RidgeCV model: 8.286330810567401e+22

Root Mean Squared Error (RMSE):
Custom LLS model: 7752194062.978421
Linear Regression model: 7427927318.772114
RidgeCV model: 287859875817.513


In [229]:
models = ['Custom LLS', 'Linear Regression', 'RidgeCV']
mae_values = [mae_custom, mae_lr, mae_ridgecv]
mse_values = [mse_custom, mse_lr, mse_ridgecv]
rmse_values = [rmse_custom, rmse_lr, rmse_ridgecv]
fig = make_subplots(rows=1, cols=3, subplot_titles=['Mean Absolute Error (MAE)', 'Mean Squared Error (MSE)', 'Root Mean Squared Error (RMSE)'])
fig.add_trace(go.Bar(x=models, y=mae_values, name='MAE'), row=1, col=1)
fig.add_trace(go.Bar(x=models, y=mse_values, name='MSE'), row=1, col=2)
fig.add_trace(go.Bar(x=models, y=rmse_values, name='RMSE'), row=1, col=3)
fig.update_layout(title='MAE and MSE and RMSE Comparison',
                  xaxis=dict(title='Models'),
                  yaxis=dict(title='Error'),
                  template='plotly_white')
fig.show()


In [230]:
models = ['Custom LLS', 'Linear Regression', 'RidgeCV']
mae_values = [mae_custom, mae_lr]
mse_values = [mse_custom, mse_lr]
rmse_values = [rmse_custom, rmse_lr]
fig = make_subplots(rows=1, cols=3, subplot_titles=['Mean Absolute Error (MAE)', 'Mean Squared Error (MSE)', 'Root Mean Squared Error (RMSE)'])
fig.add_trace(go.Bar(x=models, y=mae_values, name='MAE'), row=1, col=1)
fig.add_trace(go.Bar(x=models, y=mse_values, name='MSE'), row=1, col=2)
fig.add_trace(go.Bar(x=models, y=rmse_values, name='RMSE'), row=1, col=3)
fig.update_layout(title='MAE and MSE and RMSE Comparison',
                  xaxis=dict(title='Models'),
                  yaxis=dict(title='Error'),
                  template='plotly_white')
fig.show()