In [402]:
#importing essential libraries


import pandas as pd 
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
pd.set_option('display.max_rows', None)

### Reading Dataset

In [403]:
df1= pd.read_csv("D:\Backup and Sync\Personal\Reas Estate Project\Dhaka_City_Apartment_Price_Estimation\DataSet\dhaka_city_apartment_price.csv")
df1.head()

Unnamed: 0,property_type,location,bed,bath,total_sqft,price
0,Apartment,"Darussalam, Mirpur, Dhaka",3.0,3.0,"1,350 sqft",7400000
1,Apartment,"Section 1, Mirpur, Dhaka",3.0,2.0,"1,300 sqft",5800000
2,Apartment,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600 sqft,2600000
3,Apartment,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600 sqft,2600000
4,Apartment,"Bakshi Bazar, Lalbagh, Dhaka",3.0,2.0,"1,100 sqft",8000000


In [404]:
df1.shape

(8900, 6)

#### Data Preprocessing

##### as we do not need property_type column we should drop that. Also total_sqft and price column need to be converted into numeric value.

In [405]:
df1.groupby('property_type')['property_type'].agg('count')

property_type
Apartment    8900
Name: property_type, dtype: int64

In [406]:
df2=df1.drop(['property_type'], axis='columns')
df2.head()

Unnamed: 0,location,bed,bath,total_sqft,price
0,"Darussalam, Mirpur, Dhaka",3.0,3.0,"1,350 sqft",7400000
1,"Section 1, Mirpur, Dhaka",3.0,2.0,"1,300 sqft",5800000
2,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600 sqft,2600000
3,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600 sqft,2600000
4,"Bakshi Bazar, Lalbagh, Dhaka",3.0,2.0,"1,100 sqft",8000000


#### Data Cleaning

##### Handling the missing values (NA Values)

In [407]:
df2.isnull().sum()

location      0
bed           3
bath          3
total_sqft    0
price         0
dtype: int64

In [408]:
df3=df2.dropna()
df3.isnull().sum()

location      0
bed           0
bath          0
total_sqft    0
price         0
dtype: int64

In [409]:
df3.shape

(8897, 5)

In [410]:
df3.duplicated().sum()

3137

In [411]:
df4=df3.drop_duplicates(keep='first').reset_index(drop=True)

In [412]:
df4.head()

Unnamed: 0,location,bed,bath,total_sqft,price
0,"Darussalam, Mirpur, Dhaka",3.0,3.0,"1,350 sqft",7400000
1,"Section 1, Mirpur, Dhaka",3.0,2.0,"1,300 sqft",5800000
2,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600 sqft,2600000
3,"Bakshi Bazar, Lalbagh, Dhaka",3.0,2.0,"1,100 sqft",8000000
4,"Darussalam, Mirpur, Dhaka",3.0,3.0,"1,100 sqft",6000000


In [413]:
df4.shape

(5760, 5)

#### Removing "sqft" from total_sqft column and inside comma

In [414]:
def remove_sqft_text(x):
    sqft = x.split(' ')
    sqft[0]= sqft[0].replace(',', '') 
    return int((sqft[0]))

In [415]:
df4['total_sqft'] = df4['total_sqft'].apply(remove_sqft_text)

In [416]:
df4.head()

Unnamed: 0,location,bed,bath,total_sqft,price
0,"Darussalam, Mirpur, Dhaka",3.0,3.0,1350,7400000
1,"Section 1, Mirpur, Dhaka",3.0,2.0,1300,5800000
2,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600,2600000
3,"Bakshi Bazar, Lalbagh, Dhaka",3.0,2.0,1100,8000000
4,"Darussalam, Mirpur, Dhaka",3.0,3.0,1100,6000000


##### Removing comma from PRICE comun. Also divided the price by one lakh to make it readable. And the value now is in floating point format.

In [417]:
# making price column string at first

df4['price'] = df4['price'].astype(str)

In [418]:
def remove_comma_then_int(x):
    p=x.replace(',', '') 
    return float(int(p)/(100000))
    

In [419]:
df4['price'] = df4['price'].apply(remove_comma_then_int)

In [420]:
df4.head()

Unnamed: 0,location,bed,bath,total_sqft,price
0,"Darussalam, Mirpur, Dhaka",3.0,3.0,1350,74.0
1,"Section 1, Mirpur, Dhaka",3.0,2.0,1300,58.0
2,"Chad Uddan Housing, Mohammadpur, Dhaka",2.0,2.0,600,26.0
3,"Bakshi Bazar, Lalbagh, Dhaka",3.0,2.0,1100,80.0
4,"Darussalam, Mirpur, Dhaka",3.0,3.0,1100,60.0


#### Label Ecoding the Location Colimn

In [421]:
df4['location'].unique()

array(['Darussalam, Mirpur, Dhaka', 'Section 1, Mirpur, Dhaka',
       'Chad Uddan Housing, Mohammadpur, Dhaka',
       'Bakshi Bazar, Lalbagh, Dhaka',
       'Baitul Aman Housing Society, Adabor, Dhaka',
       'Ashi Dag Road, Ibrahimpur, Dhaka', 'Uttar Badda, Badda, Dhaka',
       'Block F, Bashundhara R-A, Dhaka',
       'Mansurabad Housing Society, Adabor, Dhaka',
       'New Elephant Road, Hatirpool, Dhaka',
       'Dolphin Goli Road, Kalabagan, Dhaka',
       'South Banasree Project, Banasree, Dhaka',
       'Block H, Banasree, Dhaka', 'West Shewrapara, Mirpur, Dhaka',
       'Rayer Bazaar, Hazaribag, Dhaka',
       'Topkhana Road, Shegunbagicha, Dhaka',
       'Block C, Bashundhara R-A, Dhaka', 'Meradia, Khilgaon, Dhaka',
       'Sector 1, Uttara, Dhaka', 'Block D, Lalmatia, Dhaka',
       'West Dhanmondi and Shangkar, Dhanmondi, Dhaka', 'Turag, Dhaka',
       'Kallyanpur, Mirpur, Dhaka', 'Section 10, Mirpur, Dhaka',
       'Block I, Bashundhara R-A, Dhaka', 'Lalbagh Road, Lalba

In [422]:
# Import label encoder 
from sklearn import preprocessing 

In [423]:
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

In [424]:
# Encode labels in column 'species'. 
df4['location']= label_encoder.fit_transform(df4['location'])

In [425]:
df4['location'].unique()

array([131, 459, 115,  40,  39,  25, 549,  90, 267, 322, 139, 517,  99,
       574, 373, 546,  73, 274, 468,  81, 560, 547, 215, 460, 101, 247,
        80, 122, 490,  71, 161,  62, 518, 353,  94,  84, 326, 580, 148,
       450, 324, 196, 216, 498, 508, 359, 282, 141, 350, 293, 135, 108,
       559, 469, 448, 102, 390, 377,  26,   5, 466,  48, 295, 462, 184,
       176, 179, 261,  85, 119, 150, 369, 283, 197, 264, 465, 480, 239,
       472, 361, 503, 227,  65, 230, 478, 458, 182, 471, 165,   0, 334,
       277,  67, 341, 107, 124, 456, 204, 289, 127, 539,  47, 492, 272,
       331, 203, 171, 330, 425,  95, 104, 140, 388, 351,  12, 223, 113,
       514, 253, 567, 219, 163, 405,  43, 504, 500, 569, 238, 553, 523,
       427, 363, 537, 394, 189,  75, 134, 294, 501, 444, 250, 541, 233,
       181,  79, 270,  46, 345, 473, 314, 187, 309, 344, 275, 536, 103,
       470, 502, 183,  32, 236, 126, 467, 476,  68, 464,  14, 426,  29,
       208, 443, 338, 463,  23, 160, 114, 226, 491, 178, 566,  5

In [426]:
df5=df4.drop(columns = ['location'])
df5.head()

Unnamed: 0,bed,bath,total_sqft,price
0,3.0,3.0,1350,74.0
1,3.0,2.0,1300,58.0
2,2.0,2.0,600,26.0
3,3.0,2.0,1100,80.0
4,3.0,3.0,1100,60.0


In [427]:
x= df5.values

In [428]:
# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

In [429]:
df5= pd.DataFrame(x_scaled, columns=['bed','bath','total_sqft', 'price'])
df5.head()

Unnamed: 0,bed,bath,total_sqft,price
0,0.4,0.333333,0.12987,0.041667
1,0.4,0.166667,0.123035,0.030914
2,0.2,0.166667,0.027341,0.009409
3,0.4,0.166667,0.095694,0.045699
4,0.4,0.333333,0.095694,0.032258


In [430]:
df6=df5.assign(location=df4['location'])
df6

Unnamed: 0,bed,bath,total_sqft,price,location
0,0.4,0.333333,0.12987,0.041667,131
1,0.4,0.166667,0.123035,0.030914,459
2,0.2,0.166667,0.027341,0.009409,115
3,0.4,0.166667,0.095694,0.045699,40
4,0.4,0.333333,0.095694,0.032258,131
5,0.4,0.333333,0.123035,0.039651,131
6,0.4,0.333333,0.143541,0.045027,131
7,0.4,0.333333,0.157211,0.059644,39
8,0.4,0.333333,0.143541,0.035618,25
9,0.4,0.333333,0.11661,0.040995,549


In [431]:
X=df6.drop('price', axis='columns')
X.head()

Unnamed: 0,bed,bath,total_sqft,location
0,0.4,0.333333,0.12987,131
1,0.4,0.166667,0.123035,459
2,0.2,0.166667,0.027341,115
3,0.4,0.166667,0.095694,40
4,0.4,0.333333,0.095694,131


In [432]:
y = df6.price
y.head()

0    0.041667
1    0.030914
2    0.009409
3    0.045699
4    0.032258
Name: price, dtype: float64

In [433]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=10)

In [434]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test,y_test)

0.7437548087534356

In [435]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv= ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

cross_val_score(LinearRegression(), X,y, cv=cv)

array([0.74375481, 0.77911764, 0.76105254, 0.76358986, 0.69805642])

In [436]:
X_test

Unnamed: 0,bed,bath,total_sqft,location
1797,0.4,0.166667,0.084757,281
1217,0.4,0.5,0.150376,460
344,0.4,0.333333,0.109364,549
5271,0.4,0.166667,0.095694,283
5463,0.4,0.166667,0.082023,353
3200,0.2,0.166667,0.079973,153
5472,0.2,0.166667,0.085031,26
2817,0.4,0.166667,0.12987,469
2108,0.4,0.5,0.202871,80
3476,0.2,0.166667,0.043746,462


In [444]:
lr_clf.predict(X_test)

array([ 0.0193758 ,  0.0605357 ,  0.03382144,  0.02937379,  0.01742577,
        0.03355298,  0.03720753,  0.06197979,  0.10557335,  0.00282669,
        0.18835336,  0.01274773,  0.02789939, -0.00667491,  0.06446454,
        0.06104986,  0.04609535,  0.02387562,  0.0368783 ,  0.04687332,
        0.02965537,  0.01817498,  0.28375813,  0.10016993,  0.01587212,
        0.03051551,  0.04776977,  0.09584129,  0.01829906,  0.03787044,
        0.03364293,  0.12000446,  0.14574709,  0.16595791,  0.01668534,
       -0.006795  ,  0.03777301,  0.03316209,  0.0108542 ,  0.05861178,
        0.04496065,  0.09310874,  0.04539106,  0.04607984,  0.05039103,
        0.14852579,  0.04175398,  0.0065551 ,  0.0207789 ,  0.03218668,
        0.12702928,  0.10751687,  0.02067597,  0.03147311,  0.03075313,
        0.02247182,  0.0457187 ,  0.07629779,  0.07064318,  0.01670695,
        0.06381389,  0.02317908,  0.01592708,  0.01114977,  0.10261256,
        0.06369714,  0.04669372,  0.05620279,  0.03973638,  0.03

In [443]:
lr_clf.predict(X_test[:1])

array([0.0193758])

In [438]:
y_test

1797    0.018817
1217    0.052419
344     0.024194
5271    0.043683
5463    0.024194
3200    0.020161
5472    0.024194
2817    0.033602
2108    0.118548
3476    0.014785
1350    0.536290
1957    0.028898
799     0.045699
2367    0.004704
2334    0.096102
3226    0.022849
3911    0.049059
3196    0.011425
4526    0.075941
3392    0.040356
511     0.014113
4       0.032258
1012    0.210349
404     0.086022
5714    0.026882
970     0.028898
5438    0.043813
2904    0.089919
3051    0.025538
5441    0.049731
3164    0.028898
3305    0.069220
4859    0.116263
3642    0.126344
3031    0.048723
3692    0.015457
860     0.026882
3032    0.038978
5313    0.016801
3146    0.038978
2787    0.062500
4869    0.068548
4431    0.047327
168     0.038414
2094    0.042339
758     0.160110
263     0.030914
2153    0.019657
2822    0.037634
2370    0.029570
117     0.119624
1172    0.089382
2447    0.030242
3588    0.037634
4594    0.017473
5196    0.023091
5608    0.025538
2487    0.056956
4578    0.0927

In [442]:
df6[:1]

Unnamed: 0,bed,bath,total_sqft,price,location
0,0.4,0.333333,0.12987,0.041667,131
