In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket 'ignore' filter so that no warnings are shown by the cells below.
warnings.filterwarnings('ignore')


In [None]:
# Location of the raw CSV; edit this single constant to point at another copy of the file.
DATA_PATH = '/content/drive/MyDrive/Property_Valuation_Data (1).csv'

# Load the raw listings into `data`, decoding the file as latin-1.
data = pd.read_csv(DATA_PATH, encoding='latin-1')


In [None]:
# Preview the first rows of the raw frame.
data.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,$39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,$120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,$62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,$95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,$51.00


In [None]:
# Report the dimensions of the raw frame.
n_rows, n_cols = data.shape
print(f'The data has {n_rows} rows and {n_cols} columns.')


The data has 13320 rows and 9 columns.


In [None]:
# List the column labels of the raw frame.
data.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

**Drop The Features that are not required**

In [None]:
# Build data1 from the raw frame without the four columns listed here.
unused_columns = ['area_type', 'society', 'balcony', 'availability']
data1 = data.drop(columns=unused_columns)
data1.shape

(13320, 5)

In [None]:
# Count the missing values in each column of data1.
data1.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [None]:
# Drop incomplete rows from the reduced frame. The source must be data1, not the raw
# `data` frame: calling dropna() on `data` brings back the columns removed earlier and
# also discards every row whose only missing value is in one of those columns
# (e.g. 'society').
data1 = data1.dropna()
data1.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

**Add a New Integer Feature for BHK**

In [None]:
# Derive an integer 'bhk' column from the leading token of 'size' (e.g. '2 BHK' -> 2).
# Values are stringified first, so a missing entry appears as 'nan' and is mapped to 0.
size_text = data1['size'].astype(str)
data1['bhk'] = size_text.apply(lambda label: 0 if label == 'nan' else int(label.split(' ')[0]))
data1['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9,  0, 27, 10, 19, 16, 43, 14,
       12, 13, 18])

**Total_Sqft Feature**

In [None]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, False when the conversion fails.

    Only conversion failures (TypeError, ValueError, OverflowError) are treated
    as "not a float"; anything else, such as KeyboardInterrupt, is allowed to
    propagate instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the first ten rows whose total_sqft cannot be converted with float().
sqft_is_numeric = data1['total_sqft'].apply(is_float)
data1[~sqft_is_numeric].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186,4
56,Devanahalli,4 Bedroom,3010 - 3410,,192,4
81,Hennur Road,4 Bedroom,2957 - 3450,,? 224.50,4
122,Hebbal,4 BHK,3067 - 8156,4.0,? 477.00,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,? 54.01,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,? 43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,? 56.80,2
224,Devanahalli,3 BHK,1520 - 1740,,? 74.82,3
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2


**The rows above show that total_sqft can be a range (e.g. 2100 - 2850). For such cases we can take the average of the minimum and maximum values of the range. There are other cases, such as 34.46Sq. Meter, which could be converted to square feet using a unit conversion. I am going to simply drop such corner cases to keep things simple.**

In [None]:
def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float, or return None when it cannot be parsed.

    Parameters
    ----------
    x : str or number
        Either a plain number ('1056'), a two-part range separated by '-'
        ('2100 - 2850'), or anything else (e.g. '34.46Sq. Meter').

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or None for entries that
        are not numeric (including ranges whose ends are not plain numbers).
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            # A range: both ends must be plain numbers, otherwise the entry is unusable.
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                return None
    # Plain value (string or already numeric); non-convertible input yields None.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
# Work on a copy of data1: convert total_sqft to numbers, then keep only the rows
# where the conversion produced a value.
data2 = data1.copy()
sqft_numeric = data2['total_sqft'].apply(convert_sqft_to_num)
data2['total_sqft'] = sqft_numeric
data2 = data2[data2['total_sqft'].notnull()]
data2.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,$39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,$120.00,4
2,Uttarahalli,3 BHK,1440.0,2.0,$62.00,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,$95.00,3
4,Kothanur,2 BHK,1200.0,2.0,$51.00,2


**For below row, it shows total_sqft as 2475 which is an average of the range 2100-2850**

In [None]:
# Inspect the row labelled 30 after the total_sqft conversion.
data2.loc[30]

location      Yelahanka
size              4 BHK
total_sqft       2475.0
bath                4.0
price               186
bhk                   4
Name: 30, dtype: object

In [None]:
# Manual check: the midpoint of 2100 and 2850.
(2100+2850)/2

2475.0

**FEATURE ENGINEERING**

---
Adding new feature called price_per_sqft



In [None]:
# 'price' is stored as text (e.g. '$39.07', '? 224.50', '148-150'), so it cannot be
# multiplied or divided directly. Extract every number in each cell and average them:
# a single value is kept as is, a range collapses to its midpoint, and a cell with no
# digits becomes NaN.
# NOTE(review): currency markers are discarded and all values are treated as the same
# unit -- confirm that this is appropriate for the dataset.
data3 = data2.copy()
price_numbers = (
    data3['price']
    .astype(str)
    .str.replace(',', '', regex=False)       # tolerate thousands separators
    .str.extractall(r'(\d+(?:\.\d+)?)')[0]   # one row per number found in a cell
    .astype(float)
)
# Average the numbers per original row; rows with no match are left as NaN on assignment.
data3['price'] = price_numbers.groupby(level=0).mean()

# Price per square foot. The 100000 factor is kept from the original formula
# (presumably price is quoted in units of 100,000 -- TODO confirm).
data3['price_per_sqft'] = data3['price'] * 100000 / data3['total_sqft']
data3.head()

TypeError: unsupported operand type(s) for /: 'str' and 'float'

In [None]:
## Statistical summary: describe() on the price_per_sqft column of data3.
data_stats = data3['price_per_sqft'].describe()
data_stats

count      7288.000000
mean       6123.191072
std       14096.465643
min         371.428571
25%        4280.706600
50%        5319.148936
75%        6703.910615
max      912343.547691
Name: price_per_sqft, dtype: float64

In [None]:
# Write data2 to 'house_pred.csv' without the index column.
data2.to_csv('house_pred.csv',index=False)

**Examine location, which is a categorical variable. We need to apply a dimensionality reduction technique here to reduce the number of locations.**

In [None]:
# Trim surrounding whitespace from every location name, then count the rows per
# location with the most frequent locations first.
data2['location'] = data2['location'].apply(lambda name: name.strip())
location_stats = data2['location'].value_counts(ascending=False)
location_stats

location
Whitefield           397
Sarjapur  Road       310
Electronic City      238
Kanakpura Road       216
Thanisandra          214
                    ... 
1 Giri Nagar           1
Chikkabidarakallu      1
Mailasandra            1
Kattigenahalli         1
Abshot Layout          1
Name: count, Length: 625, dtype: int64

In [None]:
# Total number of rows counted across all locations.
location_stats.values.sum()

7481

In [None]:
# Number of locations with more than 10 rows.
len(location_stats[location_stats>10])

143

In [None]:
# Number of distinct locations.
len(location_stats)


625

In [None]:
# Number of locations with 10 or fewer rows.
len(location_stats[location_stats<=10])

482

**Dimensionality Reduction**


---


Any location having 10 or fewer data points should be tagged as an "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one-hot encoding, it will help us by producing fewer dummy columns.

In [None]:
# Locations with 10 or fewer rows. Note that the filter is <= 10, so locations with
# exactly 10 rows are included even though the variable name says "less than 10".
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
Arekere                   10
Jalahalli East            10
Amruthahalli              10
Banashankari Stage III    10
Rayasandra                10
                          ..
1 Giri Nagar               1
Chikkabidarakallu          1
Mailasandra                1
Kattigenahalli             1
Abshot Layout              1
Name: count, Length: 482, dtype: int64

In [None]:
# Number of distinct location values currently in data2.
len(data2.location.unique())

625

In [None]:
# Replace every location that appears in location_stats_less_than_10 with the single
# label 'other', then count the distinct labels that remain.
rare_locations = location_stats_less_than_10.index
is_rare = data2['location'].isin(rare_locations)
data2['location'] = data2['location'].where(~is_rare, 'other')
len(data2['location'].unique())

144

In [None]:
# Preview the first ten rows of data2 after the location clean-up.
data2.head(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,$39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,$120.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,$95.00
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170.0,2.0,1.0,$38.00
11,Plot Area,Ready To Move,Whitefield,4 Bedroom,Prrry M,2785.0,5.0,3.0,$295.00
12,Super built-up Area,Ready To Move,7th Phase JP Nagar,2 BHK,Shncyes,1000.0,2.0,1.0,$38.00
14,Plot Area,Ready To Move,Sarjapur,3 Bedroom,Skityer,2250.0,3.0,2.0,148-150
15,Super built-up Area,Ready To Move,Mysore Road,2 BHK,PrntaEn,1175.0,2.0,2.0,$73.50
16,Super built-up Area,Ready To Move,Bisuvanahalli,3 BHK,Prityel,1180.0,3.0,2.0,$48.00
17,Super built-up Area,Ready To Move,Raja Rajeshwari Nagar,3 BHK,GrrvaGr,1540.0,3.0,3.0,$60.00


As a data scientist, when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally the square footage per bedroom is 300 (i.e. a 2 BHK apartment is at least 600 sqft). If you have, for example, a 400 sqft apartment with 2 BHK, then that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum threshold per BHK at 300 sqft.

In [None]:
# Preview rows whose area per bedroom (total_sqft / bhk) is below 300.
# NOTE(review): this needs a 'bhk' column on data2. The recorded AttributeError suggests
# data2 was built from a frame without it (cells run out of order?) -- confirm on a
# fresh Restart & Run All.
data2[data2.total_sqft/data2.bhk<300].head()

AttributeError: 'DataFrame' object has no attribute 'bhk'