In [372]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

## Analyzing Data Set

In [373]:
# Load the raw listings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('Bengaluru_House_Data.csv')

In [374]:
# Dataset dimensions as (rows, columns).
df.shape

(13320, 9)

In [375]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [376]:
# Column dtypes and non-null counts; note that total_sqft is read as object, not numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [377]:
# Summary statistics for the numeric columns (bath, balcony, price).
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


## Data Cleaning Process

In [378]:
# Count missing values per column.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

### Fixing missing values 
* society - drop - 41% data missing 
* location - drop - column removed from the feature set
* size - fill - using mode
* bath - fill - using median
* balcony - fill - using median

In [379]:


# Drop 'society' (5502 of 13320 values, about 41%, are missing) and 'location'.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
# NOTE(review): 'location' has only one missing value; dropping it discards the
# whole feature - confirm this is intended rather than imputing the single gap.
df = df.drop(columns=['society', 'location'], errors='ignore')

# 'size' holds text labels, so impute with the most frequent label;
# 'bath' and 'balcony' are numeric counts, so impute with the median.
df['size'] = df['size'].fillna(df['size'].mode()[0])
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].median())

In [380]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

area_type       0
availability    0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

### Checking Columns Separately & Converting Categorical Features to Numerical Features

In [381]:
# Preview the frame before encoding the remaining text columns.
df.head()

Unnamed: 0,area_type,availability,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,2 BHK,1200,2.0,1.0,51.0


#### Using Label Encoder to encode 'area_type' column
* 0 - Built-up Area
* 1 - Carpet Area
* 2 - Plot Area
* 3 - Super built-up Area

In [382]:
# List the distinct area_type categories.
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [383]:
from sklearn.preprocessing import LabelEncoder

# Learn the area_type categories first, then replace each label with its integer code.
# The fitted encoder stays available as `labelEncoder` for later use.
labelEncoder = LabelEncoder().fit(df['area_type'])
df['area_type'] = labelEncoder.transform(df['area_type'])

In [384]:
# Preview the frame with area_type now stored as integer codes.
df.head()

Unnamed: 0,area_type,availability,size,total_sqft,bath,balcony,price
0,3,19-Dec,2 BHK,1056,2.0,1.0,39.07
1,2,Ready To Move,4 Bedroom,2600,5.0,3.0,120.0
2,0,Ready To Move,3 BHK,1440,2.0,3.0,62.0
3,3,Ready To Move,3 BHK,1521,3.0,1.0,95.0
4,3,Ready To Move,2 BHK,1200,2.0,1.0,51.0


#### For 'availability' column
* All dates are converted to DD-Mon-YYYY (e.g. 19-Dec-2025), with the year set to 2025
* Entries without a date ('Ready To Move', 'Immediate Possession') are converted to 1-Jan-2025

In [385]:
# List the distinct availability values ('DD-Mon' strings plus two text labels).
df['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [386]:
# Preview the frame again before transforming the availability column.
df.head()

Unnamed: 0,area_type,availability,size,total_sqft,bath,balcony,price
0,3,19-Dec,2 BHK,1056,2.0,1.0,39.07
1,2,Ready To Move,4 Bedroom,2600,5.0,3.0,120.0
2,0,Ready To Move,3 BHK,1440,2.0,3.0,62.0
3,3,Ready To Move,3 BHK,1521,3.0,1.0,95.0
4,3,Ready To Move,2 BHK,1200,2.0,1.0,51.0


In [387]:
def convert_date(value, default_date='1-Jan-2025', year=2025):
    """Normalise a raw ``availability`` entry to a ``'DD-Mon-YYYY'`` string.

    Parameters
    ----------
    value : str
        Raw entry such as ``'19-Dec'``, ``'Ready To Move'`` or
        ``'Immediate Possession'``.
    default_date : str, optional
        Date string returned for the two labels that carry no date.
    year : int, optional
        Year attached to ``'DD-Mon'`` entries, which carry no year themselves.

    Returns
    -------
    str or object
        The normalised date string. Anything that cannot be parsed, including
        non-string input such as ``None`` or NaN, is returned unchanged so the
        caller can decide how to handle it.
    """
    # Non-strings (e.g. NaN from missing data) cannot be parsed; pass them through
    # instead of letting strptime raise TypeError.
    if not isinstance(value, str):
        return value

    if value in ('Immediate Possession', 'Ready To Move'):
        return default_date

    try:
        # Parse with the year attached. A year-less '%d-%b' parse is deprecated
        # since Python 3.13 and is validated against 1900, so '29-Feb' could
        # never be accepted even for a leap `year`.
        parsed = datetime.strptime(f'{value}-{year}', '%d-%b-%Y')
    except ValueError:
        # Best effort: leave unrecognised text untouched.
        return value
    return parsed.strftime('%d-%b-%Y')


# Normalise every availability entry element-wise with convert_date.
df['availability'] = df['availability'].map(convert_date)



In [388]:
# Parse the normalised 'DD-Mon-YYYY' strings into real datetimes.
df['availability'] = pd.to_datetime(df['availability'], format='%d-%b-%Y')

# Spread the date over three numeric columns: availability_day / _month / _year.
for part in ('day', 'month', 'year'):
    df[f'availability_{part}'] = getattr(df['availability'].dt, part)

# The original column is no longer needed once its parts are extracted.
df = df.drop(columns=['availability'])


In [389]:
# Preview the frame with availability split into day / month / year columns.
df.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price,availability_day,availability_month,availability_year
0,3,2 BHK,1056,2.0,1.0,39.07,19,12,2025
1,2,4 Bedroom,2600,5.0,3.0,120.0,1,1,2025
2,0,3 BHK,1440,2.0,3.0,62.0,1,1,2025
3,3,3 BHK,1521,3.0,1.0,95.0,1,1,2025
4,3,2 BHK,1200,2.0,1.0,51.0,1,1,2025


#### For 'size' column 
* Splitting the label and storing the leading number in size_bhk
* 2 BHK - 2
* 8 Bedroom - 8

In [390]:
# List the distinct size labels.
df['size'].unique()


array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [391]:
# Keep the leading integer of each size label ('2 BHK' -> 2, '8 Bedroom' -> 8).
df['size_bhk'] = df['size'].apply(lambda label: int(label.split(' ')[0]))

# The text column is redundant once the room count is extracted.
df = df.drop(columns=['size'])

# Must be the last expression of the cell: a bare expression placed mid-cell is
# evaluated and silently discarded, so the check was never displayed.
df['size_bhk'].unique()

In [392]:
# Preview the frame with size replaced by the numeric size_bhk column.
df.head()

Unnamed: 0,area_type,total_sqft,bath,balcony,price,availability_day,availability_month,availability_year,size_bhk
0,3,1056,2.0,1.0,39.07,19,12,2025,2
1,2,2600,5.0,3.0,120.0,1,1,2025,4
2,0,1440,2.0,3.0,62.0,1,1,2025,3
3,3,1521,3.0,1.0,95.0,1,1,2025,3
4,3,1200,2.0,1.0,51.0,1,1,2025,2


#### For 'total_sqft'
* For a plain numeric value - keep it as a float
* For a range such as '1133 - 1384' - take the average of the two bounds
* For a value with a unit such as '34.46Sq. Meter' - keep only the leading number (no unit conversion)

In [393]:
# List the distinct raw total_sqft entries (still strings at this point).
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      shape=(2117,), dtype=object)

In [394]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Anything ``float`` accepts counts, including numeric strings with
    surrounding whitespace and the special strings ``'nan'`` / ``'inf'``.
    """
    try:
        float(x)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [395]:
# Show the first ten rows whose total_sqft entry is not directly convertible to float.
df.loc[~df['total_sqft'].map(is_float)].head(10)

Unnamed: 0,area_type,total_sqft,bath,balcony,price,availability_day,availability_month,availability_year,size_bhk
30,3,2100 - 2850,4.0,0.0,186.0,19,12,2025,4
56,0,3010 - 3410,2.0,2.0,192.0,20,2,2025,4
81,0,2957 - 3450,2.0,2.0,224.5,18,10,2025,4
122,3,3067 - 8156,4.0,0.0,477.0,18,3,2025,4
137,3,1042 - 1105,2.0,0.0,54.005,19,3,2025,2
165,3,1145 - 1340,2.0,0.0,43.49,18,12,2025,2
188,3,1015 - 1540,2.0,0.0,56.8,1,1,2025,2
224,3,1520 - 1740,2.0,2.0,74.82,19,12,2025,3
410,3,34.46Sq. Meter,1.0,0.0,18.5,1,1,2025,1
549,3,1195 - 1440,2.0,0.0,63.77,18,9,2025,2


In [396]:
import re
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    * ``'1056'``           -> ``1056.0``
    * ``'1133 - 1384'``    -> ``1258.5`` (mean of the two bounds)
    * ``'34.46Sq. Meter'`` -> ``34.46`` (first number found in the text)

    Values that are already numeric are returned as floats, so applying the
    function twice is harmless. Returns ``None`` for missing input (``None`` or
    NaN) and for text that contains no usable number.

    NOTE(review): the number in front of a unit is kept as-is, so
    '34.46Sq. Meter' is treated as 34.46 with no conversion to square feet -
    confirm this is acceptable for the model.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        # `x != x` is True only for NaN.
        return None if x != x else float(x)

    text = str(x)

    # Exactly one '-' is treated as a "low - high" range.
    bounds = text.split('-')
    if len(bounds) == 2:
        try:
            return (float(bounds[0].strip()) + float(bounds[1].strip())) / 2
        except ValueError:
            return None

    # Otherwise take the first signed integer/decimal literal in the text;
    # every string this pattern matches is a valid float literal.
    first_number = re.search(r"[-+]?\d*\.?\d+", text)
    if first_number is None:
        return None
    return float(first_number.group())

In [397]:
# Convert each raw total_sqft entry to a number, then list the distinct results.
df['total_sqft'] = df['total_sqft'].map(convert_sqft_to_num)
df['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ], shape=(2027,))

In [403]:
# Preview the first ten rows with total_sqft now numeric.
df.head(10)

Unnamed: 0,area_type,total_sqft,bath,balcony,price,availability_day,availability_month,availability_year,size_bhk
0,3,1056.0,2.0,1.0,39.07,19,12,2025,2
1,2,2600.0,5.0,3.0,120.0,1,1,2025,4
2,0,1440.0,2.0,3.0,62.0,1,1,2025,3
3,3,1521.0,3.0,1.0,95.0,1,1,2025,3
4,3,1200.0,2.0,1.0,51.0,1,1,2025,2
5,3,1170.0,2.0,1.0,38.0,1,1,2025,2
6,3,2732.0,4.0,2.0,204.0,18,5,2025,4
7,3,3300.0,4.0,2.0,600.0,1,1,2025,4
8,3,1310.0,3.0,1.0,63.25,1,1,2025,3
9,2,1020.0,6.0,2.0,370.0,1,1,2025,6


#### All Clear for 'bath','balcony' & 'price'

In [410]:
# Print the distinct values of each remaining numeric column, one array per column.
for column_name in ('bath', 'balcony', 'price'):
    print(df[column_name].unique())

[ 2.  5.  3.  4.  6.  1.  9.  8.  7. 11. 10. 14. 27. 12. 16. 40. 15. 13.
 18.]
[1. 3. 2. 0.]
[ 39.07 120.    62.   ...  40.14 231.   488.  ]
